mirror of https://github.com/citusdata/citus.git

Implement partitioned intermediate results.

parent 1aef63abfb
commit d7aea7fa10
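In brief, this commit adds a worker_partition_query_result() UDF that executes a query on a worker node and writes its result rows into a set of local partition files, one per hash or range partition, reusing the shard-pruning machinery through an artificial DistTableCacheEntry. In support of this it generalizes the task-file DestReceiver into CreateFileDestReceiver()/FileDestReceiverStats() and exports several previously static helpers (LookupDistributionMethod(), CreateIntermediateResultsDirectory(), QueryResultFileName(), HasUninitializedShardInterval(), HasOverlappingShardInterval()). A minimal usage sketch, with values taken from the regression test included at the end of the diff:

    BEGIN;
    -- must run inside a transaction block: the partition files live in the
    -- per-transaction intermediate results directory
    SELECT * FROM worker_partition_query_result(
        'squares_hash',                                    -- writes squares_hash_0 .. squares_hash_3
        'SELECT i, i * i FROM generate_series(1, 10) i',   -- query to execute
        0,                                                 -- partition column index
        'hash',                                            -- partition method ('hash' or 'range')
        '{-2147483648,-1073741824,0,1073741824}'::text[],  -- per-partition min values
        '{-1073741825,-1,1073741823,2147483647}'::text[],  -- per-partition max values
        false);                                            -- use text rather than binary copy format
    -- read one partition back
    SELECT * FROM read_intermediate_result('squares_hash_0', 'text') AS res (x int, x2 int);
    END;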
@@ -92,7 +92,6 @@ static void EnsureTableCanBeColocatedWith(Oid relationId, char replicationModel,
 											  Oid sourceRelationId);
 static void EnsureLocalTableEmpty(Oid relationId);
 static void EnsureTableNotDistributed(Oid relationId);
-static char LookupDistributionMethod(Oid distributionMethodOid);
 static Oid SupportFunctionForColumn(Var *partitionColumn, Oid accessMethodId,
 									int16 supportFunctionNumber);
 static void EnsureLocalTableEmptyIfNecessary(Oid relationId, char distributionMethod,
@@ -936,7 +935,7 @@ EnsureReplicationSettings(Oid relationId, char replicationModel)
  *
  * The passed in oid has to belong to a value of citus.distribution_type.
  */
-static char
+char
 LookupDistributionMethod(Oid distributionMethodOid)
 {
 	char distributionMethod = 0;
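For context, LookupDistributionMethod() (made extern above) maps a citus.distribution_type enum value ('append', 'hash', 'range') to Citus's internal one-character partition method codes, which is how the new UDF interprets its partition_method argument. An illustrative sketch of that mapping; the authoritative macro definitions live in the Citus headers, so treat these values as assumptions:

    /* assumed values of the partition method codes referenced throughout the diff */
    #define DISTRIBUTE_BY_APPEND 'a'
    #define DISTRIBUTE_BY_HASH 'h'
    #define DISTRIBUTE_BY_RANGE 'r'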
@@ -90,9 +90,7 @@ static void SendCopyDataOverConnection(StringInfo dataBuffer,
 static void RemoteFileDestReceiverShutdown(DestReceiver *destReceiver);
 static void RemoteFileDestReceiverDestroy(DestReceiver *destReceiver);
 
-static char * CreateIntermediateResultsDirectory(void);
 static char * IntermediateResultsDirectory(void);
-static char * QueryResultFileName(const char *resultId);
 static void ReadIntermediateResultsIntoFuncOutput(FunctionCallInfo fcinfo,
 												  char *copyFormat,
 												  Datum *resultIdArray,
@@ -556,7 +554,7 @@ ReceiveQueryResultViaCopy(const char *resultId)
  * directory for the current transaction if it does not exist and ensures
  * that the directory is removed at the end of the transaction.
  */
-static char *
+char *
 CreateIntermediateResultsDirectory(void)
 {
 	char *resultDirectory = IntermediateResultsDirectory();
@@ -591,7 +589,7 @@ CreateIntermediateResultsDirectory(void)
 * an intermediate result with the given key in the per transaction
 * result directory.
 */
-static char *
+char *
 QueryResultFileName(const char *resultId)
 {
 	StringInfo resultFileName = makeStringInfo();
@@ -0,0 +1,519 @@
+/*-------------------------------------------------------------------------
+ *
+ * partition_intermediate_results.c
+ *   Functions for writing partitioned intermediate results.
+ *
+ * Copyright (c) Citus Data, Inc.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "postgres.h"
+#include "funcapi.h"
+#include "libpq-fe.h"
+#include "miscadmin.h"
+#include "port.h"
+
+#include "access/nbtree.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_type.h"
+#include "distributed/intermediate_results.h"
+#include "distributed/master_metadata_utility.h"
+#include "distributed/metadata_cache.h"
+#include "distributed/multi_executor.h"
+#include "distributed/pg_dist_shard.h"
+#include "distributed/remote_commands.h"
+#include "distributed/tuplestore.h"
+#include "distributed/worker_protocol.h"
+#include "nodes/makefuncs.h"
+#include "nodes/primnodes.h"
+#include "tcop/pquery.h"
+#include "tcop/tcopprot.h"
+#include "utils/typcache.h"
+
+
+/*
+ * PartitionedResultDestReceiver is used for streaming tuples into a set of
+ * partitioned result files.
+ */
+typedef struct PartitionedResultDestReceiver
+{
+	/* public DestReceiver interface */
+	DestReceiver pub;
+
+	/* partition file $i is stored at a file named $resultIdPrefix_$i */
+	char *resultIdPrefix;
+
+	/* use binary or text copy format? */
+	bool binaryCopy;
+
+	/* used for deciding which partition a tuple belongs to */
+	DistTableCacheEntry *shardSearchInfo;
+
+	MemoryContext perTupleContext;
+
+	/* what do the streamed tuples look like? */
+	TupleDesc tupleDescriptor;
+
+	/* which column of streamed tuples to use as partition column? */
+	int partitionColumnIndex;
+
+	/* how many partitions do we have? */
+	int partitionCount;
+
+	/*
+	 * Tuples for partition[i] are sent to partitionDestReceivers[i], which
+	 * writes them to a result file.
+	 */
+	DestReceiver **partitionDestReceivers;
+} PartitionedResultDestReceiver;
+
+static Portal StartPortalForQueryExecution(const char *queryString);
+static DistTableCacheEntry * QueryTupleShardSearchInfo(ArrayType *minValuesArray,
+													   ArrayType *maxValuesArray,
+													   char partitionMethod,
+													   Var *partitionColumn);
+static PartitionedResultDestReceiver * CreatePartitionedResultDestReceiver(
+	char *resultIdPrefix, int partitionColumnIndex, int partitionCount,
+	TupleDesc tupleDescriptor, bool binaryCopy,
+	DistTableCacheEntry *shardSearchInfo, MemoryContext perTupleContext);
+static void PartitionedResultDestReceiverStartup(DestReceiver *dest, int operation,
+												 TupleDesc inputTupleDescriptor);
+static bool PartitionedResultDestReceiverReceive(TupleTableSlot *slot,
+												 DestReceiver *dest);
+static void PartitionedResultDestReceiverShutdown(DestReceiver *destReceiver);
+static void PartitionedResultDestReceiverDestroy(DestReceiver *destReceiver);
+
+/* exports for SQL callable functions */
+PG_FUNCTION_INFO_V1(worker_partition_query_result);
+
+
+/*
+ * worker_partition_query_result executes a query and writes the results into a
+ * set of local files according to the partition scheme and the partition column.
+ */
+Datum
+worker_partition_query_result(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *resultInfo = (ReturnSetInfo *) fcinfo->resultinfo;
+
+	text *resultIdPrefixText = PG_GETARG_TEXT_P(0);
+	char *resultIdPrefixString = text_to_cstring(resultIdPrefixText);
+
+	/* verify that resultIdPrefix doesn't contain invalid characters */
+	QueryResultFileName(resultIdPrefixString);
+
+	text *queryText = PG_GETARG_TEXT_P(1);
+	char *queryString = text_to_cstring(queryText);
+
+	int partitionColumnIndex = PG_GETARG_INT32(2);
+	Oid partitionMethodOid = PG_GETARG_OID(3);
+
+	char partitionMethod = LookupDistributionMethod(partitionMethodOid);
+	if (partitionMethod != DISTRIBUTE_BY_HASH && partitionMethod != DISTRIBUTE_BY_RANGE)
+	{
+		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						errmsg("only hash and range partitioning schemes are "
+							   "supported")));
+	}
+
+	ArrayType *minValuesArray = PG_GETARG_ARRAYTYPE_P(4);
+	int32 minValuesCount = ArrayObjectCount(minValuesArray);
+
+	ArrayType *maxValuesArray = PG_GETARG_ARRAYTYPE_P(5);
+	int32 maxValuesCount = ArrayObjectCount(maxValuesArray);
+
+	bool binaryCopy = PG_GETARG_BOOL(6);
+
+	CheckCitusVersion(ERROR);
+
+	if (!IsMultiStatementTransaction())
+	{
+		ereport(ERROR, (errmsg("worker_partition_query_result can only be used in a "
+							   "transaction block")));
+	}
+
+	/*
+	 * Make sure that this transaction has a distributed transaction ID.
+	 *
+	 * Intermediate results will be stored in a directory that is derived
+	 * from the distributed transaction ID.
+	 */
+	UseCoordinatedTransaction();
+
+	CreateIntermediateResultsDirectory();
+
+	if (minValuesCount != maxValuesCount)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("min values and max values must have the same number of elements")));
+	}
+
+	int partitionCount = minValuesCount;
+	if (partitionCount == 0)
+	{
+		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						errmsg("number of partitions cannot be 0")));
+	}
+
+	/* start execution early in order to extract the tuple descriptor */
+	Portal portal = StartPortalForQueryExecution(queryString);
+
+	/* extract the partition column */
+	TupleDesc tupleDescriptor = portal->tupDesc;
+	if (tupleDescriptor == NULL)
+	{
+		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						errmsg("query must generate a set of rows")));
+	}
+
+	if (partitionColumnIndex < 0 || partitionColumnIndex >= tupleDescriptor->natts)
+	{
+		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						errmsg("partition column index must be between 0 and %d",
+							   tupleDescriptor->natts - 1)));
+	}
+
+	FormData_pg_attribute *partitionColumnAttr = TupleDescAttr(tupleDescriptor,
+															   partitionColumnIndex);
+	Var *partitionColumn = makeVar(partitionColumnIndex, partitionColumnIndex,
+								   partitionColumnAttr->atttypid,
+								   partitionColumnAttr->atttypmod,
+								   partitionColumnAttr->attcollation, 0);
+
+	/* construct an artificial DistTableCacheEntry for shard pruning */
+	DistTableCacheEntry *shardSearchInfo =
+		QueryTupleShardSearchInfo(minValuesArray, maxValuesArray,
+								  partitionMethod, partitionColumn);
+
+	/* prepare the output destination */
+	EState *estate = CreateExecutorState();
+	MemoryContext tupleContext = GetPerTupleMemoryContext(estate);
+	PartitionedResultDestReceiver *dest =
+		CreatePartitionedResultDestReceiver(resultIdPrefixString, partitionColumnIndex,
+											partitionCount, tupleDescriptor, binaryCopy,
+											shardSearchInfo, tupleContext);
+
+	/* execute the query */
+	PortalRun(portal, FETCH_ALL, false, true, (DestReceiver *) dest,
+			  (DestReceiver *) dest, NULL);
+
+	/* construct the output result */
+	TupleDesc returnTupleDesc = NULL;
+	Tuplestorestate *tupleStore = SetupTuplestore(fcinfo, &returnTupleDesc);
+	resultInfo->returnMode = SFRM_Materialize;
+	resultInfo->setResult = tupleStore;
+	resultInfo->setDesc = returnTupleDesc;
+
+	for (int partitionIndex = 0; partitionIndex < partitionCount; partitionIndex++)
+	{
+		uint64 recordsWritten = 0;
+		uint64 bytesWritten = 0;
+		Datum values[3];
+		bool nulls[3];
+
+		if (dest->partitionDestReceivers[partitionIndex] != NULL)
+		{
+			FileDestReceiverStats(dest->partitionDestReceivers[partitionIndex],
+								  &recordsWritten, &bytesWritten);
+		}
+
+		memset(values, 0, sizeof(values));
+		memset(nulls, 0, sizeof(nulls));
+
+		values[0] = Int32GetDatum(partitionIndex);
+		values[1] = UInt64GetDatum(recordsWritten);
+		values[2] = UInt64GetDatum(bytesWritten);
+
+		tuplestore_putvalues(tupleStore, returnTupleDesc, values, nulls);
+	}
+
+	tuplestore_donestoring(tupleStore);
+	PortalDrop(portal, false);
+	FreeExecutorState(estate);
+
+	PG_RETURN_INT64(1);
+}
+
+
+/*
+ * StartPortalForQueryExecution creates and starts a portal which can be
+ * used for running the given query.
+ */
+static Portal
+StartPortalForQueryExecution(const char *queryString)
+{
+	Query *query = ParseQueryString(queryString, NULL, 0);
+
+	int cursorOptions = CURSOR_OPT_PARALLEL_OK;
+	PlannedStmt *queryPlan = pg_plan_query(query, cursorOptions, NULL);
+
+	Portal portal = CreateNewPortal();
+
+	/* don't display the portal in pg_cursors, it is for internal use only */
+	portal->visible = false;
+
+	PortalDefineQuery(portal, NULL, queryString, "SELECT", list_make1(queryPlan), NULL);
+	int eflags = 0;
+	PortalStart(portal, NULL, eflags, GetActiveSnapshot());
+
+	return portal;
+}
+
+
+/*
+ * QueryTupleShardSearchInfo returns a DistTableCacheEntry which has enough
+ * information so that FindShardInterval() can find the shard corresponding
+ * to a tuple.
+ */
+static DistTableCacheEntry *
+QueryTupleShardSearchInfo(ArrayType *minValuesArray, ArrayType *maxValuesArray,
+						  char partitionMethod, Var *partitionColumn)
+{
+	Datum *minValues = 0;
+	Datum *maxValues = 0;
+	bool *minValueNulls = 0;
+	bool *maxValueNulls = 0;
+	int minValuesCount = 0;
+	int maxValuesCount = 0;
+	Oid intervalTypeId = InvalidOid;
+	int32 intervalTypeMod = 0;
+	deconstruct_array(minValuesArray, TEXTOID, -1, false, 'i', &minValues,
+					  &minValueNulls, &minValuesCount);
+	deconstruct_array(maxValuesArray, TEXTOID, -1, false, 'i', &maxValues,
+					  &maxValueNulls, &maxValuesCount);
+	int partitionCount = minValuesCount;
+	Assert(maxValuesCount == partitionCount);
+
+	GetIntervalTypeInfo(partitionMethod, partitionColumn,
+						&intervalTypeId, &intervalTypeMod);
+	FmgrInfo *shardColumnCompare = GetFunctionInfo(partitionColumn->vartype,
+												   BTREE_AM_OID, BTORDER_PROC);
+	FmgrInfo *shardIntervalCompare = GetFunctionInfo(intervalTypeId,
+													 BTREE_AM_OID, BTORDER_PROC);
+	FmgrInfo *hashFunction = NULL;
+	if (partitionMethod == DISTRIBUTE_BY_HASH)
+	{
+		TypeCacheEntry *typeEntry = lookup_type_cache(partitionColumn->vartype,
+													  TYPECACHE_HASH_PROC_FINFO);
+
+		hashFunction = palloc0(sizeof(FmgrInfo));
+		fmgr_info_copy(hashFunction, &(typeEntry->hash_proc_finfo), CurrentMemoryContext);
+	}
+
+	ShardInterval **shardIntervalArray = palloc0(partitionCount *
+												 sizeof(ShardInterval *));
+	for (int partitionIndex = 0; partitionIndex < partitionCount; partitionIndex++)
+	{
+		Datum datumArray[Natts_pg_dist_shard] = {
+			[Anum_pg_dist_shard_logicalrelid - 1] = InvalidOid,
+			[Anum_pg_dist_shard_shardid - 1] = partitionIndex,
+			[Anum_pg_dist_shard_shardstorage - 1] = SHARD_STORAGE_VIRTUAL,
+			[Anum_pg_dist_shard_shardminvalue - 1] = minValues[partitionIndex],
+			[Anum_pg_dist_shard_shardmaxvalue - 1] = maxValues[partitionIndex]
+		};
+		bool nullsArray[Natts_pg_dist_shard] = {
+			[Anum_pg_dist_shard_shardminvalue - 1] = minValueNulls[partitionIndex],
+			[Anum_pg_dist_shard_shardmaxvalue - 1] = maxValueNulls[partitionIndex]
+		};
+
+		shardIntervalArray[partitionIndex] =
+			DeformedDistShardTupleToShardInterval(datumArray, nullsArray,
+												  intervalTypeId, intervalTypeMod);
+		shardIntervalArray[partitionIndex]->shardIndex = partitionIndex;
+	}
+
+	DistTableCacheEntry *result = palloc0(sizeof(DistTableCacheEntry));
+	result->partitionMethod = partitionMethod;
+	result->partitionColumn = partitionColumn;
+	result->shardIntervalCompareFunction = shardIntervalCompare;
+	result->shardColumnCompareFunction = shardColumnCompare;
+	result->hashFunction = hashFunction;
+	result->sortedShardIntervalArray =
+		SortShardIntervalArray(shardIntervalArray, partitionCount,
+							   partitionColumn->varcollid, shardIntervalCompare);
+	result->hasUninitializedShardInterval =
+		HasUninitializedShardInterval(result->sortedShardIntervalArray, partitionCount);
+	result->hasOverlappingShardInterval =
+		result->hasUninitializedShardInterval ||
+		HasOverlappingShardInterval(result->sortedShardIntervalArray, partitionCount,
+									partitionColumn->varcollid, shardIntervalCompare);
+	ErrorIfInconsistentShardIntervals(result);
+
+	result->shardIntervalArrayLength = partitionCount;
+
+	return result;
+}
+
+
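+/*
+ * Worked example of the routing this artificial cache entry enables, with
+ * values drawn from the regression test below: given the four hash ranges
+ * [-2147483648,-1073741825], [-1073741824,-1], [0,1073741823] and
+ * [1073741824,2147483647], a tuple whose partition column is 6 hashes to
+ * hashint4(6) = 566031088, which falls into the third range, so
+ * FindShardInterval() returns the interval with shardIndex 2 and the tuple
+ * is written to the result file for "<prefix>_2".
+ */
+
+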
+/*
+ * CreatePartitionedResultDestReceiver sets up a partitioned dest receiver.
+ */
+static PartitionedResultDestReceiver *
+CreatePartitionedResultDestReceiver(char *resultIdPrefix, int partitionColumnIndex,
+									int partitionCount, TupleDesc tupleDescriptor,
+									bool binaryCopy, DistTableCacheEntry *shardSearchInfo,
+									MemoryContext perTupleContext)
+{
+	PartitionedResultDestReceiver *resultDest =
+		palloc0(sizeof(PartitionedResultDestReceiver));
+
+	/* set up the DestReceiver function pointers */
+	resultDest->pub.receiveSlot = PartitionedResultDestReceiverReceive;
+	resultDest->pub.rStartup = PartitionedResultDestReceiverStartup;
+	resultDest->pub.rShutdown = PartitionedResultDestReceiverShutdown;
+	resultDest->pub.rDestroy = PartitionedResultDestReceiverDestroy;
+	resultDest->pub.mydest = DestCopyOut;
+
+	/* set up output parameters */
+	resultDest->resultIdPrefix = resultIdPrefix;
+	resultDest->perTupleContext = perTupleContext;
+	resultDest->partitionColumnIndex = partitionColumnIndex;
+	resultDest->partitionCount = partitionCount;
+	resultDest->shardSearchInfo = shardSearchInfo;
+	resultDest->tupleDescriptor = tupleDescriptor;
+	resultDest->binaryCopy = binaryCopy;
+	resultDest->partitionDestReceivers =
+		(DestReceiver **) palloc0(partitionCount * sizeof(DestReceiver *));
+
+	return resultDest;
+}
+
+
+/*
+ * PartitionedResultDestReceiverStartup implements the rStartup interface of
+ * PartitionedResultDestReceiver.
+ */
+static void
+PartitionedResultDestReceiverStartup(DestReceiver *copyDest, int operation,
+									 TupleDesc inputTupleDescriptor)
+{
+	/*
+	 * We don't expect this to be called multiple times, but if it happens,
+	 * we will just overwrite previous files.
+	 */
+	PartitionedResultDestReceiver *partitionedDest =
+		(PartitionedResultDestReceiver *) copyDest;
+	int partitionCount = partitionedDest->partitionCount;
+	for (int partitionIndex = 0; partitionIndex < partitionCount; partitionIndex++)
+	{
+		DestReceiver *partitionDest =
+			partitionedDest->partitionDestReceivers[partitionIndex];
+		if (partitionDest != NULL)
+		{
+			partitionDest->rStartup(partitionDest, operation, inputTupleDescriptor);
+		}
+	}
+}
+
+
+/*
+ * PartitionedResultDestReceiverReceive implements the receiveSlot interface of
+ * PartitionedResultDestReceiver.
+ */
+static bool
+PartitionedResultDestReceiverReceive(TupleTableSlot *slot, DestReceiver *copyDest)
+{
+	PartitionedResultDestReceiver *partitionedDest =
+		(PartitionedResultDestReceiver *) copyDest;
+
+	slot_getallattrs(slot);
+
+	Datum *columnValues = slot->tts_values;
+	bool *columnNulls = slot->tts_isnull;
+
+	if (columnNulls[partitionedDest->partitionColumnIndex])
+	{
+		ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+						errmsg("the partition column value cannot be NULL")));
+	}
+
+	Datum partitionColumnValue = columnValues[partitionedDest->partitionColumnIndex];
+	ShardInterval *shardInterval = FindShardInterval(partitionColumnValue,
+													 partitionedDest->shardSearchInfo);
+	if (shardInterval == NULL)
+	{
+		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						errmsg("could not find shard for partition column "
+							   "value")));
+	}
+
+	int partitionIndex = shardInterval->shardIndex;
+	DestReceiver *partitionDest = partitionedDest->partitionDestReceivers[partitionIndex];
+	if (partitionDest == NULL)
+	{
+		StringInfo resultId = makeStringInfo();
+		appendStringInfo(resultId, "%s_%d", partitionedDest->resultIdPrefix,
+						 partitionIndex);
+		char *filePath = QueryResultFileName(resultId->data);
+
+		partitionDest = CreateFileDestReceiver(filePath, partitionedDest->perTupleContext,
+											   partitionedDest->binaryCopy);
+		partitionedDest->partitionDestReceivers[partitionIndex] = partitionDest;
+		partitionDest->rStartup(partitionDest, 0, partitionedDest->tupleDescriptor);
+	}
+
+	partitionDest->receiveSlot(slot, partitionDest);
+
+	return true;
+}
+
+
+/*
+ * PartitionedResultDestReceiverShutdown implements the rShutdown interface of
+ * PartitionedResultDestReceiver.
+ */
+static void
+PartitionedResultDestReceiverShutdown(DestReceiver *copyDest)
+{
+	PartitionedResultDestReceiver *partitionedDest =
+		(PartitionedResultDestReceiver *) copyDest;
+	int partitionCount = partitionedDest->partitionCount;
+	for (int partitionIndex = 0; partitionIndex < partitionCount; partitionIndex++)
+	{
+		DestReceiver *partitionDest =
+			partitionedDest->partitionDestReceivers[partitionIndex];
+		if (partitionDest != NULL)
+		{
+			partitionDest->rShutdown(partitionDest);
+		}
+	}
+}
+
+
+/*
+ * PartitionedResultDestReceiverDestroy implements the rDestroy interface of
+ * PartitionedResultDestReceiver.
+ */
+static void
+PartitionedResultDestReceiverDestroy(DestReceiver *copyDest)
+{
+	PartitionedResultDestReceiver *partitionedDest =
+		(PartitionedResultDestReceiver *) copyDest;
+	int partitionCount = partitionedDest->partitionCount;
+	for (int partitionIndex = 0; partitionIndex < partitionCount; partitionIndex++)
+	{
+		DestReceiver *partitionDest =
+			partitionedDest->partitionDestReceivers[partitionIndex];
+		if (partitionDest != NULL)
+		{
+			/* this call should also free partitionDest, so no need to free it after */
+			partitionDest->rDestroy(partitionDest);
+		}
+	}
+
+	pfree(partitionedDest->partitionDestReceivers);
+	pfree(partitionedDest);
+}
@@ -194,15 +194,9 @@ static DistTableCacheEntry * LookupDistTableCacheEntry(Oid relationId);
 static void BuildDistTableCacheEntry(DistTableCacheEntry *cacheEntry);
 static void BuildCachedShardList(DistTableCacheEntry *cacheEntry);
 static void PrepareWorkerNodeCache(void);
-static bool HasUninitializedShardInterval(ShardInterval **sortedShardIntervalArray,
-										  int shardCount);
 static bool CheckInstalledVersion(int elevel);
 static char * AvailableExtensionVersion(void);
 static char * InstalledExtensionVersion(void);
-static bool HasOverlappingShardInterval(ShardInterval **shardIntervalArray,
-										int shardIntervalArrayLength,
-										Oid shardIntervalCollation,
-										FmgrInfo *shardIntervalSortCompareFunction);
 static bool CitusHasBeenLoadedInternal(void);
 static void InitializeCaches(void);
 static void InitializeDistCache(void);
@@ -1245,22 +1239,7 @@ BuildCachedShardList(DistTableCacheEntry *cacheEntry)
 			cacheEntry->hasOverlappingShardInterval = true;
 		}
 
-	/*
-	 * If table is hash-partitioned and has shards, there never should be
-	 * any uninitalized shards. Historically we've not prevented that for
-	 * range partitioned tables, but it might be a good idea to start
-	 * doing so.
-	 */
-	if (cacheEntry->partitionMethod == DISTRIBUTE_BY_HASH &&
-		cacheEntry->hasUninitializedShardInterval)
-	{
-		ereport(ERROR, (errmsg("hash partitioned table has uninitialized shards")));
-	}
-	if (cacheEntry->partitionMethod == DISTRIBUTE_BY_HASH &&
-		cacheEntry->hasOverlappingShardInterval)
-	{
-		ereport(ERROR, (errmsg("hash partitioned table has overlapping shards")));
-	}
+	ErrorIfInconsistentShardIntervals(cacheEntry);
 }
 
 /*
@@ -1330,6 +1309,31 @@ BuildCachedShardList(DistTableCacheEntry *cacheEntry)
 }
 
 
+/*
+ * ErrorIfInconsistentShardIntervals checks if shard intervals are consistent with
+ * our expectations.
+ */
+void
+ErrorIfInconsistentShardIntervals(DistTableCacheEntry *cacheEntry)
+{
+	/*
+	 * If table is hash-partitioned and has shards, there never should be any
+	 * uninitialized shards. Historically we've not prevented that for range
+	 * partitioned tables, but it might be a good idea to start doing so.
+	 */
+	if (cacheEntry->partitionMethod == DISTRIBUTE_BY_HASH &&
+		cacheEntry->hasUninitializedShardInterval)
+	{
+		ereport(ERROR, (errmsg("hash partitioned table has uninitialized shards")));
+	}
+	if (cacheEntry->partitionMethod == DISTRIBUTE_BY_HASH &&
+		cacheEntry->hasOverlappingShardInterval)
+	{
+		ereport(ERROR, (errmsg("hash partitioned table has overlapping shards")));
+	}
+}
+
+
 /*
  * HasUniformHashDistribution determines whether the given list of sorted shards
  * has a uniform hash distribution, as produced by master_create_worker_shards for
@@ -1376,7 +1380,7 @@ HasUniformHashDistribution(ShardInterval **shardIntervalArray,
 * ensure that input shard interval array is sorted on shardminvalue and uninitialized
 * shard intervals are at the end of the array.
 */
-static bool
+bool
 HasUninitializedShardInterval(ShardInterval **sortedShardIntervalArray, int shardCount)
 {
 	bool hasUninitializedShardInterval = false;
@@ -1406,7 +1410,7 @@ HasUninitializedShardInterval(ShardInterval **sortedShardIntervalArray, int shardCount)
 * HasOverlappingShardInterval determines whether the given list of sorted
 * shards has overlapping ranges.
 */
-static bool
+bool
 HasOverlappingShardInterval(ShardInterval **shardIntervalArray,
 							int shardIntervalArrayLength,
 							Oid shardIntervalCollation,
@@ -3682,8 +3686,9 @@ GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
 			Var *partitionColumn = (Var *) partitionNode;
 			Assert(IsA(partitionNode, Var));
 
-			*intervalTypeId = partitionColumn->vartype;
-			*intervalTypeMod = partitionColumn->vartypmod;
+			GetIntervalTypeInfo(partitionMethod, partitionColumn,
+								intervalTypeId, intervalTypeMod);
+
 			*columnTypeId = partitionColumn->vartype;
 			*columnTypeMod = partitionColumn->vartypmod;
 			break;
@@ -3695,7 +3700,9 @@ GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
 			Var *partitionColumn = (Var *) partitionNode;
 			Assert(IsA(partitionNode, Var));
 
-			*intervalTypeId = INT4OID;
+			GetIntervalTypeInfo(partitionMethod, partitionColumn,
+								intervalTypeId, intervalTypeMod);
+
 			*columnTypeId = partitionColumn->vartype;
 			*columnTypeMod = partitionColumn->vartypmod;
 			break;
@@ -3716,6 +3723,42 @@ GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
 }
 
 
+/*
+ * GetIntervalTypeInfo gets type id and type mod of the min/max values
+ * of shard intervals for a distributed table with given partition method
+ * and partition column.
+ */
+void
+GetIntervalTypeInfo(char partitionMethod, Var *partitionColumn,
+					Oid *intervalTypeId, int32 *intervalTypeMod)
+{
+	*intervalTypeId = InvalidOid;
+	*intervalTypeMod = -1;
+
+	switch (partitionMethod)
+	{
+		case DISTRIBUTE_BY_APPEND:
+		case DISTRIBUTE_BY_RANGE:
+		{
+			*intervalTypeId = partitionColumn->vartype;
+			*intervalTypeMod = partitionColumn->vartypmod;
+			break;
+		}
+
+		case DISTRIBUTE_BY_HASH:
+		{
+			*intervalTypeId = INT4OID;
+			break;
+		}
+
+		default:
+		{
+			break;
+		}
+	}
+}
+
+
 /*
  * TupleToShardInterval transforms the specified dist_shard tuple into a new
  * ShardInterval using the provided descriptor and partition type information.
@@ -3725,10 +3768,33 @@ TupleToShardInterval(HeapTuple heapTuple, TupleDesc tupleDescriptor, Oid
 					 intervalTypeId,
 					 int32 intervalTypeMod)
 {
-	Oid inputFunctionId = InvalidOid;
-	Oid typeIoParam = InvalidOid;
 	Datum datumArray[Natts_pg_dist_shard];
 	bool isNullArray[Natts_pg_dist_shard];
 
+	/*
+	 * We use heap_deform_tuple() instead of heap_getattr() to expand tuple
+	 * to contain missing values when ALTER TABLE ADD COLUMN happens.
+	 */
+	heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray);
+
+	ShardInterval *shardInterval =
+		DeformedDistShardTupleToShardInterval(datumArray, isNullArray,
+											  intervalTypeId, intervalTypeMod);
+
+	return shardInterval;
+}
+
+
+/*
+ * DeformedDistShardTupleToShardInterval transforms the specified deformed
+ * pg_dist_shard tuple into a new ShardInterval.
+ */
+ShardInterval *
+DeformedDistShardTupleToShardInterval(Datum *datumArray, bool *isNullArray,
+									  Oid intervalTypeId, int32 intervalTypeMod)
+{
+	Oid inputFunctionId = InvalidOid;
+	Oid typeIoParam = InvalidOid;
 	Datum minValue = 0;
 	Datum maxValue = 0;
 	bool minValueExists = false;
@@ -3738,14 +3804,8 @@ TupleToShardInterval(HeapTuple heapTuple, TupleDesc tupleDescriptor, Oid
 	char intervalAlign = '0';
 	char intervalDelim = '0';
 
-	/*
-	 * We use heap_deform_tuple() instead of heap_getattr() to expand tuple
-	 * to contain missing values when ALTER TABLE ADD COLUMN happens.
-	 */
-	heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray);
-
-	Oid relationId = DatumGetObjectId(datumArray[Anum_pg_dist_shard_logicalrelid -
-												 1]);
+	Oid relationId =
+		DatumGetObjectId(datumArray[Anum_pg_dist_shard_logicalrelid - 1]);
 	int64 shardId = DatumGetInt64(datumArray[Anum_pg_dist_shard_shardid - 1]);
 	char storageType = DatumGetChar(datumArray[Anum_pg_dist_shard_shardstorage - 1]);
 	Datum minValueTextDatum = datumArray[Anum_pg_dist_shard_shardminvalue - 1];
@@ -1,5 +1,6 @@
 #include "udfs/read_intermediate_results/9.2-1.sql"
 #include "udfs/fetch_intermediate_results/9.2-1.sql"
+#include "udfs/worker_partition_query_result/9.2-1.sql"
 
 ALTER TABLE pg_catalog.pg_dist_colocation ADD distributioncolumncollation oid;
 UPDATE pg_catalog.pg_dist_colocation dc SET distributioncolumncollation = t.typcollation
@@ -0,0 +1,16 @@
+CREATE OR REPLACE FUNCTION pg_catalog.worker_partition_query_result(
+    result_prefix text,
+    query text,
+    partition_column_index int,
+    partition_method citus.distribution_type,
+    partition_min_values text[],
+    partition_max_values text[],
+    binaryCopy boolean,
+    OUT partition_index int,
+    OUT rows_written bigint,
+    OUT bytes_written bigint)
+RETURNS SETOF record
+LANGUAGE C STRICT VOLATILE
+AS 'MODULE_PATHNAME', $$worker_partition_query_result$$;
+COMMENT ON FUNCTION pg_catalog.worker_partition_query_result(text, text, int, citus.distribution_type, text[], text[], boolean)
+IS 'executes a query and partitions its results into a set of local result files';
@@ -0,0 +1,16 @@
+CREATE OR REPLACE FUNCTION pg_catalog.worker_partition_query_result(
+    result_prefix text,
+    query text,
+    partition_column_index int,
+    partition_method citus.distribution_type,
+    partition_min_values text[],
+    partition_max_values text[],
+    binaryCopy boolean,
+    OUT partition_index int,
+    OUT rows_written bigint,
+    OUT bytes_written bigint)
+RETURNS SETOF record
+LANGUAGE C STRICT VOLATILE
+AS 'MODULE_PATHNAME', $$worker_partition_query_result$$;
+COMMENT ON FUNCTION pg_catalog.worker_partition_query_result(text, text, int, citus.distribution_type, text[], text[], boolean)
+IS 'executes a query and partitions its results into a set of local result files';
@@ -24,6 +24,8 @@
 /* necessary to get S_IRUSR, S_IWUSR definitions on illumos */
 #include <sys/stat.h>
 
+#define COPY_BUFFER_SIZE (4 * 1024 * 1024)
+
 /* TaskFileDestReceiver can be used to stream results into a file */
 typedef struct TaskFileDestReceiver
 {
@@ -33,8 +35,8 @@ typedef struct TaskFileDestReceiver
 	/* descriptor of the tuples that are sent to the worker */
 	TupleDesc tupleDescriptor;
 
-	/* EState for per-tuple memory allocation */
-	EState *executorState;
+	/* context for per-tuple memory allocation */
+	MemoryContext tupleContext;
 
 	/* MemoryContext for DestReceiver session */
 	MemoryContext memoryContext;
@@ -48,13 +50,12 @@ typedef struct TaskFileDestReceiver
 	CopyOutState copyOutState;
 	FmgrInfo *columnOutputFunctions;
 
-	/* number of tuples sent */
+	/* statistics */
 	uint64 tuplesSent;
+	uint64 bytesSent;
 } TaskFileDestReceiver;
 
 
-static DestReceiver * CreateTaskFileDestReceiver(char *filePath, EState *executorState,
-												 bool binaryCopyFormat);
 static void TaskFileDestReceiverStartup(DestReceiver *dest, int operation,
 										TupleDesc inputTupleDescriptor);
 static bool TaskFileDestReceiverReceive(TupleTableSlot *slot, DestReceiver *dest);
@@ -102,9 +103,10 @@ WorkerExecuteSqlTask(Query *query, char *taskFilename, bool binaryCopyFormat)
 	ParamListInfo paramListInfo = NULL;
 
 	EState *estate = CreateExecutorState();
+	MemoryContext tupleContext = GetPerTupleMemoryContext(estate);
 	TaskFileDestReceiver *taskFileDest =
-		(TaskFileDestReceiver *) CreateTaskFileDestReceiver(taskFilename, estate,
+		(TaskFileDestReceiver *) CreateFileDestReceiver(taskFilename, tupleContext,
 														binaryCopyFormat);
 
 	ExecuteQueryIntoDestReceiver(query, paramListInfo, (DestReceiver *) taskFileDest);
 
@@ -118,11 +120,11 @@ WorkerExecuteSqlTask(Query *query, char *taskFilename, bool binaryCopyFormat)
 
 
 /*
- * CreateTaskFileDestReceiver creates a DestReceiver for writing query results
- * to a task file.
+ * CreateFileDestReceiver creates a DestReceiver for writing query results
+ * to a file.
  */
-static DestReceiver *
-CreateTaskFileDestReceiver(char *filePath, EState *executorState, bool binaryCopyFormat)
+DestReceiver *
+CreateFileDestReceiver(char *filePath, MemoryContext tupleContext, bool binaryCopyFormat)
 {
 	TaskFileDestReceiver *taskFileDest = (TaskFileDestReceiver *) palloc0(
 		sizeof(TaskFileDestReceiver));
@@ -135,7 +137,7 @@ CreateFileDestReceiver(char *filePath, MemoryContext tupleContext, bool binaryCopyFormat)
 	taskFileDest->pub.mydest = DestCopyOut;
 
 	/* set up output parameters */
-	taskFileDest->executorState = executorState;
+	taskFileDest->tupleContext = tupleContext;
 	taskFileDest->memoryContext = CurrentMemoryContext;
 	taskFileDest->filePath = pstrdup(filePath);
 	taskFileDest->binaryCopyFormat = binaryCopyFormat;
@@ -173,7 +175,7 @@ TaskFileDestReceiverStartup(DestReceiver *dest, int operation,
 	copyOutState->null_print_client = (char *) nullPrintCharacter;
 	copyOutState->binary = taskFileDest->binaryCopyFormat;
 	copyOutState->fe_msgbuf = makeStringInfo();
-	copyOutState->rowcontext = GetPerTupleMemoryContext(taskFileDest->executorState);
+	copyOutState->rowcontext = taskFileDest->tupleContext;
 	taskFileDest->copyOutState = copyOutState;
 
 	taskFileDest->columnOutputFunctions = ColumnOutputFunctions(inputTupleDescriptor,
@@ -187,10 +189,7 @@ TaskFileDestReceiverStartup(DestReceiver *dest, int operation,
 	if (copyOutState->binary)
 	{
 		/* write headers when using binary encoding */
-		resetStringInfo(copyOutState->fe_msgbuf);
 		AppendCopyBinaryHeaders(copyOutState);
-
-		WriteToLocalFile(copyOutState->fe_msgbuf, taskFileDest);
 	}
 
 	MemoryContextSwitchTo(oldContext);
@@ -214,8 +213,7 @@ TaskFileDestReceiverReceive(TupleTableSlot *slot, DestReceiver *dest)
 
 	StringInfo copyData = copyOutState->fe_msgbuf;
 
-	EState *executorState = taskFileDest->executorState;
-	MemoryContext executorTupleContext = GetPerTupleMemoryContext(executorState);
+	MemoryContext executorTupleContext = taskFileDest->tupleContext;
 	MemoryContext oldContext = MemoryContextSwitchTo(executorTupleContext);
 
 	slot_getallattrs(slot);
@@ -223,19 +221,21 @@ TaskFileDestReceiverReceive(TupleTableSlot *slot, DestReceiver *dest)
 	Datum *columnValues = slot->tts_values;
 	bool *columnNulls = slot->tts_isnull;
 
-	resetStringInfo(copyData);
-
 	/* construct row in COPY format */
 	AppendCopyRowData(columnValues, columnNulls, tupleDescriptor,
 					  copyOutState, columnOutputFunctions, NULL);
 
-	WriteToLocalFile(copyOutState->fe_msgbuf, taskFileDest);
+	if (copyData->len > COPY_BUFFER_SIZE)
+	{
+		WriteToLocalFile(copyOutState->fe_msgbuf, taskFileDest);
+		resetStringInfo(copyData);
+	}
 
 	MemoryContextSwitchTo(oldContext);
 
 	taskFileDest->tuplesSent++;
 
-	ResetPerTupleExprContext(executorState);
+	MemoryContextReset(executorTupleContext);
 
 	return true;
 }
@@ -254,6 +254,8 @@ WriteToLocalFile(StringInfo copyData, TaskFileDestReceiver *taskFileDest)
 		ereport(ERROR, (errcode_for_file_access(),
 						errmsg("could not append to file: %m")));
 	}
+
+	taskFileDest->bytesSent += bytesWritten;
 }
@@ -268,12 +270,18 @@ TaskFileDestReceiverShutdown(DestReceiver *destReceiver)
 	TaskFileDestReceiver *taskFileDest = (TaskFileDestReceiver *) destReceiver;
 	CopyOutState copyOutState = taskFileDest->copyOutState;
 
+	if (copyOutState->fe_msgbuf->len > 0)
+	{
+		WriteToLocalFile(copyOutState->fe_msgbuf, taskFileDest);
+		resetStringInfo(copyOutState->fe_msgbuf);
+	}
+
 	if (copyOutState->binary)
 	{
 		/* write footers when using binary encoding */
-		resetStringInfo(copyOutState->fe_msgbuf);
 		AppendCopyBinaryFooters(copyOutState);
 		WriteToLocalFile(copyOutState->fe_msgbuf, taskFileDest);
+		resetStringInfo(copyOutState->fe_msgbuf);
 	}
 
 	FileClose(taskFileDest->fileCompat.fd);
@@ -302,3 +310,15 @@ TaskFileDestReceiverDestroy(DestReceiver *destReceiver)
 	pfree(taskFileDest->filePath);
 	pfree(taskFileDest);
 }
+
+
+/*
+ * FileDestReceiverStats returns statistics for the given file dest receiver.
+ */
+void
+FileDestReceiverStats(DestReceiver *dest, uint64 *rowsSent, uint64 *bytesSent)
+{
+	TaskFileDestReceiver *fileDestReceiver = (TaskFileDestReceiver *) dest;
+	*rowsSent = fileDestReceiver->tuplesSent;
+	*bytesSent = fileDestReceiver->bytesSent;
+}
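For reference, the generalized file DestReceiver API added above is used the same way WorkerExecuteSqlTask() uses it; a minimal sketch (hypothetical file path, error handling omitted):

    EState *estate = CreateExecutorState();
    MemoryContext tupleContext = GetPerTupleMemoryContext(estate);

    /* stream query results into a local file in binary copy format */
    DestReceiver *fileDest =
        CreateFileDestReceiver("/tmp/task_result.data", tupleContext, true);
    ExecuteQueryIntoDestReceiver(query, NULL, fileDest);

    /* rows and bytes written, as later surfaced by worker_partition_query_result */
    uint64 rowsSent = 0;
    uint64 bytesSent = 0;
    FileDestReceiverStats(fileDest, &rowsSent, &bytesSent);

    fileDest->rDestroy(fileDest);
    FreeExecutorState(estate);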
@@ -29,6 +29,8 @@ extern void SendQueryResultViaCopy(const char *resultId);
 extern void ReceiveQueryResultViaCopy(const char *resultId);
 extern void RemoveIntermediateResultsDirectory(void);
 extern int64 IntermediateResultSize(char *resultId);
+extern char * QueryResultFileName(const char *resultId);
+extern char * CreateIntermediateResultsDirectory(void);
 
 
 #endif /* INTERMEDIATE_RESULTS_H */
@@ -155,6 +155,11 @@ extern Datum StringToDatum(char *inputString, Oid dataType);
 extern char * DatumToString(Datum datum, Oid dataType);
 extern int CompareShardPlacementsByWorker(const void *leftElement,
 										  const void *rightElement);
+extern ShardInterval * DeformedDistShardTupleToShardInterval(Datum *datumArray,
+															 bool *isNullArray,
+															 Oid intervalTypeId,
+															 int32 intervalTypeMod);
+extern void GetIntervalTypeInfo(char partitionMethod, Var *partitionColumn,
+								Oid *intervalTypeId, int32 *intervalTypeMod);
 
 #endif /* MASTER_METADATA_UTILITY_H */
@@ -140,13 +140,20 @@ extern void InvalidateMetadataSystemCache(void);
 extern Datum DistNodeMetadata(void);
 extern bool HasUniformHashDistribution(ShardInterval **shardIntervalArray,
 									   int shardIntervalArrayLength);
+extern bool HasUninitializedShardInterval(ShardInterval **sortedShardIntervalArray,
+										  int shardCount);
+extern bool HasOverlappingShardInterval(ShardInterval **shardIntervalArray,
+										int shardIntervalArrayLength,
+										Oid shardIntervalCollation,
+										FmgrInfo *shardIntervalSortCompareFunction);
 
 extern bool CitusHasBeenLoaded(void);
 extern bool CheckCitusVersion(int elevel);
 extern bool CheckAvailableVersion(int elevel);
-bool MajorVersionsCompatible(char *leftVersion, char *rightVersion);
+extern bool MajorVersionsCompatible(char *leftVersion, char *rightVersion);
+extern void ErrorIfInconsistentShardIntervals(DistTableCacheEntry *cacheEntry);
 extern void EnsureModificationsCanRun(void);
+extern char LookupDistributionMethod(Oid distributionMethodOid);
 
 /* access WorkerNodeHash */
 extern HTAB * GetWorkerNodeHash(void);
@@ -58,6 +58,7 @@ typedef FormData_pg_dist_shard *Form_pg_dist_shard;
 #define SHARD_STORAGE_FOREIGN 'f'
 #define SHARD_STORAGE_TABLE 't'
 #define SHARD_STORAGE_COLUMNAR 'c'
+#define SHARD_STORAGE_VIRTUAL 'v'
 
 
 #endif /* PG_DIST_SHARD_H */
@@ -136,6 +136,12 @@ extern StringInfo UserTaskFilename(StringInfo directoryName, uint32 taskId);
 extern List * ColumnDefinitionList(List *columnNameList, List *columnTypeList);
 extern CreateStmt * CreateStatement(RangeVar *relation, List *columnDefinitionList);
 extern CopyStmt * CopyStatement(RangeVar *relation, char *sourceFilename);
+extern DestReceiver * CreateFileDestReceiver(char *filePath,
+											 MemoryContext tupleContext,
+											 bool binaryCopyFormat);
+extern void FileDestReceiverStats(DestReceiver *dest,
+								  uint64 *rowsSent,
+								  uint64 *bytesSent);
 
 /* Function declaration for parsing tree node */
 extern Node * ParseTreeNode(const char *ddlCommand);
@ -0,0 +1,513 @@
|
||||||
|
-- Test functions for partitioning intermediate results
|
||||||
|
CREATE SCHEMA partitioned_intermediate_results;
|
||||||
|
SET search_path TO 'partitioned_intermediate_results';
|
||||||
|
-- hash partitioned intermediate results
|
||||||
|
BEGIN;
|
||||||
|
SELECT * FROM worker_partition_query_result('squares_hash',
|
||||||
|
'SELECT i, i * i FROM generate_series(1, 10) i', 0, 'hash',
|
||||||
|
'{-2147483648,-1073741824,0,1073741824}'::text[],
|
||||||
|
'{-1073741825,-1,1073741823,2147483647}'::text[], false);
|
||||||
|
partition_index | rows_written | bytes_written
|
||||||
|
-----------------+--------------+---------------
|
||||||
|
0 | 4 | 21
|
||||||
|
1 | 3 | 14
|
||||||
|
2 | 1 | 5
|
||||||
|
3 | 2 | 9
|
||||||
|
(4 rows)
|
||||||
|
|
||||||
|
SELECT hashint4(x), x, x2 FROM
|
||||||
|
read_intermediate_result('squares_hash_0', 'text') AS res (x int, x2 int)
|
||||||
|
ORDER BY x;
|
||||||
|
hashint4 | x | x2
|
||||||
|
-------------+----+-----
|
||||||
|
-1905060026 | 1 | 1
|
||||||
|
-1330264708 | 5 | 25
|
||||||
|
-2047600124 | 8 | 64
|
||||||
|
-1547814713 | 10 | 100
|
||||||
|
(4 rows)
|
||||||
|
|
||||||
|
SELECT hashint4(x), x, x2 FROM
|
||||||
|
read_intermediate_result('squares_hash_1', 'text') AS res (x int, x2 int)
|
||||||
|
ORDER BY x;
|
||||||
|
hashint4 | x | x2
|
||||||
|
-------------+---+----
|
||||||
|
-28094569 | 3 | 9
|
||||||
|
-1011077333 | 4 | 16
|
||||||
|
-978793473 | 7 | 49
|
||||||
|
(3 rows)
|
||||||
|
|
||||||
|
SELECT hashint4(x), x, x2 FROM
|
||||||
|
read_intermediate_result('squares_hash_2', 'text') AS res (x int, x2 int)
|
||||||
|
ORDER BY x;
|
||||||
|
hashint4 | x | x2
|
||||||
|
-----------+---+----
|
||||||
|
566031088 | 6 | 36
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT hashint4(x), x, x2 FROM
|
||||||
|
read_intermediate_result('squares_hash_3', 'text') AS res (x int, x2 int)
|
||||||
|
ORDER BY x;
|
||||||
|
hashint4 | x | x2
|
||||||
|
------------+---+----
|
||||||
|
1134484726 | 2 | 4
|
||||||
|
1672378334 | 9 | 81
|
||||||
|
(2 rows)
|
||||||
|
|
||||||
|
END;
-- range partitioned intermediate results
BEGIN;
SELECT * FROM worker_partition_query_result('squares_range',
                                            'SELECT i, i * i FROM generate_series(1, 10) i',
                                            1, /* partition by x^2 */
                                            'range',
                                            '{0,21,41,61}'::text[],
                                            '{20,40,60,100}'::text[],
                                            true /* binary format */);
 partition_index | rows_written | bytes_written
-----------------+--------------+---------------
               0 |            4 |            93
               1 |            2 |            57
               2 |            1 |            39
               3 |            3 |            75
(4 rows)

SELECT x, x2 FROM
read_intermediate_result('squares_range_0', 'binary') AS res (x int, x2 int)
ORDER BY x;
 x | x2
---+----
 1 |  1
 2 |  4
 3 |  9
 4 | 16
(4 rows)

SELECT x, x2 FROM
read_intermediate_result('squares_range_1', 'binary') AS res (x int, x2 int)
ORDER BY x;
 x | x2
---+----
 5 | 25
 6 | 36
(2 rows)

SELECT x, x2 FROM
read_intermediate_result('squares_range_2', 'binary') AS res (x int, x2 int)
ORDER BY x;
 x | x2
---+----
 7 | 49
(1 row)

SELECT x, x2 FROM
read_intermediate_result('squares_range_3', 'binary') AS res (x int, x2 int)
ORDER BY x;
 x  | x2
----+-----
  8 |  64
  9 |  81
 10 | 100
(3 rows)

END;
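
-- Editor's sketch (assumed, not part of the committed test): the result is
-- partitioned by the second column (x^2), so the expected partition index for
-- each row follows straight from the min/max arrays above:
SELECT i, i * i AS x2,
       CASE WHEN i * i <= 20 THEN 0
            WHEN i * i <= 40 THEN 1
            WHEN i * i <= 60 THEN 2
            ELSE 3
       END AS expected_partition
FROM generate_series(1, 10) i
ORDER BY i;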
-- 1M rows, just in case. text format.
BEGIN;
SELECT * FROM worker_partition_query_result('doubles_hash',
                                            'SELECT i, i * 2 FROM generate_series(1, 1000000) i', 0, 'hash',
                                            '{-2147483648,-1073741824,0,1073741824}'::text[],
                                            '{-1073741825,-1,1073741823,2147483647}'::text[], false);
 partition_index | rows_written | bytes_written
-----------------+--------------+---------------
               0 |       250199 |       3586179
               1 |       249872 |       3581280
               2 |       250278 |       3587487
               3 |       249651 |       3578401
(4 rows)

SELECT count(*) FROM read_intermediate_results(ARRAY['doubles_hash_0',
                                                     'doubles_hash_1',
                                                     'doubles_hash_2',
                                                     'doubles_hash_3'], 'text') AS res (x int, x2 int);
  count
---------
 1000000
(1 row)

END;
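
-- Editor's sketch (assumed, not part of the committed test): in text format a
-- partition file is just tab-separated rows with trailing newlines, so
-- bytes_written varies with each value's digit count. Assuming the same
-- equal-width hash ranges as above, this should recompute partition 0's
-- rows_written and bytes_written:
SELECT count(*) AS rows_written,
       sum(length(i::text) + 1 + length((i * 2)::text) + 1) AS bytes_written
FROM generate_series(1, 1000000) i
WHERE width_bucket(hashint4(i)::numeric, -2147483648, 2147483648, 4) = 1;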
-- 1M rows, just in case. binary format.
BEGIN;
SELECT * FROM worker_partition_query_result('doubles_range',
                                            'SELECT i, i * 2 FROM generate_series(1, 1000000) i', 0, 'range',
                                            '{0,250001,500001,750001}'::text[],
                                            '{250000,500000,750000,1000000}'::text[], true);
 partition_index | rows_written | bytes_written
-----------------+--------------+---------------
               0 |       250000 |       4500021
               1 |       250000 |       4500021
               2 |       250000 |       4500021
               3 |       250000 |       4500021
(4 rows)

SELECT count(*) FROM read_intermediate_results(ARRAY['doubles_range_0',
                                                     'doubles_range_1',
                                                     'doubles_range_2',
                                                     'doubles_range_3'], 'binary') AS res (x int, x2 int);
  count
---------
 1000000
(1 row)

END;
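
-- Editor's sketch (assumed, not part of the committed test): the identical
-- bytes_written values are expected in binary format. Each (int4, int4) row
-- takes 18 bytes (a 2-byte field count plus two 4-byte length words and two
-- 4-byte values), and every partition file adds the 19-byte COPY BINARY
-- header and the 2-byte trailer:
SELECT 250000 * (2 + 2 * (4 + 4)) + 19 + 2 AS bytes_written;  -- 4500021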
--
-- Some error cases
--
-- not allowed outside transaction block
SELECT * FROM worker_partition_query_result('squares_range',
                                            'SELECT i, i * i FROM generate_series(1, 10) i',
                                            1, 'range', '{0}'::text[], '{20}'::text[], true);
ERROR:  worker_partition_query_result can only be used in a transaction block
BEGIN;
SAVEPOINT s1;
-- syntax error in query
SELECT worker_partition_query_result('squares_range',
                                     'SELECxT i, i * i FROM generate_series(1, 10) i',
                                     1, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  syntax error at or near "SELECxT"
LINE 1: SELECT worker_partition_query_result('squares_range',
               ^
ROLLBACK TO SAVEPOINT s1;
-- invalid result prefix
SELECT worker_partition_query_result('squares_range/a/',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  result key "squares_range/a/" contains invalid character
HINT:  Result keys may only contain letters, numbers, underscores and hyphens.
ROLLBACK TO SAVEPOINT s1;
-- empty min/max values
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'range', ARRAY[]::text[], ARRAY[]::text[], true);
ERROR:  number of partitions cannot be 0
ROLLBACK TO SAVEPOINT s1;
-- append partitioning
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'append',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  only hash and range partitiong schemes are supported
ROLLBACK TO SAVEPOINT s1;
-- query with no results
CREATE TABLE t(a int);
SELECT worker_partition_query_result('squares_range',
                                     'INSERT INTO t VALUES (1), (2)',
                                     1, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  query must generate a set of rows
ROLLBACK TO SAVEPOINT s1;
-- negative partition index
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     -1, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  partition column index must be between 0 and 1
ROLLBACK TO SAVEPOINT s1;
-- too large partition index
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     2, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  partition column index must be between 0 and 1
ROLLBACK TO SAVEPOINT s1;
-- min/max values of different lengths
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'range',
                                     '{0,21,41,61,101}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  min values and max values must have the same number of elements
ROLLBACK TO SAVEPOINT s1;
-- null values in min/max values of hash partitioned results
SELECT worker_partition_query_result('squares_hash',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'hash',
                                     '{NULL,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  hash partitioned table has uninitialized shards
ROLLBACK TO SAVEPOINT s1;
-- multiple queries
SELECT worker_partition_query_result('squares_hash',
                                     'SELECT i, i * i FROM generate_series(1, 10) i; SELECT 4, 16;',
                                     1, 'hash',
                                     '{NULL,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ERROR:  cannot execute multiple utility events
ROLLBACK TO SAVEPOINT s1;
ROLLBACK;
--
-- Procedure for conveniently testing worker_partition_query_result(). It uses
-- worker_partition_query_result to partition the result of query using the same
-- scheme as the distributed table rel, and then checks whether it partitioned
-- the rows the same way as the shards of rel.
--
CREATE OR REPLACE PROCEDURE test_partition_query_results(rel regclass, query text,
                                                         binaryCopy boolean DEFAULT true)
AS $$
DECLARE
    partition_min_values text[];
    partition_max_values text[];
    partition_column_index int;
    partition_method citus.distribution_type;
    partitioned_results_row_counts text[];
    distributed_table_row_counts text[];
    tuple_def text;
    partition_result_names text[];
    non_empty_partitions int[];
    rows_different int;
BEGIN
    -- get tuple definition
    SELECT string_agg(a.attname || ' ' || pg_catalog.format_type(a.atttypid, a.atttypmod), ', ' ORDER BY a.attnum)
    INTO tuple_def
    FROM pg_catalog.pg_attribute a
    WHERE a.attrelid = rel::oid AND a.attnum > 0 AND NOT a.attisdropped;

    -- get min/max value arrays
    SELECT array_agg(shardminvalue ORDER BY shardid),
           array_agg(shardmaxvalue ORDER BY shardid)
    INTO partition_min_values, partition_max_values
    FROM pg_dist_shard
    WHERE logicalrelid=rel;

    -- get partition column index and partition method
    SELECT (regexp_matches(partkey, ':varattno ([0-9]+)'))[1]::int - 1,
           (CASE WHEN partmethod='h' THEN 'hash' ELSE 'range' END)
    INTO partition_column_index, partition_method
    FROM pg_dist_partition
    WHERE logicalrelid=rel;

    -- insert into the distributed table
    EXECUTE 'INSERT INTO ' || rel::text || ' ' || query;

    -- repartition the query locally
    SELECT array_agg(rows_written::text ORDER BY partition_index),
           array_agg(partition_index) FILTER (WHERE rows_written > 0)
    INTO partitioned_results_row_counts,
         non_empty_partitions
    FROM worker_partition_query_result('test_prefix', query, partition_column_index,
                                       partition_method, partition_min_values,
                                       partition_max_values, binaryCopy);

    SELECT array_agg('test_prefix_' || i::text)
    INTO partition_result_names
    FROM unnest(non_empty_partitions) i;

    EXECUTE 'SELECT count(*) FROM ((' || query || ') EXCEPT (SELECT * FROM read_intermediate_results($1,$2) AS res (' || tuple_def || '))) t'
    INTO rows_different
    USING partition_result_names, (CASE WHEN binaryCopy THEN 'binary' ELSE 'text' END)::pg_catalog.citus_copy_format;

    -- commit so results are available in run_command_on_shards
    COMMIT;

    -- rows per shard of the distributed table
    SELECT array_agg(result ORDER BY shardid) INTO distributed_table_row_counts
    FROM run_command_on_shards(rel, 'SELECT count(*) FROM %s');

    IF partitioned_results_row_counts = distributed_table_row_counts THEN
        RAISE NOTICE 'Rows per partition match ...';
    ELSE
        RAISE 'FAILED: rows per partition do not match, expecting % got %', distributed_table_row_counts, partitioned_results_row_counts;
    END IF;

    IF rows_different = 0 THEN
        RAISE NOTICE 'Row values match ...';
    ELSE
        RAISE 'FAILED: Could not find % of expected rows in partitions', rows_different;
    END IF;

    RAISE NOTICE 'PASSED.';
END;
$$ LANGUAGE plpgsql;
--
-- Procedure for creating shards for a range partitioned distributed table.
--
CREATE OR REPLACE PROCEDURE create_range_partitioned_shards(rel regclass, minvalues text[], maxvalues text[])
AS $$
DECLARE
    new_shardid bigint;
    idx int;
BEGIN
    FOR idx IN SELECT * FROM generate_series(1, array_length(minvalues, 1))
    LOOP
        SELECT master_create_empty_shard(rel::text) INTO new_shardid;
        UPDATE pg_dist_shard SET shardminvalue=minvalues[idx], shardmaxvalue=maxvalues[idx] WHERE shardid=new_shardid;
    END LOOP;
END;
$$ LANGUAGE plpgsql;
\set VERBOSITY terse
-- hash partitioning, 32 shards
SET citus.shard_count TO 32;
CREATE TABLE t(a int, b int);
SELECT create_distributed_table('t', 'a');
 create_distributed_table
--------------------------

(1 row)

CALL test_partition_query_results('t', 'SELECT x, x * x FROM generate_series(1, 100) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
-- hash partitioning, 1 shard
SET citus.shard_count TO 1;
CREATE TABLE t(a int, b int);
SELECT create_distributed_table('t', 'a');
 create_distributed_table
--------------------------

(1 row)

CALL test_partition_query_results('t', 'SELECT x, x * x FROM generate_series(1, 100) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
-- hash partitioning, 17 shards (so hash partitions aren't uniform)
SET citus.shard_count TO 17;
CREATE TABLE t(a int, b int);
SELECT create_distributed_table('t', 'a');
 create_distributed_table
--------------------------

(1 row)

CALL test_partition_query_results('t', 'SELECT x, x * x FROM generate_series(1, 100) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
-- hash partitioning, date partition column
SET citus.shard_count TO 8;
CREATE TABLE t(a DATE, b int);
SELECT create_distributed_table('t', 'a');
 create_distributed_table
--------------------------

(1 row)

CALL test_partition_query_results('t', 'SELECT (''1985-05-18''::date + (x::text || '' days'')::interval)::date, x * x FROM generate_series(1, 100) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
-- hash partitioning, int4 range partition column
SET citus.shard_count TO 8;
CREATE TABLE t(a int4range, b int);
SELECT create_distributed_table('t', 'a');
 create_distributed_table
--------------------------

(1 row)

CALL test_partition_query_results('t', 'SELECT int4range(x,2*x+10), x * x FROM generate_series(1, 100) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
-- range partitioning, int partition column
CREATE TABLE t(key int, value int);
SELECT create_distributed_table('t', 'key', 'range');
 create_distributed_table
--------------------------

(1 row)

CALL create_range_partitioned_shards('t', '{0,25,50,76}',
                                     '{24,49,75,200}');
CALL test_partition_query_results('t', 'SELECT x, x * x * x FROM generate_series(1, 105) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
-- not covering ranges, should ERROR
CREATE TABLE t(key int, value int);
SELECT create_distributed_table('t', 'key', 'range');
 create_distributed_table
--------------------------

(1 row)

CALL create_range_partitioned_shards('t', '{0,25,50,100}',
                                     '{24,49,75,200}');
CALL test_partition_query_results('t', 'SELECT x, x * x * x FROM generate_series(1, 105) x');
ERROR:  could not find shard for partition column value
DROP TABLE t;
-- overlapping ranges, we allow this in range partitioned distributed tables, should be fine
CREATE TABLE t(key int, value int);
SELECT create_distributed_table('t', 'key', 'range');
 create_distributed_table
--------------------------

(1 row)

CALL create_range_partitioned_shards('t', '{0,25,50,76}',
                                     '{50,49,90,200}');
CALL test_partition_query_results('t', 'SELECT x, x * x * x FROM generate_series(1, 105) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
-- range partitioning, composite partition column
CREATE TYPE composite_key_type AS (f1 int, f2 text);
SET citus.shard_count TO 8;
CREATE TABLE t(key composite_key_type, value int);
SELECT create_distributed_table('t', 'key', 'range');
 create_distributed_table
--------------------------

(1 row)

CALL create_range_partitioned_shards('t', '{"(0,a)","(25,a)","(50,a)","(75,a)"}',
                                     '{"(24,z)","(49,z)","(74,z)","(100,z)"}');
CALL test_partition_query_results('t', 'SELECT (x, ''f2_'' || x::text)::composite_key_type, x * x * x FROM generate_series(1, 100) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
DROP TYPE composite_key_type;
-- unsorted ranges
CREATE TABLE t(key int, value int);
SELECT create_distributed_table('t', 'key', 'range');
 create_distributed_table
--------------------------

(1 row)

CALL create_range_partitioned_shards('t', '{50,25,76,0}',
                                     '{75,49,200,24}');
CALL test_partition_query_results('t', 'SELECT x, x * x * x FROM generate_series(1, 105) x');
NOTICE:  Rows per partition match ...
NOTICE:  Row values match ...
NOTICE:  PASSED.
DROP TABLE t;
SET client_min_messages TO WARNING;
DROP SCHEMA partitioned_intermediate_results CASCADE;
\set VERBOSITY default
SET client_min_messages TO DEFAULT;
SET citus.shard_count TO DEFAULT;

@@ -70,7 +70,7 @@ test: subquery_prepared_statements pg12
# Miscellaneous tests to check our query planning behavior
# ----------
test: multi_deparse_shard_query multi_distributed_transaction_id multi_real_time_transaction intermediate_results limit_intermediate_size
test: multi_explain hyperscale_tutorial
test: multi_explain hyperscale_tutorial partitioned_intermediate_results
test: multi_basic_queries multi_complex_expressions multi_subquery multi_subquery_complex_queries multi_subquery_behavioral_analytics
test: multi_subquery_complex_reference_clause multi_subquery_window_functions multi_view multi_sql_function multi_prepare_sql
test: sql_procedure multi_function_in_join row_types materialized_view


@@ -0,0 +1,368 @@
-- Test functions for partitioning intermediate results
CREATE SCHEMA partitioned_intermediate_results;
SET search_path TO 'partitioned_intermediate_results';

-- hash partitioned intermediate results
BEGIN;
SELECT * FROM worker_partition_query_result('squares_hash',
                                            'SELECT i, i * i FROM generate_series(1, 10) i', 0, 'hash',
                                            '{-2147483648,-1073741824,0,1073741824}'::text[],
                                            '{-1073741825,-1,1073741823,2147483647}'::text[], false);
SELECT hashint4(x), x, x2 FROM
read_intermediate_result('squares_hash_0', 'text') AS res (x int, x2 int)
ORDER BY x;

SELECT hashint4(x), x, x2 FROM
read_intermediate_result('squares_hash_1', 'text') AS res (x int, x2 int)
ORDER BY x;

SELECT hashint4(x), x, x2 FROM
read_intermediate_result('squares_hash_2', 'text') AS res (x int, x2 int)
ORDER BY x;

SELECT hashint4(x), x, x2 FROM
read_intermediate_result('squares_hash_3', 'text') AS res (x int, x2 int)
ORDER BY x;

END;

-- range partitioned intermediate results
BEGIN;
SELECT * FROM worker_partition_query_result('squares_range',
                                            'SELECT i, i * i FROM generate_series(1, 10) i',
                                            1, /* partition by x^2 */
                                            'range',
                                            '{0,21,41,61}'::text[],
                                            '{20,40,60,100}'::text[],
                                            true /* binary format */);
SELECT x, x2 FROM
read_intermediate_result('squares_range_0', 'binary') AS res (x int, x2 int)
ORDER BY x;

SELECT x, x2 FROM
read_intermediate_result('squares_range_1', 'binary') AS res (x int, x2 int)
ORDER BY x;

SELECT x, x2 FROM
read_intermediate_result('squares_range_2', 'binary') AS res (x int, x2 int)
ORDER BY x;

SELECT x, x2 FROM
read_intermediate_result('squares_range_3', 'binary') AS res (x int, x2 int)
ORDER BY x;

END;

-- 1M rows, just in case. text format.
BEGIN;
SELECT * FROM worker_partition_query_result('doubles_hash',
                                            'SELECT i, i * 2 FROM generate_series(1, 1000000) i', 0, 'hash',
                                            '{-2147483648,-1073741824,0,1073741824}'::text[],
                                            '{-1073741825,-1,1073741823,2147483647}'::text[], false);
SELECT count(*) FROM read_intermediate_results(ARRAY['doubles_hash_0',
                                                     'doubles_hash_1',
                                                     'doubles_hash_2',
                                                     'doubles_hash_3'], 'text') AS res (x int, x2 int);
END;

-- 1M rows, just in case. binary format.
BEGIN;
SELECT * FROM worker_partition_query_result('doubles_range',
                                            'SELECT i, i * 2 FROM generate_series(1, 1000000) i', 0, 'range',
                                            '{0,250001,500001,750001}'::text[],
                                            '{250000,500000,750000,1000000}'::text[], true);
SELECT count(*) FROM read_intermediate_results(ARRAY['doubles_range_0',
                                                     'doubles_range_1',
                                                     'doubles_range_2',
                                                     'doubles_range_3'], 'binary') AS res (x int, x2 int);
END;

--
-- Some error cases
--

-- not allowed outside transaction block
SELECT * FROM worker_partition_query_result('squares_range',
                                            'SELECT i, i * i FROM generate_series(1, 10) i',
                                            1, 'range', '{0}'::text[], '{20}'::text[], true);

BEGIN;
SAVEPOINT s1;
-- syntax error in query
SELECT worker_partition_query_result('squares_range',
                                     'SELECxT i, i * i FROM generate_series(1, 10) i',
                                     1, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;

-- invalid result prefix
SELECT worker_partition_query_result('squares_range/a/',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;

-- empty min/max values
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'range', ARRAY[]::text[], ARRAY[]::text[], true);
ROLLBACK TO SAVEPOINT s1;

-- append partitioning
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'append',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;

-- query with no results
CREATE TABLE t(a int);
SELECT worker_partition_query_result('squares_range',
                                     'INSERT INTO t VALUES (1), (2)',
                                     1, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;

-- negative partition index
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     -1, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;

-- too large partition index
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     2, 'range',
                                     '{0,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;

-- min/max values of different lengths
SELECT worker_partition_query_result('squares_range',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'range',
                                     '{0,21,41,61,101}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;

-- null values in min/max values of hash partitioned results
SELECT worker_partition_query_result('squares_hash',
                                     'SELECT i, i * i FROM generate_series(1, 10) i',
                                     1, 'hash',
                                     '{NULL,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;

-- multiple queries
SELECT worker_partition_query_result('squares_hash',
                                     'SELECT i, i * i FROM generate_series(1, 10) i; SELECT 4, 16;',
                                     1, 'hash',
                                     '{NULL,21,41,61}'::text[],
                                     '{20,40,60,100}'::text[],
                                     true);
ROLLBACK TO SAVEPOINT s1;
ROLLBACK;

--
-- Procedure for conveniently testing worker_partition_query_result(). It uses
-- worker_partition_query_result to partition the result of query using the same
-- scheme as the distributed table rel, and then checks whether it partitioned
-- the rows the same way as the shards of rel.
--
CREATE OR REPLACE PROCEDURE test_partition_query_results(rel regclass, query text,
                                                         binaryCopy boolean DEFAULT true)
AS $$
DECLARE
    partition_min_values text[];
    partition_max_values text[];
    partition_column_index int;
    partition_method citus.distribution_type;
    partitioned_results_row_counts text[];
    distributed_table_row_counts text[];
    tuple_def text;
    partition_result_names text[];
    non_empty_partitions int[];
    rows_different int;
BEGIN
    -- get tuple definition
    SELECT string_agg(a.attname || ' ' || pg_catalog.format_type(a.atttypid, a.atttypmod), ', ' ORDER BY a.attnum)
    INTO tuple_def
    FROM pg_catalog.pg_attribute a
    WHERE a.attrelid = rel::oid AND a.attnum > 0 AND NOT a.attisdropped;

    -- get min/max value arrays
    SELECT array_agg(shardminvalue ORDER BY shardid),
           array_agg(shardmaxvalue ORDER BY shardid)
    INTO partition_min_values, partition_max_values
    FROM pg_dist_shard
    WHERE logicalrelid=rel;

    -- get partition column index and partition method
    SELECT (regexp_matches(partkey, ':varattno ([0-9]+)'))[1]::int - 1,
           (CASE WHEN partmethod='h' THEN 'hash' ELSE 'range' END)
    INTO partition_column_index, partition_method
    FROM pg_dist_partition
    WHERE logicalrelid=rel;

    -- insert into the distributed table
    EXECUTE 'INSERT INTO ' || rel::text || ' ' || query;

    -- repartition the query locally
    SELECT array_agg(rows_written::text ORDER BY partition_index),
           array_agg(partition_index) FILTER (WHERE rows_written > 0)
    INTO partitioned_results_row_counts,
         non_empty_partitions
    FROM worker_partition_query_result('test_prefix', query, partition_column_index,
                                       partition_method, partition_min_values,
                                       partition_max_values, binaryCopy);

    SELECT array_agg('test_prefix_' || i::text)
    INTO partition_result_names
    FROM unnest(non_empty_partitions) i;

    EXECUTE 'SELECT count(*) FROM ((' || query || ') EXCEPT (SELECT * FROM read_intermediate_results($1,$2) AS res (' || tuple_def || '))) t'
    INTO rows_different
    USING partition_result_names, (CASE WHEN binaryCopy THEN 'binary' ELSE 'text' END)::pg_catalog.citus_copy_format;

    -- commit so results are available in run_command_on_shards
    COMMIT;

    -- rows per shard of the distributed table
    SELECT array_agg(result ORDER BY shardid) INTO distributed_table_row_counts
    FROM run_command_on_shards(rel, 'SELECT count(*) FROM %s');

    IF partitioned_results_row_counts = distributed_table_row_counts THEN
        RAISE NOTICE 'Rows per partition match ...';
    ELSE
        RAISE 'FAILED: rows per partition do not match, expecting % got %', distributed_table_row_counts, partitioned_results_row_counts;
    END IF;

    IF rows_different = 0 THEN
        RAISE NOTICE 'Row values match ...';
    ELSE
        RAISE 'FAILED: Could not find % of expected rows in partitions', rows_different;
    END IF;

    RAISE NOTICE 'PASSED.';
END;
$$ LANGUAGE plpgsql;

--
-- Procedure for creating shards for a range partitioned distributed table.
--
CREATE OR REPLACE PROCEDURE create_range_partitioned_shards(rel regclass, minvalues text[], maxvalues text[])
AS $$
DECLARE
    new_shardid bigint;
    idx int;
BEGIN
    FOR idx IN SELECT * FROM generate_series(1, array_length(minvalues, 1))
    LOOP
        SELECT master_create_empty_shard(rel::text) INTO new_shardid;
        UPDATE pg_dist_shard SET shardminvalue=minvalues[idx], shardmaxvalue=maxvalues[idx] WHERE shardid=new_shardid;
    END LOOP;
END;
$$ LANGUAGE plpgsql;

\set VERBOSITY terse

-- hash partitioning, 32 shards
SET citus.shard_count TO 32;
CREATE TABLE t(a int, b int);
SELECT create_distributed_table('t', 'a');
CALL test_partition_query_results('t', 'SELECT x, x * x FROM generate_series(1, 100) x');
DROP TABLE t;

-- hash partitioning, 1 shard
SET citus.shard_count TO 1;
CREATE TABLE t(a int, b int);
SELECT create_distributed_table('t', 'a');
CALL test_partition_query_results('t', 'SELECT x, x * x FROM generate_series(1, 100) x');
DROP TABLE t;

-- hash partitioning, 17 shards (so hash partitions aren't uniform)
SET citus.shard_count TO 17;
CREATE TABLE t(a int, b int);
SELECT create_distributed_table('t', 'a');
CALL test_partition_query_results('t', 'SELECT x, x * x FROM generate_series(1, 100) x');
DROP TABLE t;

-- hash partitioning, date partition column
SET citus.shard_count TO 8;
CREATE TABLE t(a DATE, b int);
SELECT create_distributed_table('t', 'a');
CALL test_partition_query_results('t', 'SELECT (''1985-05-18''::date + (x::text || '' days'')::interval)::date, x * x FROM generate_series(1, 100) x');
DROP TABLE t;

-- hash partitioning, int4 range partition column
SET citus.shard_count TO 8;
CREATE TABLE t(a int4range, b int);
SELECT create_distributed_table('t', 'a');
CALL test_partition_query_results('t', 'SELECT int4range(x,2*x+10), x * x FROM generate_series(1, 100) x');
DROP TABLE t;

-- range partitioning, int partition column
CREATE TABLE t(key int, value int);
SELECT create_distributed_table('t', 'key', 'range');
CALL create_range_partitioned_shards('t', '{0,25,50,76}',
                                     '{24,49,75,200}');
CALL test_partition_query_results('t', 'SELECT x, x * x * x FROM generate_series(1, 105) x');
DROP TABLE t;

-- not covering ranges, should ERROR
CREATE TABLE t(key int, value int);
SELECT create_distributed_table('t', 'key', 'range');
CALL create_range_partitioned_shards('t', '{0,25,50,100}',
                                     '{24,49,75,200}');
CALL test_partition_query_results('t', 'SELECT x, x * x * x FROM generate_series(1, 105) x');
DROP TABLE t;

-- overlapping ranges, we allow this in range partitioned distributed tables, should be fine
CREATE TABLE t(key int, value int);
SELECT create_distributed_table('t', 'key', 'range');
CALL create_range_partitioned_shards('t', '{0,25,50,76}',
                                     '{50,49,90,200}');
CALL test_partition_query_results('t', 'SELECT x, x * x * x FROM generate_series(1, 105) x');
DROP TABLE t;

-- range partitioning, composite partition column
CREATE TYPE composite_key_type AS (f1 int, f2 text);
SET citus.shard_count TO 8;
CREATE TABLE t(key composite_key_type, value int);
SELECT create_distributed_table('t', 'key', 'range');
CALL create_range_partitioned_shards('t', '{"(0,a)","(25,a)","(50,a)","(75,a)"}',
                                     '{"(24,z)","(49,z)","(74,z)","(100,z)"}');
CALL test_partition_query_results('t', 'SELECT (x, ''f2_'' || x::text)::composite_key_type, x * x * x FROM generate_series(1, 100) x');
DROP TABLE t;
DROP TYPE composite_key_type;

-- unsorted ranges
CREATE TABLE t(key int, value int);
SELECT create_distributed_table('t', 'key', 'range');
CALL create_range_partitioned_shards('t', '{50,25,76,0}',
                                     '{75,49,200,24}');
CALL test_partition_query_results('t', 'SELECT x, x * x * x FROM generate_series(1, 105) x');
DROP TABLE t;

SET client_min_messages TO WARNING;
DROP SCHEMA partitioned_intermediate_results CASCADE;

\set VERBOSITY default
SET client_min_messages TO DEFAULT;
SET citus.shard_count TO DEFAULT;