/*-------------------------------------------------------------------------
 *
 * worker_partition_protocol.c
 *
 * Routines for partitioning table data into multiple files. Table partitioning
 * is one of the three distributed execution primitives that we apply on worker
 * nodes; and when partitioning data, we follow Hadoop's naming conventions as
 * much as possible.
 *
 * Copyright (c) 2012, Citus Data, Inc.
 *
 * $Id$
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"
#include "funcapi.h"

#include <sys/stat.h>
#include <unistd.h>
#ifdef HAVE_INTTYPES_H
#include <inttypes.h>
#endif

#include "access/hash.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "catalog/pg_collation.h"
#include "commands/copy.h"
#include "commands/defrem.h"
#include "distributed/resource_lock.h"
#include "distributed/transmit.h"
#include "distributed/worker_protocol.h"
#include "executor/spi.h"
#include "mb/pg_wchar.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"


/* Config variables managed via guc.c */
bool BinaryWorkerCopyFormat = false;	/* binary format for copying between workers */
int PartitionBufferSize = 16384;		/* total partitioning buffer size in KB */

/* Local variables */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
static uint32 FileBufferSizeInBytes = 0; /* file buffer size to init later */


/* Local functions forward declarations */
static StringInfo InitTaskAttemptDirectory(uint64 jobId, uint32 taskId);
static uint32 FileBufferSize(int partitionBufferSizeInKB, uint32 fileCount);
static FileOutputStream * OpenPartitionFiles(StringInfo directoryName, uint32 fileCount);
static void ClosePartitionFiles(FileOutputStream *partitionFileArray, uint32 fileCount);
static void RenameDirectory(StringInfo oldDirectoryName, StringInfo newDirectoryName);
static void FileOutputStreamWrite(FileOutputStream file, StringInfo dataToWrite);
static void FileOutputStreamFlush(FileOutputStream file);
static void FilterAndPartitionTable(const char *filterQuery,
									const char *columnName, Oid columnType,
									uint32 (*PartitionIdFunction)(Datum, const void *),
									const void *partitionIdContext,
									FileOutputStream *partitionFileArray,
									uint32 fileCount);
static int ColumnIndex(TupleDesc rowDescriptor, const char *columnName);
static FmgrInfo * ColumnOutputFunctions(TupleDesc rowDescriptor, bool binaryFormat);
static PartialCopyState InitRowOutputState(void);
static void ClearRowOutputState(PartialCopyState copyState);
static void OutputRow(HeapTuple row, TupleDesc rowDescriptor,
					  PartialCopyState rowOutputState, FmgrInfo *columnOutputFunctions);
static void OutputBinaryHeaders(FileOutputStream *partitionFileArray, uint32 fileCount);
static void OutputBinaryFooters(FileOutputStream *partitionFileArray, uint32 fileCount);
static void CopySendData(PartialCopyState outputState, const void *databuf, int datasize);
static void CopySendString(PartialCopyState outputState, const char *str);
static void CopySendChar(PartialCopyState outputState, char c);
static void CopySendInt32(PartialCopyState outputState, int32 val);
static void CopySendInt16(PartialCopyState outputState, int16 val);
static void CopyAttributeOutText(PartialCopyState outputState, char *string);
static inline void CopyFlushOutput(PartialCopyState outputState,
								   char *start, char *pointer);
static uint32 RangePartitionId(Datum partitionValue, const void *context);
static uint32 HashPartitionId(Datum partitionValue, const void *context);


/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(worker_range_partition_table);
PG_FUNCTION_INFO_V1(worker_hash_partition_table);


/*
 * worker_range_partition_table executes the given filter query, repartitions
 * the filter query's results on a partitioning column, and writes the resulting
 * rows to a set of text files on local disk. The function then atomically
 * renames the directory in which the text files live to ensure deterministic
 * behavior.
 *
 * This function applies range partitioning through the use of a function
 * pointer and a range context object; for details, see RangePartitionId().
 */
Datum
worker_range_partition_table(PG_FUNCTION_ARGS)
{
	uint64 jobId = PG_GETARG_INT64(0);
	uint32 taskId = PG_GETARG_UINT32(1);
	text *filterQueryText = PG_GETARG_TEXT_P(2);
	text *partitionColumnText = PG_GETARG_TEXT_P(3);
	Oid partitionColumnType = PG_GETARG_OID(4);
	ArrayType *splitPointObject = PG_GETARG_ARRAYTYPE_P(5);

	const char *filterQuery = text_to_cstring(filterQueryText);
	const char *partitionColumn = text_to_cstring(partitionColumnText);

	RangePartitionContext *partitionContext = NULL;
	FmgrInfo *comparisonFunction = NULL;
	Datum *splitPointArray = NULL;
	int32 splitPointCount = 0;
	uint32 fileCount = 0;
	StringInfo taskDirectory = NULL;
	StringInfo taskAttemptDirectory = NULL;
	FileOutputStream *partitionFileArray = NULL;

	/* first check that array element's and partition column's types match */
	Oid splitPointType = ARR_ELEMTYPE(splitPointObject);
	if (splitPointType != partitionColumnType)
	{
		ereport(ERROR, (errmsg("partition column type %u and split point type %u "
							   "do not match", partitionColumnType, splitPointType)));
	}

	/* use column's type information to get the comparison function */
	comparisonFunction = GetFunctionInfo(partitionColumnType,
										 BTREE_AM_OID, BTORDER_PROC);

	/* deserialize split points into their array representation */
	splitPointArray = DeconstructArrayObject(splitPointObject);
	splitPointCount = ArrayObjectCount(splitPointObject);
	fileCount = splitPointCount + 1;	/* range partitioning needs an extra bucket */

	/* create range partition context object */
	partitionContext = palloc0(sizeof(RangePartitionContext));
	partitionContext->comparisonFunction = comparisonFunction;
	partitionContext->splitPointArray = splitPointArray;
	partitionContext->splitPointCount = splitPointCount;

	/* init directories and files to write the partitioned data to */
	taskDirectory = InitTaskDirectory(jobId, taskId);
	taskAttemptDirectory = InitTaskAttemptDirectory(jobId, taskId);

	partitionFileArray = OpenPartitionFiles(taskAttemptDirectory, fileCount);
	FileBufferSizeInBytes = FileBufferSize(PartitionBufferSize, fileCount);

	/* call the partitioning function that does the actual work */
	FilterAndPartitionTable(filterQuery, partitionColumn, partitionColumnType,
							&RangePartitionId, (const void *) partitionContext,
							partitionFileArray, fileCount);

	/* close partition files and atomically rename (commit) them */
	ClosePartitionFiles(partitionFileArray, fileCount);
	RemoveDirectory(taskDirectory);
	RenameDirectory(taskAttemptDirectory, taskDirectory);

	PG_RETURN_VOID();
}
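

/*
 * Illustrative usage (added commentary, not part of the original file): the
 * PG_GETARG calls above imply a SQL signature along the lines of
 * worker_range_partition_table(bigint, int, text, text, oid, anyarray); the
 * exact registration lives in the extension's SQL script. With hypothetical
 * job id 42, task id 7, and an int8 partition column (type OID 20), a call
 * might look like:
 *
 *   SELECT worker_range_partition_table(42, 7,
 *              'SELECT * FROM lineitem',
 *              'l_orderkey', 20,
 *              ARRAY[1000, 2000, 3000]::int8[]);
 *
 * Three split points yield four partition files: bucket 0 holds values below
 * 1000, bucket 1 holds [1000, 2000), bucket 2 holds [2000, 3000), and
 * bucket 3 holds everything at or above 3000.
 */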


/*
 * worker_hash_partition_table executes the given filter query, repartitions the
 * filter query's results on a partitioning column, and writes the resulting
 * rows to a set of text files on local disk. The function then atomically
 * renames the directory in which the text files live to ensure deterministic
 * behavior.
 *
 * This function applies hash partitioning through the use of a function pointer
 * and a hash context object; for details, see HashPartitionId().
 */
Datum
worker_hash_partition_table(PG_FUNCTION_ARGS)
{
	uint64 jobId = PG_GETARG_INT64(0);
	uint32 taskId = PG_GETARG_UINT32(1);
	text *filterQueryText = PG_GETARG_TEXT_P(2);
	text *partitionColumnText = PG_GETARG_TEXT_P(3);
	Oid partitionColumnType = PG_GETARG_OID(4);
	uint32 partitionCount = PG_GETARG_UINT32(5);

	const char *filterQuery = text_to_cstring(filterQueryText);
	const char *partitionColumn = text_to_cstring(partitionColumnText);

	HashPartitionContext *partitionContext = NULL;
	FmgrInfo *hashFunction = NULL;
	StringInfo taskDirectory = NULL;
	StringInfo taskAttemptDirectory = NULL;
	FileOutputStream *partitionFileArray = NULL;
	uint32 fileCount = partitionCount;

	/* use column's type information to get the hashing function */
	hashFunction = GetFunctionInfo(partitionColumnType, HASH_AM_OID, HASHPROC);

	/* create hash partition context object */
	partitionContext = palloc0(sizeof(HashPartitionContext));
	partitionContext->hashFunction = hashFunction;
	partitionContext->partitionCount = partitionCount;

	/* init directories and files to write the partitioned data to */
	taskDirectory = InitTaskDirectory(jobId, taskId);
	taskAttemptDirectory = InitTaskAttemptDirectory(jobId, taskId);

	partitionFileArray = OpenPartitionFiles(taskAttemptDirectory, fileCount);
	FileBufferSizeInBytes = FileBufferSize(PartitionBufferSize, fileCount);

	/* call the partitioning function that does the actual work */
	FilterAndPartitionTable(filterQuery, partitionColumn, partitionColumnType,
							&HashPartitionId, (const void *) partitionContext,
							partitionFileArray, fileCount);

	/* close partition files and atomically rename (commit) them */
	ClosePartitionFiles(partitionFileArray, fileCount);
	RemoveDirectory(taskDirectory);
	RenameDirectory(taskAttemptDirectory, taskDirectory);

	PG_RETURN_VOID();
}
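

/*
 * Illustrative usage (added commentary, not part of the original file):
 * mirroring the range example above, and assuming the registered SQL
 * signature is worker_hash_partition_table(bigint, int, text, text, oid,
 * int), a hypothetical call splitting the filter query's output into 8 hash
 * buckets on an int8 column could be:
 *
 *   SELECT worker_hash_partition_table(42, 7,
 *              'SELECT * FROM lineitem',
 *              'l_orderkey', 20, 8);
 *
 * Each output row then lands in file (hashValue % 8); see HashPartitionId().
 */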


/*
 * GetFunctionInfo first resolves the operator for the given data type, access
 * method, and support procedure. The function then uses the resolved operator's
 * identifier to fill in a function manager object, and returns this object.
 */
FmgrInfo *
GetFunctionInfo(Oid typeId, Oid accessMethodId, int16 procedureId)
{
	FmgrInfo *functionInfo = (FmgrInfo *) palloc0(sizeof(FmgrInfo));

	/* get default operator class from pg_opclass for datum type */
	Oid operatorClassId = GetDefaultOpClass(typeId, accessMethodId);

	Oid operatorFamilyId = get_opclass_family(operatorClassId);
	Oid operatorClassInputType = get_opclass_input_type(operatorClassId);

	Oid operatorId = get_opfamily_proc(operatorFamilyId, operatorClassInputType,
									   operatorClassInputType, procedureId);

	if (operatorId == InvalidOid)
	{
		ereport(ERROR, (errmsg("could not find function for data typeId %u", typeId)));
	}

	/* fill in the FmgrInfo struct using the operatorId */
	fmgr_info(operatorId, functionInfo);

	return functionInfo;
}


/*
 * DeconstructArrayObject takes in a single dimensional array, and deserializes
 * this array's members into an array of datum objects. The function then
 * returns this datum array.
 */
Datum *
DeconstructArrayObject(ArrayType *arrayObject)
{
	Datum *datumArray = NULL;
	bool *datumArrayNulls = NULL;
	int datumArrayLength = 0;

	Oid typeId = InvalidOid;
	bool typeByVal = false;
	char typeAlign = 0;
	int16 typeLength = 0;

	bool arrayHasNull = ARR_HASNULL(arrayObject);
	if (arrayHasNull)
	{
		ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
						errmsg("worker array object cannot contain null values")));
	}

	typeId = ARR_ELEMTYPE(arrayObject);
	get_typlenbyvalalign(typeId, &typeLength, &typeByVal, &typeAlign);

	deconstruct_array(arrayObject, typeId, typeLength, typeByVal, typeAlign,
					  &datumArray, &datumArrayNulls, &datumArrayLength);

	return datumArray;
}


/*
 * ArrayObjectCount takes in a single dimensional array, and returns the number
 * of elements in this array.
 */
int32
ArrayObjectCount(ArrayType *arrayObject)
{
	int32 dimensionCount = ARR_NDIM(arrayObject);
	int32 *dimensionLengthArray = ARR_DIMS(arrayObject);
	int32 arrayLength = 0;

	/* we currently allow split point arrays to have only one subarray */
	Assert(dimensionCount == 1);

	arrayLength = ArrayGetNItems(dimensionCount, dimensionLengthArray);
	if (arrayLength <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
						errmsg("worker array object cannot be empty")));
	}

	return arrayLength;
}


/*
 * InitTaskDirectory creates a job and task directory using given identifiers,
 * if these directories do not already exist. The function then returns the task
 * directory's name.
 */
StringInfo
InitTaskDirectory(uint64 jobId, uint32 taskId)
{
	bool jobDirectoryExists = false;
	bool taskDirectoryExists = false;

	/*
	 * If the task tracker assigned this task (regular case), the tracker should
	 * have already created the job directory.
	 */
	StringInfo jobDirectoryName = JobDirectoryName(jobId);
	StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);

	LockJobResource(jobId, AccessExclusiveLock);

	jobDirectoryExists = DirectoryExists(jobDirectoryName);
	if (!jobDirectoryExists)
	{
		CreateDirectory(jobDirectoryName);
	}

	taskDirectoryExists = DirectoryExists(taskDirectoryName);
	if (!taskDirectoryExists)
	{
		CreateDirectory(taskDirectoryName);
	}

	UnlockJobResource(jobId, AccessExclusiveLock);

	return taskDirectoryName;
}


/*
 * InitTaskAttemptDirectory finds a task attempt directory that is not taken,
 * and creates that directory. The function then returns the task attempt
 * directory's name.
 */
static StringInfo
InitTaskAttemptDirectory(uint64 jobId, uint32 taskId)
{
	StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
	uint32 randomId = (uint32) random();

	/*
	 * We should have only one process executing this task. Still, we append a
	 * random id just in case.
	 */
	StringInfo taskAttemptDirectoryName = makeStringInfo();
	appendStringInfo(taskAttemptDirectoryName, "%s_%0*u",
					 taskDirectoryName->data, MIN_TASK_FILENAME_WIDTH, randomId);

	/*
	 * If this task previously failed, and gets re-executed and improbably draws
	 * the same randomId, the task will fail to create the directory.
	 */
	CreateDirectory(taskAttemptDirectoryName);

	return taskAttemptDirectoryName;
}
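

/*
 * Added example (values hypothetical): if the task directory resolves to
 * ".../task_000007" and the random draw is 1234, then with
 * MIN_TASK_FILENAME_WIDTH assumed to pad to six digits, the attempt directory
 * becomes ".../task_000007_001234". Once the attempt commits, it is renamed
 * over the task directory; see the RenameDirectory() calls above.
 */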


/* Calculates and returns the buffer size to use for each file. */
static uint32
FileBufferSize(int partitionBufferSizeInKB, uint32 fileCount)
{
	double partitionBufferSize = (double) partitionBufferSizeInKB * 1024.0;
	uint32 fileBufferSize = (uint32) rint(partitionBufferSize / fileCount);

	return fileBufferSize;
}
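

/*
 * Worked example (added commentary): with the default PartitionBufferSize of
 * 16384 KB and 32 partition files, each file receives
 * rint(16384 * 1024.0 / 32) = 524288 bytes (512 KB) of buffer space before
 * FileOutputStreamWrite() forces a flush.
 */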


/*
 * OpenPartitionFiles takes in a directory name and file count, and opens new
 * partition files in this directory. The names for these new files are modeled
 * after Hadoop's naming conventions for map files. These file names, virtual
 * file descriptors, and file buffers are stored together in file output stream
 * objects. These objects are then returned in an array from this function.
 */
static FileOutputStream *
OpenPartitionFiles(StringInfo directoryName, uint32 fileCount)
{
	FileOutputStream *partitionFileArray = NULL;
	File fileDescriptor = 0;
	uint32 fileIndex = 0;
	const int fileFlags = (O_APPEND | O_CREAT | O_RDWR | PG_BINARY);
	const int fileMode = (S_IRUSR | S_IWUSR);

	partitionFileArray = palloc0(fileCount * sizeof(FileOutputStream));

	for (fileIndex = 0; fileIndex < fileCount; fileIndex++)
	{
		StringInfo filePath = PartitionFilename(directoryName, fileIndex);

		fileDescriptor = PathNameOpenFile(filePath->data, fileFlags, fileMode);
		if (fileDescriptor < 0)
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not open file \"%s\": %m", filePath->data)));
		}

		partitionFileArray[fileIndex].fileDescriptor = fileDescriptor;
		partitionFileArray[fileIndex].fileBuffer = makeStringInfo();
		partitionFileArray[fileIndex].filePath = filePath;
	}

	return partitionFileArray;
}


/*
 * ClosePartitionFiles walks over each file output stream object, and flushes
 * any remaining data in the file's buffer. The function then closes the file,
 * and deletes any allocated memory for the file stream object.
 */
static void
ClosePartitionFiles(FileOutputStream *partitionFileArray, uint32 fileCount)
{
	uint32 fileIndex = 0;
	for (fileIndex = 0; fileIndex < fileCount; fileIndex++)
	{
		FileOutputStream partitionFile = partitionFileArray[fileIndex];

		FileOutputStreamFlush(partitionFile);

		FileClose(partitionFile.fileDescriptor);
		FreeStringInfo(partitionFile.fileBuffer);
		FreeStringInfo(partitionFile.filePath);
	}

	pfree(partitionFileArray);
}


/* Constructs a standardized job directory path for the given job id. */
StringInfo
JobDirectoryName(uint64 jobId)
{
	/*
	 * We use the default tablespace in {datadir}/base. Further, we need to
	 * apply padding on our 64-bit job id, and hence can't use UINT64_FORMAT.
	 */
#ifdef HAVE_INTTYPES_H
	StringInfo jobDirectoryName = makeStringInfo();
	appendStringInfo(jobDirectoryName, "base/%s/%s%0*" PRIu64,
					 PG_JOB_CACHE_DIR, JOB_DIRECTORY_PREFIX,
					 MIN_JOB_DIRNAME_WIDTH, jobId);
#else
	StringInfo jobDirectoryName = makeStringInfo();
	appendStringInfo(jobDirectoryName, "base/%s/%s%0*llu",
					 PG_JOB_CACHE_DIR, JOB_DIRECTORY_PREFIX,
					 MIN_JOB_DIRNAME_WIDTH, jobId);
#endif

	return jobDirectoryName;
}
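

/*
 * Added example, assuming PG_JOB_CACHE_DIR is "pgsql_job_cache",
 * JOB_DIRECTORY_PREFIX is "job_", and MIN_JOB_DIRNAME_WIDTH pads to ten
 * digits: job id 42 maps to the data-directory-relative path
 * "base/pgsql_job_cache/job_0000000042".
 */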


/* Constructs a standardized task directory path for given job and task ids. */
StringInfo
TaskDirectoryName(uint64 jobId, uint32 taskId)
{
	StringInfo jobDirectoryName = JobDirectoryName(jobId);

	StringInfo taskDirectoryName = makeStringInfo();
	appendStringInfo(taskDirectoryName, "%s/%s%0*u",
					 jobDirectoryName->data,
					 TASK_FILE_PREFIX, MIN_TASK_FILENAME_WIDTH, taskId);

	return taskDirectoryName;
}


/* Constructs a standardized partition file path for given directory and id. */
StringInfo
PartitionFilename(StringInfo directoryName, uint32 partitionId)
{
	StringInfo partitionFilename = makeStringInfo();
	appendStringInfo(partitionFilename, "%s/%s%0*u",
					 directoryName->data,
					 PARTITION_FILE_PREFIX, MIN_PARTITION_FILENAME_WIDTH, partitionId);

	return partitionFilename;
}
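

/*
 * Added example chaining the three name builders above (prefixes and widths
 * assumed as in the JobDirectoryName() example): job 42, task 7, partition 3
 * would resolve to a path shaped like
 * "base/pgsql_job_cache/job_0000000042/task_000007/p_00003", with the actual
 * partition file prefix and padding taken from PARTITION_FILE_PREFIX and
 * MIN_PARTITION_FILENAME_WIDTH.
 */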


/*
 * JobDirectoryElement takes in a filename, and checks if this name lives in the
 * directory path that is used for task output files. Note that this function's
 * implementation is coupled with JobDirectoryName().
 */
bool
JobDirectoryElement(const char *filename)
{
	bool directoryElement = false;
	char *directoryPathFound = NULL;

	StringInfo directoryPath = makeStringInfo();
	appendStringInfo(directoryPath, "base/%s/%s", PG_JOB_CACHE_DIR, JOB_DIRECTORY_PREFIX);

	directoryPathFound = strstr(filename, directoryPath->data);
	if (directoryPathFound != NULL)
	{
		directoryElement = true;
	}

	pfree(directoryPath);

	return directoryElement;
}


/* Checks if a directory exists for the given directory name. */
bool
DirectoryExists(StringInfo directoryName)
{
	bool directoryExists = true;
	struct stat directoryStat;

	int statOK = stat(directoryName->data, &directoryStat);
	if (statOK == 0)
	{
		/* file already exists; just assert that it is a directory */
		Assert(S_ISDIR(directoryStat.st_mode));
	}
	else
	{
		if (errno == ENOENT)
		{
			directoryExists = false;
		}
		else
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not stat directory \"%s\": %m",
								   directoryName->data)));
		}
	}

	return directoryExists;
}


/* Creates a new directory with the given directory name. */
void
CreateDirectory(StringInfo directoryName)
{
	int makeOK = mkdir(directoryName->data, S_IRWXU);
	if (makeOK != 0)
	{
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not create directory \"%s\": %m",
							   directoryName->data)));
	}
}


/*
 * RemoveDirectory first checks if the given directory exists. If it does, the
 * function recursively deletes the contents of the given directory, and then
 * deletes the directory itself. This function is modeled on the Boost file
 * system library's remove_all() method.
 */
void
RemoveDirectory(StringInfo filename)
{
	struct stat fileStat;
	int removed = 0;

	int fileStated = stat(filename->data, &fileStat);
	if (fileStated < 0)
	{
		if (errno == ENOENT)
		{
			return;  /* if the file does not exist, return */
		}
		else
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not stat file \"%s\": %m", filename->data)));
		}
	}

	/*
	 * If this is a directory, iterate over all its contents and for each
	 * content, recurse into this function. Also, make sure that we do not
	 * recurse into symbolic links.
	 */
	if (S_ISDIR(fileStat.st_mode) && !S_ISLNK(fileStat.st_mode))
	{
		const char *directoryName = filename->data;
		struct dirent *directoryEntry = NULL;

		DIR *directory = AllocateDir(directoryName);
		if (directory == NULL)
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not open directory \"%s\": %m",
								   directoryName)));
		}

		directoryEntry = ReadDir(directory, directoryName);
		for (; directoryEntry != NULL; directoryEntry = ReadDir(directory, directoryName))
		{
			const char *baseFilename = directoryEntry->d_name;
			StringInfo fullFilename = NULL;

			/* if system file, skip it */
			if (strncmp(baseFilename, ".", MAXPGPATH) == 0 ||
				strncmp(baseFilename, "..", MAXPGPATH) == 0)
			{
				continue;
			}

			fullFilename = makeStringInfo();
			appendStringInfo(fullFilename, "%s/%s", directoryName, baseFilename);

			RemoveDirectory(fullFilename);

			FreeStringInfo(fullFilename);
		}

		FreeDir(directory);
	}

	/* we now have an empty directory or a regular file, remove it */
	if (S_ISDIR(fileStat.st_mode))
	{
		removed = rmdir(filename->data);
	}
	else
	{
		removed = unlink(filename->data);
	}

	if (removed != 0)
	{
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not remove file \"%s\": %m", filename->data)));
	}
}


/* Moves a directory from the old path to the new one. */
static void
RenameDirectory(StringInfo oldDirectoryName, StringInfo newDirectoryName)
{
	int renamed = rename(oldDirectoryName->data, newDirectoryName->data);
	if (renamed != 0)
	{
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not rename directory \"%s\" to \"%s\": %m",
							   oldDirectoryName->data, newDirectoryName->data)));
	}
}


/*
 * FileOutputStreamWrite appends given data to the file stream's internal
 * buffers. The function then checks if buffered data exceeds the preconfigured
 * buffer size; if so, the function flushes the buffer to the underlying file.
 */
static void
FileOutputStreamWrite(FileOutputStream file, StringInfo dataToWrite)
{
	StringInfo fileBuffer = file.fileBuffer;
	uint32 newBufferSize = fileBuffer->len + dataToWrite->len;

	appendBinaryStringInfo(fileBuffer, dataToWrite->data, dataToWrite->len);

	if (newBufferSize > FileBufferSizeInBytes)
	{
		FileOutputStreamFlush(file);

		resetStringInfo(fileBuffer);
	}
}
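

/*
 * Added note on the buffering behavior above: the size check uses the
 * post-append length, so a buffer sized at 524288 bytes holding 524000 bytes
 * will accept a 1000-byte row, grow to 525000 bytes, and only then flush the
 * whole buffer in a single FileWrite() call. Rows are therefore never split
 * across flushes, and the buffer can briefly overshoot the configured size
 * by up to one row.
 */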


/* Flushes data buffered in the file stream object to the underlying file. */
static void
FileOutputStreamFlush(FileOutputStream file)
{
	StringInfo fileBuffer = file.fileBuffer;
	int written = 0;

	errno = 0;
	written = FileWrite(file.fileDescriptor, fileBuffer->data, fileBuffer->len);
	if (written != fileBuffer->len)
	{
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not write %d bytes to partition file \"%s\"",
							   fileBuffer->len, file.filePath->data)));
	}
}


/*
 * FilterAndPartitionTable executes a given SQL query, and iterates over query
 * results in a read-only fashion. For each resulting row, the function applies
 * the partitioning function and determines the partition identifier. Then, the
 * function chooses the partition file corresponding to this identifier, and
 * serializes the row into this file using the copy command's text or binary
 * format.
 */
static void
FilterAndPartitionTable(const char *filterQuery,
						const char *partitionColumnName, Oid partitionColumnType,
						uint32 (*PartitionIdFunction)(Datum, const void *),
						const void *partitionIdContext,
						FileOutputStream *partitionFileArray,
						uint32 fileCount)
{
	PartialCopyState rowOutputState = NULL;
	FmgrInfo *columnOutputFunctions = NULL;
	int partitionColumnIndex = 0;
	Oid partitionColumnTypeId = InvalidOid;
	Portal queryPortal = NULL;
	int connected = 0;
	int finished = 0;

	const char *noPortalName = NULL;
	const bool readOnly = true;
	const bool fetchForward = true;
	const int noCursorOptions = 0;
	const int prefetchCount = ROW_PREFETCH_COUNT;

	connected = SPI_connect();
	if (connected != SPI_OK_CONNECT)
	{
		ereport(ERROR, (errmsg("could not connect to SPI manager")));
	}

	queryPortal = SPI_cursor_open_with_args(noPortalName, filterQuery,
											0, NULL, NULL, NULL, /* no arguments */
											readOnly, noCursorOptions);
	if (queryPortal == NULL)
	{
		ereport(ERROR, (errmsg("could not open implicit cursor for query \"%s\"",
							   filterQuery)));
	}

	rowOutputState = InitRowOutputState();

	SPI_cursor_fetch(queryPortal, fetchForward, prefetchCount);
	if (SPI_processed > 0)
	{
		TupleDesc rowDescriptor = SPI_tuptable->tupdesc;
		partitionColumnIndex = ColumnIndex(rowDescriptor, partitionColumnName);

		partitionColumnTypeId = SPI_gettypeid(rowDescriptor, partitionColumnIndex);
		if (partitionColumnType != partitionColumnTypeId)
		{
			ereport(ERROR, (errmsg("partition column types %u and %u do not match",
								   partitionColumnTypeId, partitionColumnType)));
		}

		columnOutputFunctions = ColumnOutputFunctions(rowDescriptor,
													  rowOutputState->binary);
	}

	if (BinaryWorkerCopyFormat)
	{
		OutputBinaryHeaders(partitionFileArray, fileCount);
	}

	while (SPI_processed > 0)
	{
		int rowIndex = 0;
		for (rowIndex = 0; rowIndex < SPI_processed; rowIndex++)
		{
			HeapTuple row = SPI_tuptable->vals[rowIndex];
			TupleDesc rowDescriptor = SPI_tuptable->tupdesc;
			FileOutputStream partitionFile = { 0, 0, 0 };
			StringInfo rowText = NULL;
			Datum partitionKey = 0;
			bool partitionKeyNull = false;
			uint32 partitionId = 0;

			partitionKey = SPI_getbinval(row, rowDescriptor,
										 partitionColumnIndex, &partitionKeyNull);

			/*
			 * If we have a partition key, we compute its bucket. Else if we have
			 * a null key, we then put this tuple into the 0th bucket. Note that
			 * the 0th bucket may hold other tuples as well, such as tuples whose
			 * partition keys hash to the value 0.
			 */
			if (!partitionKeyNull)
			{
				partitionId = (*PartitionIdFunction)(partitionKey, partitionIdContext);
			}
			else
			{
				partitionId = 0;
			}

			OutputRow(row, rowDescriptor, rowOutputState, columnOutputFunctions);
			rowText = rowOutputState->fe_msgbuf;

			partitionFile = partitionFileArray[partitionId];
			FileOutputStreamWrite(partitionFile, rowText);

			resetStringInfo(rowText);
		}

		SPI_freetuptable(SPI_tuptable);

		SPI_cursor_fetch(queryPortal, fetchForward, prefetchCount);
	}

	SPI_cursor_close(queryPortal);

	if (BinaryWorkerCopyFormat)
	{
		OutputBinaryFooters(partitionFileArray, fileCount);
	}

	/* delete row output memory context */
	ClearRowOutputState(rowOutputState);

	finished = SPI_finish();
	if (finished != SPI_OK_FINISH)
	{
		ereport(ERROR, (errmsg("could not disconnect from SPI manager")));
	}
}


/*
 * Determines the column number for the given column name. The column number
 * count starts at 1.
 */
static int
ColumnIndex(TupleDesc rowDescriptor, const char *columnName)
{
	int columnIndex = SPI_fnumber(rowDescriptor, columnName);
	if (columnIndex == SPI_ERROR_NOATTRIBUTE)
	{
		ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN),
						errmsg("could not find column name \"%s\"", columnName)));
	}

	Assert(columnIndex >= 1);
	return columnIndex;
}


/*
 * ColumnOutputFunctions walks over a table's columns, and finds each column's
 * type information. The function then resolves each type's output function,
 * and stores and returns these output functions in an array.
 */
static FmgrInfo *
ColumnOutputFunctions(TupleDesc rowDescriptor, bool binaryFormat)
{
	uint32 columnCount = (uint32) rowDescriptor->natts;
	FmgrInfo *columnOutputFunctions = palloc0(columnCount * sizeof(FmgrInfo));

	uint32 columnIndex = 0;
	for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
	{
		FmgrInfo *currentOutputFunction = &columnOutputFunctions[columnIndex];
		Form_pg_attribute currentColumn = rowDescriptor->attrs[columnIndex];
		Oid columnTypeId = currentColumn->atttypid;
		Oid outputFunctionId = InvalidOid;
		bool typeVariableLength = false;

		if (binaryFormat)
		{
			getTypeBinaryOutputInfo(columnTypeId, &outputFunctionId, &typeVariableLength);
		}
		else
		{
			getTypeOutputInfo(columnTypeId, &outputFunctionId, &typeVariableLength);
		}

		Assert(currentColumn->attisdropped == false);

		fmgr_info(outputFunctionId, currentOutputFunction);
	}

	return columnOutputFunctions;
}


/*
 * InitRowOutputState creates and initializes a copy state object. This object
 * is internal to the copy command's implementation in Postgres; and we refactor
 * and refer to it here to avoid code duplication. We also only initialize the
 * fields needed for writing row data to text files, and skip the other fields.
 *
 * Note that the default field values used in commands/copy.c and this function
 * must match one another. Therefore, any changes to the default values in the
 * copy command must be propagated to this function.
 */
static PartialCopyState
InitRowOutputState(void)
{
	PartialCopyState rowOutputState =
		(PartialCopyState) palloc0(sizeof(PartialCopyStateData));

	int fileEncoding = pg_get_client_encoding();
	int databaseEncoding = GetDatabaseEncoding();
	int databaseEncodingMaxLength = pg_database_encoding_max_length();

	/* initialize defaults for printing null values */
	char *nullPrint = pstrdup("\\N");
	int nullPrintLen = strlen(nullPrint);
	char *nullPrintClient = pg_server_to_any(nullPrint, nullPrintLen, fileEncoding);

	/* set default text output characters */
	rowOutputState->null_print = nullPrint;
	rowOutputState->null_print_client = nullPrintClient;
	rowOutputState->delim = pstrdup("\t");

	rowOutputState->binary = BinaryWorkerCopyFormat;

	/* set encoding conversion information */
	rowOutputState->file_encoding = fileEncoding;

	if (PG_ENCODING_IS_CLIENT_ONLY(fileEncoding))
	{
		ereport(ERROR, (errmsg("cannot repartition into encoding caller cannot "
							   "receive")));
	}

	/* set up transcoding information and default text output characters */
	if ((fileEncoding != databaseEncoding) || (databaseEncodingMaxLength > 1))
	{
		rowOutputState->need_transcoding = true;
	}
	else
	{
		rowOutputState->need_transcoding = false;
	}

	/*
	 * Create a temporary memory context that we can reset once per row to
	 * recover palloc'd memory. This avoids any problems with leaks inside data
	 * type output routines, and should be faster than retail pfree's anyway.
	 */
	rowOutputState->rowcontext = AllocSetContextCreate(CurrentMemoryContext,
													   "WorkerRowOutputContext",
													   ALLOCSET_DEFAULT_MINSIZE,
													   ALLOCSET_DEFAULT_INITSIZE,
													   ALLOCSET_DEFAULT_MAXSIZE);

	/* allocate the message buffer to use for serializing a row */
	rowOutputState->fe_msgbuf = makeStringInfo();

	return rowOutputState;
}


/* Clears copy state used for outputting row data. */
static void
ClearRowOutputState(PartialCopyState rowOutputState)
{
	Assert(rowOutputState != NULL);

	MemoryContextDelete(rowOutputState->rowcontext);

	FreeStringInfo(rowOutputState->fe_msgbuf);

	pfree(rowOutputState->null_print_client);
	pfree(rowOutputState->delim);

	pfree(rowOutputState);
}


/*
 * OutputRow serializes one row using the column output functions,
 * and appends the data to the row output state object's message buffer.
 * This function is modeled after the CopyOneRowTo() function in
 * commands/copy.c, but only implements a subset of that functionality.
 */
static void
OutputRow(HeapTuple row, TupleDesc rowDescriptor,
		  PartialCopyState rowOutputState, FmgrInfo *columnOutputFunctions)
{
	MemoryContext oldContext = NULL;
	uint32 columnIndex = 0;

	uint32 columnCount = (uint32) rowDescriptor->natts;
	Datum *valueArray = (Datum *) palloc0(columnCount * sizeof(Datum));
	bool *isNullArray = (bool *) palloc0(columnCount * sizeof(bool));

	/* deconstruct the tuple; this is faster than repeated heap_getattr */
	heap_deform_tuple(row, rowDescriptor, valueArray, isNullArray);

	/* reset previous tuple's output data, and the temporary memory context */
	resetStringInfo(rowOutputState->fe_msgbuf);

	MemoryContextReset(rowOutputState->rowcontext);
	oldContext = MemoryContextSwitchTo(rowOutputState->rowcontext);

	if (rowOutputState->binary)
	{
		CopySendInt16(rowOutputState, rowDescriptor->natts);
	}

	for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
	{
		Datum value = valueArray[columnIndex];
		bool isNull = isNullArray[columnIndex];
		bool lastColumn = false;

		if (rowOutputState->binary)
		{
			if (!isNull)
			{
				FmgrInfo *outputFunctionPointer = &columnOutputFunctions[columnIndex];
				bytea *outputBytes = SendFunctionCall(outputFunctionPointer, value);

				CopySendInt32(rowOutputState, VARSIZE(outputBytes) - VARHDRSZ);
				CopySendData(rowOutputState, VARDATA(outputBytes),
							 VARSIZE(outputBytes) - VARHDRSZ);
			}
			else
			{
				CopySendInt32(rowOutputState, -1);
			}
		}
		else
		{
			if (!isNull)
			{
				FmgrInfo *outputFunctionPointer = &columnOutputFunctions[columnIndex];
				char *columnText = OutputFunctionCall(outputFunctionPointer, value);

				CopyAttributeOutText(rowOutputState, columnText);
			}
			else
			{
				CopySendString(rowOutputState, rowOutputState->null_print_client);
			}

			lastColumn = ((columnIndex + 1) == columnCount);
			if (!lastColumn)
			{
				CopySendChar(rowOutputState, rowOutputState->delim[0]);
			}
		}
	}

	if (!rowOutputState->binary)
	{
		/* append default line termination string depending on the platform */
#ifndef WIN32
		CopySendChar(rowOutputState, '\n');
#else
		CopySendString(rowOutputState, "\r\n");
#endif
	}

	MemoryContextSwitchTo(oldContext);

	pfree(valueArray);
	pfree(isNullArray);
}
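

/*
 * Worked example of the text serialization path above (added commentary): a
 * row with values (1, 'foo', NULL) is emitted as the bytes
 *
 *   1 <TAB> foo <TAB> \N <NEWLINE>
 *
 * i.e. column texts joined by the tab delimiter, NULL rendered via the
 * null_print_client string ("\N"), and a platform-dependent line terminator
 * appended at the end.
 */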


/*
 * Write the header of postgres' binary serialization format to each partition
 * file. This function is used when binary_worker_copy_format is enabled.
 */
static void
OutputBinaryHeaders(FileOutputStream *partitionFileArray, uint32 fileCount)
{
	uint32 fileIndex = 0;
	for (fileIndex = 0; fileIndex < fileCount; fileIndex++)
	{
		/* Generate header for a binary copy */
		const int32 zero = 0;
		FileOutputStream partitionFile = { 0, 0, 0 };
		PartialCopyStateData headerOutputStateData;
		PartialCopyState headerOutputState = &headerOutputStateData;

		memset(headerOutputState, 0, sizeof(PartialCopyStateData));
		headerOutputState->fe_msgbuf = makeStringInfo();

		/* Signature */
		CopySendData(headerOutputState, BinarySignature, 11);

		/* Flags field (no OIDs) */
		CopySendInt32(headerOutputState, zero);

		/* No header extension */
		CopySendInt32(headerOutputState, zero);

		partitionFile = partitionFileArray[fileIndex];
		FileOutputStreamWrite(partitionFile, headerOutputState->fe_msgbuf);
	}
}
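

/*
 * Added note: the 19 bytes emitted per file above follow PostgreSQL's binary
 * COPY format -- an 11-byte signature ("PGCOPY\n\377\r\n\0"), a 4-byte flags
 * field (zero, since we send no OIDs), and a 4-byte header extension length
 * (also zero).
 */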


/*
 * Write the footer of postgres' binary serialization format to each partition
 * file. This function is used when binary_worker_copy_format is enabled.
 */
static void
OutputBinaryFooters(FileOutputStream *partitionFileArray, uint32 fileCount)
{
	uint32 fileIndex = 0;
	for (fileIndex = 0; fileIndex < fileCount; fileIndex++)
	{
		/* Generate footer for a binary copy */
		int16 negative = -1;
		FileOutputStream partitionFile = { 0, 0, 0 };
		PartialCopyStateData footerOutputStateData;
		PartialCopyState footerOutputState = &footerOutputStateData;

		memset(footerOutputState, 0, sizeof(PartialCopyStateData));
		footerOutputState->fe_msgbuf = makeStringInfo();

		CopySendInt16(footerOutputState, negative);

		partitionFile = partitionFileArray[fileIndex];
		FileOutputStreamWrite(partitionFile, footerOutputState->fe_msgbuf);
	}
}


/* *INDENT-OFF* */
/* Append data to the copy buffer in outputState */
static void
CopySendData(PartialCopyState outputState, const void *databuf, int datasize)
{
	appendBinaryStringInfo(outputState->fe_msgbuf, databuf, datasize);
}


/* Append a string to the copy buffer in outputState. */
static void
CopySendString(PartialCopyState outputState, const char *str)
{
	appendBinaryStringInfo(outputState->fe_msgbuf, str, strlen(str));
}


/* Append a char to the copy buffer in outputState. */
static void
CopySendChar(PartialCopyState outputState, char c)
{
	appendStringInfoCharMacro(outputState->fe_msgbuf, c);
}


/* Append an int32 to the copy buffer in outputState. */
static void
CopySendInt32(PartialCopyState outputState, int32 val)
{
	uint32 buf = htonl((uint32) val);
	CopySendData(outputState, &buf, sizeof(buf));
}


/* Append an int16 to the copy buffer in outputState. */
static void
CopySendInt16(PartialCopyState outputState, int16 val)
{
	uint16 buf = htons((uint16) val);
	CopySendData(outputState, &buf, sizeof(buf));
}


/*
 * Send text representation of one column, with conversion and escaping.
 *
 * NB: This function is based on commands/copy.c and doesn't fully conform to
 * our coding style. The function should be kept in sync with copy.c.
 */
static void
CopyAttributeOutText(PartialCopyState cstate, char *string)
{
	char *pointer = NULL;
	char *start = NULL;
	char c = '\0';
	char delimc = cstate->delim[0];

	if (cstate->need_transcoding)
	{
		pointer = pg_server_to_any(string, strlen(string), cstate->file_encoding);
	}
	else
	{
		pointer = string;
	}

	/*
	 * We have to grovel through the string searching for control characters
	 * and instances of the delimiter character. In most cases, though, these
	 * are infrequent. To avoid overhead from calling CopySendData once per
	 * character, we dump out all characters between escaped characters in a
	 * single call. The loop invariant is that the data from "start" to
	 * "pointer" can be sent literally, but hasn't yet been.
	 *
	 * As all encodings here are safe, i.e. backend supported ones, we can
	 * skip doing pg_encoding_mblen(), because in valid backend encodings,
	 * extra bytes of a multibyte character never look like ASCII.
	 */
	start = pointer;
	while ((c = *pointer) != '\0')
	{
		if ((unsigned char) c < (unsigned char) 0x20)
		{
			/*
			 * \r and \n must be escaped, the others are traditional. We
			 * prefer to dump these using the C-like notation, rather than
			 * a backslash and the literal character, because it makes the
			 * dump file a bit more proof against Microsoftish data
			 * mangling.
			 */
			switch (c)
			{
				case '\b':
					c = 'b';
					break;
				case '\f':
					c = 'f';
					break;
				case '\n':
					c = 'n';
					break;
				case '\r':
					c = 'r';
					break;
				case '\t':
					c = 't';
					break;
				case '\v':
					c = 'v';
					break;
				default:
					/* If it's the delimiter, must backslash it */
					if (c == delimc)
						break;
					/* All ASCII control chars are length 1 */
					pointer++;
					continue; /* fall to end of loop */
			}
			/* if we get here, we need to convert the control char */
			CopyFlushOutput(cstate, start, pointer);
			CopySendChar(cstate, '\\');
			CopySendChar(cstate, c);
			start = ++pointer; /* do not include char in next run */
		}
		else if (c == '\\' || c == delimc)
		{
			CopyFlushOutput(cstate, start, pointer);
			CopySendChar(cstate, '\\');
			start = pointer++; /* we include char in next run */
		}
		else
		{
			pointer++;
		}
	}

	CopyFlushOutput(cstate, start, pointer);
}
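

/*
 * Escaping example (added commentary): with the default tab delimiter, an
 * attribute value containing a tab, a newline, and a backslash -- say
 * "a<TAB>b<NEWLINE>c\d" -- is sent as "a\tb\nc\\d". Control characters
 * become C-style escapes and literal backslashes are doubled, so data bytes
 * can never be mistaken for the column delimiter or the row terminator.
 */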


/* *INDENT-ON* */

/* Helper function to send pending copy output */
static inline void
CopyFlushOutput(PartialCopyState cstate, char *start, char *pointer)
{
	if (pointer > start)
	{
		CopySendData(cstate, start, pointer - start);
	}
}


/* Helper function that invokes a function with the default collation oid. */
Datum
CompareCall2(FmgrInfo *functionInfo, Datum leftArgument, Datum rightArgument)
{
	Datum result = FunctionCall2Coll(functionInfo, DEFAULT_COLLATION_OID,
									 leftArgument, rightArgument);
	return result;
}


/*
 * RangePartitionId determines the partition number for the given data value
 * by applying range partitioning. More specifically, the function takes in a
 * data value and an array of sorted split points, and performs a binary search
 * within that array to determine the bucket the data value falls into. The
 * function then returns that bucket number.
 *
 * Note that we employ a version of binary search known as upper_bound; this
 * ensures that all null values fall into the zeroth bucket and that we maintain
 * full compatibility with the semantics of Hadoop's TotalOrderPartitioner.
 */
static uint32
RangePartitionId(Datum partitionValue, const void *context)
{
	RangePartitionContext *rangePartitionContext = (RangePartitionContext *) context;
	FmgrInfo *comparisonFunction = rangePartitionContext->comparisonFunction;
	Datum *pointArray = rangePartitionContext->splitPointArray;
	int32 currentLength = rangePartitionContext->splitPointCount;
	int32 halfLength = 0;
	uint32 firstIndex = 0;

	/*
	 * We implement a binary search variant known as upper_bound. This variant
	 * gives us the semantics we need for partitioned joins; and is also used by
	 * Hadoop's TotalOrderPartitioner. To implement this variant, we rely on SGI
	 * STL v3.3's source code for upper_bound(). Note that elements in the point
	 * array cannot be null.
	 */
	while (currentLength > 0)
	{
		uint32 middleIndex = 0;
		Datum middlePoint = 0;
		Datum comparisonDatum = 0;
		int comparisonResult = 0;

		halfLength = currentLength >> 1;
		middleIndex = firstIndex;
		middleIndex += halfLength;

		middlePoint = pointArray[middleIndex];

		comparisonDatum = CompareCall2(comparisonFunction, partitionValue, middlePoint);
		comparisonResult = DatumGetInt32(comparisonDatum);

		/* if partition value is less than middle point */
		if (comparisonResult < 0)
		{
			currentLength = halfLength;
		}
		else
		{
			firstIndex = middleIndex;
			firstIndex++;
			currentLength = currentLength - halfLength - 1;
		}
	}

	return firstIndex;
}
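

/*
 * Worked example of the upper_bound search above (added commentary): with
 * split points [10, 20, 30], value 5 maps to bucket 0, value 10 to bucket 1
 * (values equal to a split point go right), value 25 to bucket 2, and value
 * 35 to bucket 3 -- one more bucket than there are split points, matching
 * the fileCount = splitPointCount + 1 logic in worker_range_partition_table().
 */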


/*
 * HashPartitionId determines the partition number for the given data value
 * using hash partitioning. More specifically, the function returns zero if the
 * given data value is null. If not, the function applies the standard Postgres
 * hashing function for the given data type, and mods the hashed result with the
 * number of partitions. The function then returns the modded number as the
 * partition number.
 *
 * Note that any changes to PostgreSQL's hashing functions will reshuffle the
 * entire distribution created by this function. For a discussion of this issue,
 * see Google "PL/Proxy Users: Hash Functions Have Changed in PostgreSQL 8.4."
 */
static uint32
HashPartitionId(Datum partitionValue, const void *context)
{
	HashPartitionContext *hashPartitionContext = (HashPartitionContext *) context;
	FmgrInfo *hashFunction = hashPartitionContext->hashFunction;
	uint32 partitionCount = hashPartitionContext->partitionCount;
	Datum hashDatum = 0;
	uint32 hashResult = 0;
	uint32 hashPartitionId = 0;

	/* hash functions return unsigned 32-bit integers */
	hashDatum = FunctionCall1(hashFunction, partitionValue);
	hashResult = DatumGetUInt32(hashDatum);
	hashPartitionId = (hashResult % partitionCount);

	return hashPartitionId;
}
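

/*
 * Worked example (added commentary): with partitionCount = 8, a value whose
 * hash comes out as 0xDEADBEEF (3735928559) maps to partition
 * 3735928559 % 8 = 7. NULL partition keys are routed to bucket 0 by
 * FilterAndPartitionTable() before this function is ever called.
 */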