/*-------------------------------------------------------------------------
 *
 * worker_partition_protocol.c
 *
 * Routines for partitioning table data into multiple files. Table partitioning
 * is one of the three distributed execution primitives that we apply on worker
 * nodes; and when partitioning data, we follow Hadoop's naming conventions as
 * much as possible.
 *
 * Copyright (c) 2012, Citus Data, Inc.
 *
 * $Id$
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"
#include "funcapi.h"

#include <arpa/inet.h>
#include <math.h>

#ifdef HAVE_INTTYPES_H
#include <inttypes.h>
#endif

#include "access/hash.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "catalog/pg_collation.h"
#include "commands/copy.h"
#include "commands/defrem.h"
#include "distributed/resource_lock.h"
#include "distributed/transmit.h"
#include "distributed/worker_protocol.h"
#include "executor/spi.h"
#include "mb/pg_wchar.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"


/* Config variables managed via guc.c */
bool BinaryWorkerCopyFormat = false; /* binary format for copying between workers */
int PartitionBufferSize = 16384;     /* total partitioning buffer size in KB */

/* Local variables */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
static uint32 FileBufferSizeInBytes = 0; /* file buffer size to init later */


/* Local functions forward declarations */
static StringInfo InitTaskAttemptDirectory(uint64 jobId, uint32 taskId);
static uint32 FileBufferSize(int partitionBufferSizeInKB, uint32 fileCount);
static FileOutputStream * OpenPartitionFiles(StringInfo directoryName,
											 uint32 fileCount);
static void ClosePartitionFiles(FileOutputStream *partitionFileArray,
								uint32 fileCount);
static void RenameDirectory(StringInfo oldDirectoryName,
							StringInfo newDirectoryName);
static void FileOutputStreamWrite(FileOutputStream file, StringInfo dataToWrite);
static void FileOutputStreamFlush(FileOutputStream file);
static void FilterAndPartitionTable(const char *filterQuery,
									const char *columnName, Oid columnType,
									uint32 (*PartitionIdFunction)(Datum, const void *),
									const void *partitionIdContext,
									FileOutputStream *partitionFileArray,
									uint32 fileCount);
static int ColumnIndex(TupleDesc rowDescriptor, const char *columnName);
static FmgrInfo * ColumnOutputFunctions(TupleDesc rowDescriptor, bool binaryFormat);
static PartialCopyState InitRowOutputState(void);
static void ClearRowOutputState(PartialCopyState copyState);
static void OutputRow(HeapTuple row, TupleDesc rowDescriptor,
					  PartialCopyState rowOutputState,
					  FmgrInfo *columnOutputFunctions);
static void OutputBinaryHeaders(FileOutputStream *partitionFileArray,
								uint32 fileCount);
static void OutputBinaryFooters(FileOutputStream *partitionFileArray,
								uint32 fileCount);
static void CopySendData(PartialCopyState outputState, const void *databuf,
						 int datasize);
static void CopySendString(PartialCopyState outputState, const char *str);
static void CopySendChar(PartialCopyState outputState, char c);
static void CopySendInt32(PartialCopyState outputState, int32 val);
static void CopySendInt16(PartialCopyState outputState, int16 val);
static void CopyAttributeOutText(PartialCopyState outputState, char *string);
static inline void CopyFlushOutput(PartialCopyState outputState, char *start,
								   char *pointer);
static uint32 RangePartitionId(Datum partitionValue, const void *context);
static uint32 HashPartitionId(Datum partitionValue, const void *context);


/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(worker_range_partition_table);
PG_FUNCTION_INFO_V1(worker_hash_partition_table);


/*
 * worker_range_partition_table executes the given filter query, repartitions
 * the filter query's results on a partitioning column, and writes the resulting
 * rows to a set of text files on local disk. The function then atomically
 * renames the directory in which the text files live to ensure deterministic
 * behavior.
 *
 * This function applies range partitioning through the use of a function
 * pointer and a range context object; for details, see RangePartitionId().
 */
Datum
worker_range_partition_table(PG_FUNCTION_ARGS)
{
	uint64 jobId = PG_GETARG_INT64(0);
	uint32 taskId = PG_GETARG_UINT32(1);
	text *filterQueryText = PG_GETARG_TEXT_P(2);
	text *partitionColumnText = PG_GETARG_TEXT_P(3);
	Oid partitionColumnType = PG_GETARG_OID(4);
	ArrayType *splitPointObject = PG_GETARG_ARRAYTYPE_P(5);

	const char *filterQuery = text_to_cstring(filterQueryText);
	const char *partitionColumn = text_to_cstring(partitionColumnText);

	RangePartitionContext *partitionContext = NULL;
	FmgrInfo *comparisonFunction = NULL;
	Datum *splitPointArray = NULL;
	int32 splitPointCount = 0;
	uint32 fileCount = 0;

	StringInfo taskDirectory = NULL;
	StringInfo taskAttemptDirectory = NULL;
	FileOutputStream *partitionFileArray = NULL;

	/* first check that array element's and partition column's types match */
	Oid splitPointType = ARR_ELEMTYPE(splitPointObject);
	if (splitPointType != partitionColumnType)
	{
		ereport(ERROR, (errmsg("partition column type %u and split point type %u "
							   "do not match", partitionColumnType, splitPointType)));
	}

	/* use column's type information to get the comparison function */
	comparisonFunction = GetFunctionInfo(partitionColumnType,
										 BTREE_AM_OID, BTORDER_PROC);

	/* deserialize split points into their array representation */
	splitPointArray = DeconstructArrayObject(splitPointObject);
	splitPointCount = ArrayObjectCount(splitPointObject);
	fileCount = splitPointCount + 1; /* range partitioning needs an extra bucket */

	/* create range partition context object */
	partitionContext = palloc0(sizeof(RangePartitionContext));
	partitionContext->comparisonFunction = comparisonFunction;
	partitionContext->splitPointArray = splitPointArray;
	partitionContext->splitPointCount = splitPointCount;

	/* init directories and files to write the partitioned data to */
	taskDirectory = InitTaskDirectory(jobId, taskId);
	taskAttemptDirectory = InitTaskAttemptDirectory(jobId, taskId);

	partitionFileArray = OpenPartitionFiles(taskAttemptDirectory, fileCount);
	FileBufferSizeInBytes = FileBufferSize(PartitionBufferSize, fileCount);

	/* call the partitioning function that does the actual work */
	FilterAndPartitionTable(filterQuery, partitionColumn, partitionColumnType,
							&RangePartitionId, (const void *) partitionContext,
							partitionFileArray, fileCount);

	/* close partition files and atomically rename (commit) them */
	ClosePartitionFiles(partitionFileArray, fileCount);
	RemoveDirectory(taskDirectory);
	RenameDirectory(taskAttemptDirectory, taskDirectory);

	PG_RETURN_VOID();
}
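/*
 * For illustration only, a hypothetical invocation of the function above; the
 * job id, task id, filter query, column name, and split points are made-up
 * values, and the split point array must be sorted and match the column type:
 *
 *   SELECT worker_range_partition_table(42::int8, 1, 'SELECT * FROM lineitem',
 *                                       'l_orderkey', 'int8'::regtype::oid,
 *                                       ARRAY[1000, 2000, 3000]::int8[]);
 *
 * Three split points yield four partition files. The companion function
 * worker_hash_partition_table below takes a partition count in place of the
 * split point array.
 */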
/*
 * worker_hash_partition_table executes the given filter query, repartitions the
 * filter query's results on a partitioning column, and writes the resulting
 * rows to a set of text files on local disk. The function then atomically
 * renames the directory in which the text files live to ensure deterministic
 * behavior.
 *
 * This function applies hash partitioning through the use of a function pointer
 * and a hash context object; for details, see HashPartitionId().
 */
Datum
worker_hash_partition_table(PG_FUNCTION_ARGS)
{
	uint64 jobId = PG_GETARG_INT64(0);
	uint32 taskId = PG_GETARG_UINT32(1);
	text *filterQueryText = PG_GETARG_TEXT_P(2);
	text *partitionColumnText = PG_GETARG_TEXT_P(3);
	Oid partitionColumnType = PG_GETARG_OID(4);
	uint32 partitionCount = PG_GETARG_UINT32(5);

	const char *filterQuery = text_to_cstring(filterQueryText);
	const char *partitionColumn = text_to_cstring(partitionColumnText);

	HashPartitionContext *partitionContext = NULL;
	FmgrInfo *hashFunction = NULL;

	StringInfo taskDirectory = NULL;
	StringInfo taskAttemptDirectory = NULL;
	FileOutputStream *partitionFileArray = NULL;
	uint32 fileCount = partitionCount;

	/* use column's type information to get the hashing function */
	hashFunction = GetFunctionInfo(partitionColumnType, HASH_AM_OID, HASHPROC);

	/* create hash partition context object */
	partitionContext = palloc0(sizeof(HashPartitionContext));
	partitionContext->hashFunction = hashFunction;
	partitionContext->partitionCount = partitionCount;

	/* init directories and files to write the partitioned data to */
	taskDirectory = InitTaskDirectory(jobId, taskId);
	taskAttemptDirectory = InitTaskAttemptDirectory(jobId, taskId);

	partitionFileArray = OpenPartitionFiles(taskAttemptDirectory, fileCount);
	FileBufferSizeInBytes = FileBufferSize(PartitionBufferSize, fileCount);

	/* call the partitioning function that does the actual work */
	FilterAndPartitionTable(filterQuery, partitionColumn, partitionColumnType,
							&HashPartitionId, (const void *) partitionContext,
							partitionFileArray, fileCount);

	/* close partition files and atomically rename (commit) them */
	ClosePartitionFiles(partitionFileArray, fileCount);
	RemoveDirectory(taskDirectory);
	RenameDirectory(taskAttemptDirectory, taskDirectory);

	PG_RETURN_VOID();
}


/*
 * GetFunctionInfo first resolves the operator for the given data type, access
 * method, and support procedure. The function then uses the resolved operator's
 * identifier to fill in a function manager object, and returns this object.
 */
FmgrInfo *
GetFunctionInfo(Oid typeId, Oid accessMethodId, int16 procedureId)
{
	FmgrInfo *functionInfo = (FmgrInfo *) palloc0(sizeof(FmgrInfo));

	/* get default operator class from pg_opclass for datum type */
	Oid operatorClassId = GetDefaultOpClass(typeId, accessMethodId);

	Oid operatorFamilyId = get_opclass_family(operatorClassId);
	Oid operatorClassInputType = get_opclass_input_type(operatorClassId);

	Oid operatorId = get_opfamily_proc(operatorFamilyId, operatorClassInputType,
									   operatorClassInputType, procedureId);
	if (operatorId == InvalidOid)
	{
		ereport(ERROR, (errmsg("could not find function for data typeId %u",
							   typeId)));
	}

	/* fill in the FmgrInfo struct using the operatorId */
	fmgr_info(operatorId, functionInfo);

	return functionInfo;
}
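/*
 * As a concrete example of the resolution above: for the int4 type,
 * GetFunctionInfo(INT4OID, BTREE_AM_OID, BTORDER_PROC) resolves the btree
 * comparison procedure btint4cmp(), while GetFunctionInfo(INT4OID,
 * HASH_AM_OID, HASHPROC) resolves the hashing procedure hashint4().
 */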
/*
 * DeconstructArrayObject takes in a single dimensional array, and deserializes
 * this array's members into an array of datum objects. The function then
 * returns this datum array.
 */
Datum *
DeconstructArrayObject(ArrayType *arrayObject)
{
	Datum *datumArray = NULL;
	bool *datumArrayNulls = NULL;
	int datumArrayLength = 0;

	Oid typeId = InvalidOid;
	bool typeByVal = false;
	char typeAlign = 0;
	int16 typeLength = 0;

	bool arrayHasNull = ARR_HASNULL(arrayObject);
	if (arrayHasNull)
	{
		ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
						errmsg("worker array object cannot contain null values")));
	}

	typeId = ARR_ELEMTYPE(arrayObject);
	get_typlenbyvalalign(typeId, &typeLength, &typeByVal, &typeAlign);

	deconstruct_array(arrayObject, typeId, typeLength, typeByVal, typeAlign,
					  &datumArray, &datumArrayNulls, &datumArrayLength);

	return datumArray;
}


/*
 * ArrayObjectCount takes in a single dimensional array, and returns the number
 * of elements in this array.
 */
int32
ArrayObjectCount(ArrayType *arrayObject)
{
	int32 dimensionCount = ARR_NDIM(arrayObject);
	int32 *dimensionLengthArray = ARR_DIMS(arrayObject);
	int32 arrayLength = 0;

	/* we currently allow split point arrays to have only one subarray */
	Assert(dimensionCount == 1);

	arrayLength = ArrayGetNItems(dimensionCount, dimensionLengthArray);
	if (arrayLength <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
						errmsg("worker array object cannot be empty")));
	}

	return arrayLength;
}


/*
 * InitTaskDirectory creates a job and task directory using given identifiers,
 * if these directories do not already exist. The function then returns the task
 * directory's name.
 */
StringInfo
InitTaskDirectory(uint64 jobId, uint32 taskId)
{
	bool jobDirectoryExists = false;
	bool taskDirectoryExists = false;

	/*
	 * If the task tracker assigned this task (regular case), the tracker should
	 * have already created the job directory.
	 */
	StringInfo jobDirectoryName = JobDirectoryName(jobId);
	StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);

	LockJobResource(jobId, AccessExclusiveLock);

	jobDirectoryExists = DirectoryExists(jobDirectoryName);
	if (!jobDirectoryExists)
	{
		CreateDirectory(jobDirectoryName);
	}

	taskDirectoryExists = DirectoryExists(taskDirectoryName);
	if (!taskDirectoryExists)
	{
		CreateDirectory(taskDirectoryName);
	}

	UnlockJobResource(jobId, AccessExclusiveLock);

	return taskDirectoryName;
}


/*
 * InitTaskAttemptDirectory finds a task attempt directory that is not taken,
 * and creates that directory. The function then returns the task attempt
 * directory's name.
 */
static StringInfo
InitTaskAttemptDirectory(uint64 jobId, uint32 taskId)
{
	StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
	uint32 randomId = (uint32) random();

	/*
	 * We should have only one process executing this task. Still, we append a
	 * random id just in case.
	 */
	StringInfo taskAttemptDirectoryName = makeStringInfo();
	appendStringInfo(taskAttemptDirectoryName, "%s_%0*u",
					 taskDirectoryName->data, MIN_TASK_FILENAME_WIDTH, randomId);

	/*
	 * If this task previously failed, and gets re-executed and improbably draws
	 * the same randomId, the task will fail to create the directory.
	 */
	CreateDirectory(taskAttemptDirectoryName);

	return taskAttemptDirectoryName;
}


/* Calculates and returns the buffer size to use for each file. */
static uint32
FileBufferSize(int partitionBufferSizeInKB, uint32 fileCount)
{
	double partitionBufferSize = (double) partitionBufferSizeInKB * 1024.0;
	uint32 fileBufferSize = (uint32) rint(partitionBufferSize / fileCount);

	return fileBufferSize;
}
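/*
 * For example, with the default PartitionBufferSize of 16384 KB and 32
 * partition files, each file gets rint(16384 * 1024.0 / 32) = 524288 bytes
 * (512 KB) of buffer space; the total partitioning buffer therefore stays
 * constant as the file count grows.
 */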
/*
 * OpenPartitionFiles takes in a directory name and file count, and opens new
 * partition files in this directory. The names for these new files are modeled
 * after Hadoop's naming conventions for map files. These file names, virtual
 * file descriptors, and file buffers are stored together in file output stream
 * objects. These objects are then returned in an array from this function.
 */
static FileOutputStream *
OpenPartitionFiles(StringInfo directoryName, uint32 fileCount)
{
	FileOutputStream *partitionFileArray = NULL;
	File fileDescriptor = 0;
	uint32 fileIndex = 0;
	const int fileFlags = (O_APPEND | O_CREAT | O_RDWR | PG_BINARY);
	const int fileMode = (S_IRUSR | S_IWUSR);

	partitionFileArray = palloc0(fileCount * sizeof(FileOutputStream));

	for (fileIndex = 0; fileIndex < fileCount; fileIndex++)
	{
		StringInfo filePath = PartitionFilename(directoryName, fileIndex);

		fileDescriptor = PathNameOpenFile(filePath->data, fileFlags, fileMode);
		if (fileDescriptor < 0)
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not open file \"%s\": %m",
								   filePath->data)));
		}

		partitionFileArray[fileIndex].fileDescriptor = fileDescriptor;
		partitionFileArray[fileIndex].fileBuffer = makeStringInfo();
		partitionFileArray[fileIndex].filePath = filePath;
	}

	return partitionFileArray;
}


/*
 * ClosePartitionFiles walks over each file output stream object, and flushes
 * any remaining data in the file's buffer. The function then closes the file,
 * and deletes any allocated memory for the file stream object.
 */
static void
ClosePartitionFiles(FileOutputStream *partitionFileArray, uint32 fileCount)
{
	uint32 fileIndex = 0;

	for (fileIndex = 0; fileIndex < fileCount; fileIndex++)
	{
		FileOutputStream partitionFile = partitionFileArray[fileIndex];

		FileOutputStreamFlush(partitionFile);

		FileClose(partitionFile.fileDescriptor);
		FreeStringInfo(partitionFile.fileBuffer);
		FreeStringInfo(partitionFile.filePath);
	}

	pfree(partitionFileArray);
}


/* Constructs a standardized job directory path for the given job id. */
StringInfo
JobDirectoryName(uint64 jobId)
{
	/*
	 * We use the default tablespace in {datadir}/base. Further, we need to
	 * apply padding on our 64-bit job id, and hence can't use UINT64_FORMAT.
	 */
#ifdef HAVE_INTTYPES_H
	StringInfo jobDirectoryName = makeStringInfo();
	appendStringInfo(jobDirectoryName, "base/%s/%s%0*" PRIu64,
					 PG_JOB_CACHE_DIR, JOB_DIRECTORY_PREFIX,
					 MIN_JOB_DIRNAME_WIDTH, jobId);
#else
	StringInfo jobDirectoryName = makeStringInfo();
	appendStringInfo(jobDirectoryName, "base/%s/%s%0*llu",
					 PG_JOB_CACHE_DIR, JOB_DIRECTORY_PREFIX,
					 MIN_JOB_DIRNAME_WIDTH, jobId);
#endif

	return jobDirectoryName;
}


/* Constructs a standardized task directory path for given job and task ids. */
StringInfo
TaskDirectoryName(uint64 jobId, uint32 taskId)
{
	StringInfo jobDirectoryName = JobDirectoryName(jobId);

	StringInfo taskDirectoryName = makeStringInfo();
	appendStringInfo(taskDirectoryName, "%s/%s%0*u",
					 jobDirectoryName->data,
					 TASK_FILE_PREFIX, MIN_TASK_FILENAME_WIDTH, taskId);

	return taskDirectoryName;
}


/* Constructs a standardized partition file path for given directory and id. */
StringInfo
PartitionFilename(StringInfo directoryName, uint32 partitionId)
{
	StringInfo partitionFilename = makeStringInfo();
	appendStringInfo(partitionFilename, "%s/%s%0*u",
					 directoryName->data,
					 PARTITION_FILE_PREFIX, MIN_PARTITION_FILENAME_WIDTH,
					 partitionId);

	return partitionFilename;
}
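/*
 * Putting the three name constructors above together: assuming the prefix and
 * width constants defined in worker_protocol.h (for example "job_", "task_",
 * and "p_" prefixes under a "pgsql_job_cache" directory), the partition files
 * for job 42, task 1 end up under paths shaped like
 *
 *   base/pgsql_job_cache/job_0042/task_000001/p_00000
 *
 * where the task attempt directory additionally carries a random suffix until
 * it is atomically renamed over the task directory.
 */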
/*
 * JobDirectoryElement takes in a filename, and checks if this name lives in the
 * directory path that is used for task output files. Note that this function's
 * implementation is coupled with JobDirectoryName().
 */
bool
JobDirectoryElement(const char *filename)
{
	bool directoryElement = false;
	char *directoryPathFound = NULL;

	StringInfo directoryPath = makeStringInfo();
	appendStringInfo(directoryPath, "base/%s/%s",
					 PG_JOB_CACHE_DIR, JOB_DIRECTORY_PREFIX);

	directoryPathFound = strstr(filename, directoryPath->data);
	if (directoryPathFound != NULL)
	{
		directoryElement = true;
	}

	pfree(directoryPath);

	return directoryElement;
}


/* Checks if a directory exists for the given directory name. */
bool
DirectoryExists(StringInfo directoryName)
{
	bool directoryExists = true;
	struct stat directoryStat;

	int statOK = stat(directoryName->data, &directoryStat);
	if (statOK == 0)
	{
		/* file already exists; just assert that it is a directory */
		Assert(S_ISDIR(directoryStat.st_mode));
	}
	else
	{
		if (errno == ENOENT)
		{
			directoryExists = false;
		}
		else
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not stat directory \"%s\": %m",
								   directoryName->data)));
		}
	}

	return directoryExists;
}


/* Creates a new directory with the given directory name. */
void
CreateDirectory(StringInfo directoryName)
{
	int makeOK = mkdir(directoryName->data, S_IRWXU);
	if (makeOK != 0)
	{
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not create directory \"%s\": %m",
							   directoryName->data)));
	}
}


/*
 * RemoveDirectory first checks if the given directory exists. If it does, the
 * function recursively deletes the contents of the given directory, and then
 * deletes the directory itself. This function is modeled on the Boost file
 * system library's remove_all() method.
 */
void
RemoveDirectory(StringInfo filename)
{
	struct stat fileStat;
	int removed = 0;

	int fileStated = stat(filename->data, &fileStat);
	if (fileStated < 0)
	{
		if (errno == ENOENT)
		{
			return;  /* if file does not exist, return */
		}
		else
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not stat file \"%s\": %m",
								   filename->data)));
		}
	}

	/*
	 * If this is a directory, iterate over all its contents and for each
	 * content, recurse into this function. Also, make sure that we do not
	 * recurse into symbolic links.
	 */
	if (S_ISDIR(fileStat.st_mode) && !S_ISLNK(fileStat.st_mode))
	{
		const char *directoryName = filename->data;
		struct dirent *directoryEntry = NULL;

		DIR *directory = AllocateDir(directoryName);
		if (directory == NULL)
		{
			ereport(ERROR, (errcode_for_file_access(),
							errmsg("could not open directory \"%s\": %m",
								   directoryName)));
		}

		directoryEntry = ReadDir(directory, directoryName);
		for (; directoryEntry != NULL;
			 directoryEntry = ReadDir(directory, directoryName))
		{
			const char *baseFilename = directoryEntry->d_name;
			StringInfo fullFilename = NULL;

			/* if system file, skip it */
			if (strncmp(baseFilename, ".", MAXPGPATH) == 0 ||
				strncmp(baseFilename, "..", MAXPGPATH) == 0)
			{
				continue;
			}

			fullFilename = makeStringInfo();
			appendStringInfo(fullFilename, "%s/%s", directoryName, baseFilename);

			RemoveDirectory(fullFilename);

			FreeStringInfo(fullFilename);
		}

		FreeDir(directory);
	}

	/* we now have an empty directory or a regular file, remove it */
	if (S_ISDIR(fileStat.st_mode))
	{
		removed = rmdir(filename->data);
	}
	else
	{
		removed = unlink(filename->data);
	}

	if (removed != 0)
	{
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not remove file \"%s\": %m",
							   filename->data)));
	}
}
/* Moves a directory from the old path to the new one. */
static void
RenameDirectory(StringInfo oldDirectoryName, StringInfo newDirectoryName)
{
	int renamed = rename(oldDirectoryName->data, newDirectoryName->data);
	if (renamed != 0)
	{
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not rename directory \"%s\" to \"%s\": %m",
							   oldDirectoryName->data, newDirectoryName->data)));
	}
}


/*
 * FileOutputStreamWrite appends the given data to the file stream's internal
 * buffer. The function then checks if the buffered data exceeds the
 * preconfigured buffer size; if so, the function flushes the buffer to the
 * underlying file.
 */
static void
FileOutputStreamWrite(FileOutputStream file, StringInfo dataToWrite)
{
	StringInfo fileBuffer = file.fileBuffer;
	uint32 newBufferSize = fileBuffer->len + dataToWrite->len;

	appendBinaryStringInfo(fileBuffer, dataToWrite->data, dataToWrite->len);

	if (newBufferSize > FileBufferSizeInBytes)
	{
		FileOutputStreamFlush(file);

		resetStringInfo(fileBuffer);
	}
}


/* Flushes data buffered in the file stream object to the underlying file. */
static void
FileOutputStreamFlush(FileOutputStream file)
{
	StringInfo fileBuffer = file.fileBuffer;
	int written = 0;

	errno = 0;
	written = FileWrite(file.fileDescriptor, fileBuffer->data, fileBuffer->len);
	if (written != fileBuffer->len)
	{
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not write %d bytes to partition file \"%s\"",
							   fileBuffer->len, file.filePath->data)));
	}
}
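/*
 * To illustrate the buffering scheme above: assuming a per-file buffer of
 * 512 KB (see the FileBufferSize example earlier), serialized rows accumulate
 * in the in-memory StringInfo and are written out in a single FileWrite()
 * call once the buffer crosses 512 KB, rather than with one write per row;
 * ClosePartitionFiles() flushes whatever remains at the end.
 */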
/*
 * FilterAndPartitionTable executes a given SQL query, and iterates over query
 * results in a read-only fashion. For each resulting row, the function applies
 * the partitioning function and determines the partition identifier. Then, the
 * function chooses the partition file corresponding to this identifier, and
 * serializes the row into this file using the copy command's text format (or
 * its binary format, when binary_worker_copy_format is enabled).
 */
static void
FilterAndPartitionTable(const char *filterQuery,
						const char *partitionColumnName, Oid partitionColumnType,
						uint32 (*PartitionIdFunction)(Datum, const void *),
						const void *partitionIdContext,
						FileOutputStream *partitionFileArray, uint32 fileCount)
{
	PartialCopyState rowOutputState = NULL;
	FmgrInfo *columnOutputFunctions = NULL;
	int partitionColumnIndex = 0;
	Oid partitionColumnTypeId = InvalidOid;
	Portal queryPortal = NULL;
	int connected = 0;
	int finished = 0;

	const char *noPortalName = NULL;
	const bool readOnly = true;
	const bool fetchForward = true;
	const int noCursorOptions = 0;
	const int prefetchCount = ROW_PREFETCH_COUNT;

	connected = SPI_connect();
	if (connected != SPI_OK_CONNECT)
	{
		ereport(ERROR, (errmsg("could not connect to SPI manager")));
	}

	queryPortal = SPI_cursor_open_with_args(noPortalName, filterQuery,
											0, NULL, NULL, NULL, /* no arguments */
											readOnly, noCursorOptions);
	if (queryPortal == NULL)
	{
		ereport(ERROR, (errmsg("could not open implicit cursor for query \"%s\"",
							   filterQuery)));
	}

	rowOutputState = InitRowOutputState();

	SPI_cursor_fetch(queryPortal, fetchForward, prefetchCount);
	if (SPI_processed > 0)
	{
		TupleDesc rowDescriptor = SPI_tuptable->tupdesc;

		partitionColumnIndex = ColumnIndex(rowDescriptor, partitionColumnName);
		partitionColumnTypeId = SPI_gettypeid(rowDescriptor, partitionColumnIndex);
		if (partitionColumnType != partitionColumnTypeId)
		{
			ereport(ERROR, (errmsg("partition column types %u and %u do not match",
								   partitionColumnTypeId, partitionColumnType)));
		}

		columnOutputFunctions = ColumnOutputFunctions(rowDescriptor,
													  rowOutputState->binary);
	}

	if (BinaryWorkerCopyFormat)
	{
		OutputBinaryHeaders(partitionFileArray, fileCount);
	}

	while (SPI_processed > 0)
	{
		int rowIndex = 0;
		for (rowIndex = 0; rowIndex < SPI_processed; rowIndex++)
		{
			HeapTuple row = SPI_tuptable->vals[rowIndex];
			TupleDesc rowDescriptor = SPI_tuptable->tupdesc;
			FileOutputStream partitionFile = { 0, 0, 0 };
			StringInfo rowText = NULL;
			Datum partitionKey = 0;
			bool partitionKeyNull = false;
			uint32 partitionId = 0;

			partitionKey = SPI_getbinval(row, rowDescriptor,
										 partitionColumnIndex, &partitionKeyNull);

			/*
			 * If we have a partition key, we compute its bucket. Else if we have
			 * a null key, we then put this tuple into the 0th bucket. Note that
			 * the 0th bucket may hold other tuples as well, such as tuples whose
			 * partition keys hash to the value 0.
			 */
			if (!partitionKeyNull)
			{
				partitionId = (*PartitionIdFunction)(partitionKey,
													 partitionIdContext);
			}
			else
			{
				partitionId = 0;
			}

			OutputRow(row, rowDescriptor, rowOutputState, columnOutputFunctions);
			rowText = rowOutputState->fe_msgbuf;

			partitionFile = partitionFileArray[partitionId];
			FileOutputStreamWrite(partitionFile, rowText);

			resetStringInfo(rowText);
		}

		SPI_freetuptable(SPI_tuptable);
		SPI_cursor_fetch(queryPortal, fetchForward, prefetchCount);
	}

	SPI_cursor_close(queryPortal);

	if (BinaryWorkerCopyFormat)
	{
		OutputBinaryFooters(partitionFileArray, fileCount);
	}

	/* delete row output memory context */
	ClearRowOutputState(rowOutputState);

	finished = SPI_finish();
	if (finished != SPI_OK_FINISH)
	{
		ereport(ERROR, (errmsg("could not disconnect from SPI manager")));
	}
}
/*
 * Determines the column number for the given column name. The column number
 * count starts at 1.
 */
static int
ColumnIndex(TupleDesc rowDescriptor, const char *columnName)
{
	int columnIndex = SPI_fnumber(rowDescriptor, columnName);
	if (columnIndex == SPI_ERROR_NOATTRIBUTE)
	{
		ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN),
						errmsg("could not find column name \"%s\"", columnName)));
	}

	Assert(columnIndex >= 1);
	return columnIndex;
}


/*
 * ColumnOutputFunctions walks over a table's columns, and finds each column's
 * type information. The function then resolves each type's output function,
 * and stores and returns these output functions in an array.
 */
static FmgrInfo *
ColumnOutputFunctions(TupleDesc rowDescriptor, bool binaryFormat)
{
	uint32 columnCount = (uint32) rowDescriptor->natts;
	FmgrInfo *columnOutputFunctions = palloc0(columnCount * sizeof(FmgrInfo));
	uint32 columnIndex = 0;

	for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
	{
		FmgrInfo *currentOutputFunction = &columnOutputFunctions[columnIndex];
		Form_pg_attribute currentColumn = rowDescriptor->attrs[columnIndex];
		Oid columnTypeId = currentColumn->atttypid;
		Oid outputFunctionId = InvalidOid;
		bool typeVariableLength = false;

		if (binaryFormat)
		{
			getTypeBinaryOutputInfo(columnTypeId, &outputFunctionId,
									&typeVariableLength);
		}
		else
		{
			getTypeOutputInfo(columnTypeId, &outputFunctionId,
							  &typeVariableLength);
		}

		Assert(currentColumn->attisdropped == false);
		fmgr_info(outputFunctionId, currentOutputFunction);
	}

	return columnOutputFunctions;
}


/*
 * InitRowOutputState creates and initializes a copy state object. This object
 * is internal to the copy command's implementation in Postgres; and we refactor
 * and refer to it here to avoid code duplication. We also only initialize the
 * fields needed for writing row data to text files, and skip the other fields.
 *
 * Note that the default field values used in commands/copy.c and this function
 * must match one another. Therefore, any changes to the default values in the
 * copy command must be propagated to this function.
 */
static PartialCopyState
InitRowOutputState(void)
{
	PartialCopyState rowOutputState =
		(PartialCopyState) palloc0(sizeof(PartialCopyStateData));

	int fileEncoding = pg_get_client_encoding();
	int databaseEncoding = GetDatabaseEncoding();
	int databaseEncodingMaxLength = pg_database_encoding_max_length();

	/* initialize defaults for printing null values */
	char *nullPrint = pstrdup("\\N");
	int nullPrintLen = strlen(nullPrint);
	char *nullPrintClient = pg_server_to_any(nullPrint, nullPrintLen,
											 fileEncoding);

	/* set default text output characters */
	rowOutputState->null_print = nullPrint;
	rowOutputState->null_print_client = nullPrintClient;
	rowOutputState->delim = pstrdup("\t");

	rowOutputState->binary = BinaryWorkerCopyFormat;

	/* set encoding conversion information */
	rowOutputState->file_encoding = fileEncoding;

	if (PG_ENCODING_IS_CLIENT_ONLY(fileEncoding))
	{
		ereport(ERROR, (errmsg("cannot repartition into encoding caller cannot "
							   "receive")));
	}

	/* set up transcoding information and default text output characters */
	if ((fileEncoding != databaseEncoding) || (databaseEncodingMaxLength > 1))
	{
		rowOutputState->need_transcoding = true;
	}
	else
	{
		rowOutputState->need_transcoding = false;
	}

	/*
	 * Create a temporary memory context that we can reset once per row to
	 * recover palloc'd memory. This avoids any problems with leaks inside data
	 * type output routines, and should be faster than retail pfree's anyway.
	 */
	rowOutputState->rowcontext = AllocSetContextCreate(CurrentMemoryContext,
													   "WorkerRowOutputContext",
													   ALLOCSET_DEFAULT_MINSIZE,
													   ALLOCSET_DEFAULT_INITSIZE,
													   ALLOCSET_DEFAULT_MAXSIZE);

	/* allocate the message buffer to use for serializing a row */
	rowOutputState->fe_msgbuf = makeStringInfo();

	return rowOutputState;
}


/* Clears copy state used for outputting row data. */
static void
ClearRowOutputState(PartialCopyState rowOutputState)
{
	Assert(rowOutputState != NULL);

	MemoryContextDelete(rowOutputState->rowcontext);
	FreeStringInfo(rowOutputState->fe_msgbuf);

	pfree(rowOutputState->null_print_client);
	pfree(rowOutputState->delim);

	pfree(rowOutputState);
}


/*
 * OutputRow serializes one row using the column output functions, and appends
 * the data to the row output state object's message buffer. This function is
 * modeled after the CopyOneRowTo() function in commands/copy.c, but only
 * implements a subset of that functionality.
 */
static void
OutputRow(HeapTuple row, TupleDesc rowDescriptor,
		  PartialCopyState rowOutputState, FmgrInfo *columnOutputFunctions)
{
	MemoryContext oldContext = NULL;
	uint32 columnIndex = 0;
	uint32 columnCount = (uint32) rowDescriptor->natts;

	Datum *valueArray = (Datum *) palloc0(columnCount * sizeof(Datum));
	bool *isNullArray = (bool *) palloc0(columnCount * sizeof(bool));

	/* deconstruct the tuple; this is faster than repeated heap_getattr */
	heap_deform_tuple(row, rowDescriptor, valueArray, isNullArray);

	/* reset previous tuple's output data, and the temporary memory context */
	resetStringInfo(rowOutputState->fe_msgbuf);
	MemoryContextReset(rowOutputState->rowcontext);

	oldContext = MemoryContextSwitchTo(rowOutputState->rowcontext);

	if (rowOutputState->binary)
	{
		CopySendInt16(rowOutputState, rowDescriptor->natts);
	}

	for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
	{
		Datum value = valueArray[columnIndex];
		bool isNull = isNullArray[columnIndex];
		bool lastColumn = false;

		if (rowOutputState->binary)
		{
			if (!isNull)
			{
				FmgrInfo *outputFunctionPointer =
					&columnOutputFunctions[columnIndex];
				bytea *outputBytes = SendFunctionCall(outputFunctionPointer,
													  value);

				CopySendInt32(rowOutputState, VARSIZE(outputBytes) - VARHDRSZ);
				CopySendData(rowOutputState, VARDATA(outputBytes),
							 VARSIZE(outputBytes) - VARHDRSZ);
			}
			else
			{
				CopySendInt32(rowOutputState, -1);
			}
		}
		else
		{
			if (!isNull)
			{
				FmgrInfo *outputFunctionPointer =
					&columnOutputFunctions[columnIndex];
				char *columnText = OutputFunctionCall(outputFunctionPointer,
													  value);

				CopyAttributeOutText(rowOutputState, columnText);
			}
			else
			{
				CopySendString(rowOutputState,
							   rowOutputState->null_print_client);
			}

			lastColumn = ((columnIndex + 1) == columnCount);
			if (!lastColumn)
			{
				CopySendChar(rowOutputState, rowOutputState->delim[0]);
			}
		}
	}

	if (!rowOutputState->binary)
	{
		/* append default line termination string depending on the platform */
#ifndef WIN32
		CopySendChar(rowOutputState, '\n');
#else
		CopySendString(rowOutputState, "\r\n");
#endif
	}

	MemoryContextSwitchTo(oldContext);

	pfree(valueArray);
	pfree(isNullArray);
}
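/*
 * For illustration, in text format a three-column row holding the integer 42,
 * the three-character string "a<TAB>b", and a SQL null is serialized by
 * OutputRow() as
 *
 *   42 <TAB> a\tb <TAB> \N <newline>
 *
 * that is: "42", a delimiter, the string with its embedded tab escaped as
 * "\t", another delimiter, the null marker "\N", and a line terminator.
 */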
/*
 * OutputBinaryHeaders writes the header of postgres' binary serialization
 * format to each partition file. This function is used when
 * binary_worker_copy_format is enabled.
 */
static void
OutputBinaryHeaders(FileOutputStream *partitionFileArray, uint32 fileCount)
{
	uint32 fileIndex = 0;
	for (fileIndex = 0; fileIndex < fileCount; fileIndex++)
	{
		/* Generate header for a binary copy */
		const int32 zero = 0;
		FileOutputStream partitionFile = { 0, 0, 0 };
		PartialCopyStateData headerOutputStateData;
		PartialCopyState headerOutputState =
			(PartialCopyState) &headerOutputStateData;

		memset(headerOutputState, 0, sizeof(PartialCopyStateData));
		headerOutputState->fe_msgbuf = makeStringInfo();

		/* Signature */
		CopySendData(headerOutputState, BinarySignature, 11);

		/* Flags field (no OIDs) */
		CopySendInt32(headerOutputState, zero);

		/* No header extension */
		CopySendInt32(headerOutputState, zero);

		partitionFile = partitionFileArray[fileIndex];
		FileOutputStreamWrite(partitionFile, headerOutputState->fe_msgbuf);
	}
}


/*
 * OutputBinaryFooters writes the footer of postgres' binary serialization
 * format to each partition file. This function is used when
 * binary_worker_copy_format is enabled.
 */
static void
OutputBinaryFooters(FileOutputStream *partitionFileArray, uint32 fileCount)
{
	uint32 fileIndex = 0;
	for (fileIndex = 0; fileIndex < fileCount; fileIndex++)
	{
		/* Generate footer for a binary copy */
		int16 negative = -1;
		FileOutputStream partitionFile = { 0, 0, 0 };
		PartialCopyStateData footerOutputStateData;
		PartialCopyState footerOutputState =
			(PartialCopyState) &footerOutputStateData;

		memset(footerOutputState, 0, sizeof(PartialCopyStateData));
		footerOutputState->fe_msgbuf = makeStringInfo();

		CopySendInt16(footerOutputState, negative);

		partitionFile = partitionFileArray[fileIndex];
		FileOutputStreamWrite(partitionFile, footerOutputState->fe_msgbuf);
	}
}


/* *INDENT-OFF* */
/* Append data to the copy buffer in outputState. */
static void
CopySendData(PartialCopyState outputState, const void *databuf, int datasize)
{
	appendBinaryStringInfo(outputState->fe_msgbuf, databuf, datasize);
}


/* Append a string to the copy buffer in outputState. */
static void
CopySendString(PartialCopyState outputState, const char *str)
{
	appendBinaryStringInfo(outputState->fe_msgbuf, str, strlen(str));
}


/* Append a char to the copy buffer in outputState. */
static void
CopySendChar(PartialCopyState outputState, char c)
{
	appendStringInfoCharMacro(outputState->fe_msgbuf, c);
}


/* Append an int32 to the copy buffer in outputState. */
static void
CopySendInt32(PartialCopyState outputState, int32 val)
{
	uint32 buf = htonl((uint32) val);
	CopySendData(outputState, &buf, sizeof(buf));
}


/* Append an int16 to the copy buffer in outputState. */
static void
CopySendInt16(PartialCopyState outputState, int16 val)
{
	uint16 buf = htons((uint16) val);
	CopySendData(outputState, &buf, sizeof(buf));
}
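/*
 * Taken together, the framing that the functions above produce for a binary
 * partition file follows PostgreSQL's COPY BINARY layout: the 11-byte
 * signature "PGCOPY\n\377\r\n\0", an int32 flags field and an int32 header
 * extension length (both zero here); then, per tuple, an int16 column count,
 * with each column preceded by an int32 byte length (-1 marking a null); and
 * a final int16 value of -1 as the file trailer. All integers are sent in
 * network byte order via htonl()/htons().
 */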
/*
 * Send text representation of one column, with conversion and escaping.
 *
 * NB: This function is based on commands/copy.c and doesn't fully conform to
 * our coding style. The function should be kept in sync with copy.c.
 */
static void
CopyAttributeOutText(PartialCopyState cstate, char *string)
{
	char *pointer = NULL;
	char *start = NULL;
	char c = '\0';
	char delimc = cstate->delim[0];

	if (cstate->need_transcoding)
	{
		pointer = pg_server_to_any(string, strlen(string), cstate->file_encoding);
	}
	else
	{
		pointer = string;
	}

	/*
	 * We have to grovel through the string searching for control characters
	 * and instances of the delimiter character. In most cases, though, these
	 * are infrequent. To avoid overhead from calling CopySendData once per
	 * character, we dump out all characters between escaped characters in a
	 * single call. The loop invariant is that the data from "start" to
	 * "pointer" can be sent literally, but hasn't yet been.
	 *
	 * As all encodings here are safe, i.e. backend supported ones, we can
	 * skip doing pg_encoding_mblen(), because in valid backend encodings,
	 * extra bytes of a multibyte character never look like ASCII.
	 */
	start = pointer;
	while ((c = *pointer) != '\0')
	{
		if ((unsigned char) c < (unsigned char) 0x20)
		{
			/*
			 * \r and \n must be escaped, the others are traditional. We
			 * prefer to dump these using the C-like notation, rather than
			 * a backslash and the literal character, because it makes the
			 * dump file a bit more proof against Microsoftish data
			 * mangling.
			 */
			switch (c)
			{
				case '\b':
					c = 'b';
					break;
				case '\f':
					c = 'f';
					break;
				case '\n':
					c = 'n';
					break;
				case '\r':
					c = 'r';
					break;
				case '\t':
					c = 't';
					break;
				case '\v':
					c = 'v';
					break;
				default:
					/* If it's the delimiter, must backslash it */
					if (c == delimc)
						break;
					/* All ASCII control chars are length 1 */
					pointer++;
					continue;	/* fall to end of loop */
			}
			/* if we get here, we need to convert the control char */
			CopyFlushOutput(cstate, start, pointer);
			CopySendChar(cstate, '\\');
			CopySendChar(cstate, c);
			start = ++pointer;	/* do not include char in next run */
		}
		else if (c == '\\' || c == delimc)
		{
			CopyFlushOutput(cstate, start, pointer);
			CopySendChar(cstate, '\\');
			start = pointer++;	/* we include char in next run */
		}
		else
		{
			pointer++;
		}
	}

	CopyFlushOutput(cstate, start, pointer);
}
/* *INDENT-ON* */


/* Helper function to send pending copy output. */
static inline void
CopyFlushOutput(PartialCopyState cstate, char *start, char *pointer)
{
	if (pointer > start)
	{
		CopySendData(cstate, start, pointer - start);
	}
}


/* Helper function that invokes a function with the default collation oid. */
Datum
CompareCall2(FmgrInfo *functionInfo, Datum leftArgument, Datum rightArgument)
{
	Datum result = FunctionCall2Coll(functionInfo, DEFAULT_COLLATION_OID,
									 leftArgument, rightArgument);

	return result;
}
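/*
 * A worked example of the upper_bound binary search implemented by
 * RangePartitionId() below, using assumed split points {10, 20, 30} (so
 * fileCount is 4): the value 5 lands in bucket 0, the value 10 in bucket 1
 * (values equal to a split point go to the right), 25 in bucket 2, and 40 in
 * bucket 3.
 */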
/*
 * RangePartitionId determines the partition number for the given data value
 * by applying range partitioning. More specifically, the function takes in a
 * data value and an array of sorted split points, and performs a binary search
 * within that array to determine the bucket the data value falls into. The
 * function then returns that bucket number.
 *
 * Note that we employ a version of binary search known as upper_bound; this
 * ensures that all null values fall into the zeroth bucket and that we maintain
 * full compatibility with the semantics of Hadoop's TotalOrderPartitioner.
 */
static uint32
RangePartitionId(Datum partitionValue, const void *context)
{
	RangePartitionContext *rangePartitionContext =
		(RangePartitionContext *) context;
	FmgrInfo *comparisonFunction = rangePartitionContext->comparisonFunction;
	Datum *pointArray = rangePartitionContext->splitPointArray;
	int32 currentLength = rangePartitionContext->splitPointCount;
	int32 halfLength = 0;
	uint32 firstIndex = 0;

	/*
	 * We implement a binary search variant known as upper_bound. This variant
	 * gives us the semantics we need for partitioned joins; and is also used by
	 * Hadoop's TotalOrderPartitioner. To implement this variant, we rely on SGI
	 * STL v3.3's source code for upper_bound(). Note that elements in the point
	 * array cannot be null.
	 */
	while (currentLength > 0)
	{
		uint32 middleIndex = 0;
		Datum middlePoint = 0;
		Datum comparisonDatum = 0;
		int comparisonResult = 0;

		halfLength = currentLength >> 1;
		middleIndex = firstIndex;
		middleIndex += halfLength;

		middlePoint = pointArray[middleIndex];

		comparisonDatum = CompareCall2(comparisonFunction, partitionValue,
									   middlePoint);
		comparisonResult = DatumGetInt32(comparisonDatum);

		/* if partition value is less than middle point */
		if (comparisonResult < 0)
		{
			currentLength = halfLength;
		}
		else
		{
			firstIndex = middleIndex;
			firstIndex++;
			currentLength = currentLength - halfLength - 1;
		}
	}

	return firstIndex;
}


/*
 * HashPartitionId determines the partition number for the given data value
 * using hash partitioning. Null data values never reach this function; the
 * caller routes them to the zeroth bucket instead. For all other values, the
 * function applies the standard Postgres hashing function for the given data
 * type, and mods the hashed result with the number of partitions. The function
 * then returns the modded number as the partition number.
 *
 * Note that any changes to PostgreSQL's hashing functions will reshuffle the
 * entire distribution created by this function. For a discussion of this issue,
 * see Google "PL/Proxy Users: Hash Functions Have Changed in PostgreSQL 8.4."
 */
static uint32
HashPartitionId(Datum partitionValue, const void *context)
{
	HashPartitionContext *hashPartitionContext =
		(HashPartitionContext *) context;
	FmgrInfo *hashFunction = hashPartitionContext->hashFunction;
	uint32 partitionCount = hashPartitionContext->partitionCount;

	Datum hashDatum = 0;
	uint32 hashResult = 0;
	uint32 hashPartitionId = 0;

	/* hash functions return unsigned 32-bit integers */
	hashDatum = FunctionCall1(hashFunction, partitionValue);
	hashResult = DatumGetUInt32(hashDatum);

	hashPartitionId = (hashResult % partitionCount);

	return hashPartitionId;
}
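/*
 * For illustration, with an assumed partitionCount of 4: a row whose partition
 * key hashes to 11 is appended to partition file 3 (11 % 4 = 3), while a row
 * with a null partition key is routed to file 0 by FilterAndPartitionTable()
 * without calling HashPartitionId().
 */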