mirror of https://github.com/citusdata/citus.git
867 lines
24 KiB
C
867 lines
24 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* columnar_storage.c
|
|
*
|
|
* Copyright (c) Citus Data, Inc.
|
|
*
|
|
* Low-level storage layer for columnar.
|
|
* - Translates columnar read/write operations on logical offsets into operations on pages/blocks.
|
|
* - Emits WAL.
|
|
* - Reads/writes the columnar metapage.
|
|
* - Reserves data offsets, stripe numbers, and row offsets.
|
|
* - Truncation.
|
|
*
|
|
* Higher-level columnar operations deal with logical offsets and large
|
|
* contiguous buffers of data that need to be stored. But the buffer manager
|
|
* and WAL depend on formatted pages with headers, so these large buffers need
|
|
* to be written across many pages. This module translates the contiguous
|
|
* buffers into individual block reads/writes, and performs WAL when
|
|
* necessary.
|
|
*
|
|
* Storage layout: a metapage in block 0, followed by an empty page in block
|
|
* 1, followed by logical data starting at the first byte after the page
|
|
* header in block 2 (having logical offset ColumnarFirstLogicalOffset). (XXX:
|
|
* Block 1 is left empty for no particular reason. Reconsider?). A columnar
|
|
* table should always have at least 2 blocks.
|
|
*
|
|
* Reservation is done with a relation extension lock, and designed for
|
|
* concurrency, so the callers only need an ordinary lock on the
|
|
* relation. Initializing the metapage or truncating the relation require that
|
|
* the caller holds an AccessExclusiveLock. (XXX: New reservations of data are
|
|
* aligned onto a new page for no particular reason. Reconsider?).
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "miscadmin.h"
|
|
#include "safe_lib.h"
|
|
|
|
#include "access/generic_xlog.h"
|
|
#include "catalog/storage.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "storage/lmgr.h"
|
|
|
|
#include "pg_version_compat.h"
|
|
|
|
#include "columnar/columnar.h"
|
|
#include "columnar/columnar_storage.h"
|
|
|
|
|
|
/*
|
|
* Content of the first page in main fork, which stores metadata at file
|
|
* level.
|
|
*/
|
|
typedef struct ColumnarMetapage
|
|
{
|
|
/*
|
|
* Store version of file format used, so we can detect files from
|
|
* previous versions if we change file format.
|
|
*/
|
|
uint32 versionMajor;
|
|
uint32 versionMinor;
|
|
|
|
/*
|
|
* Each of the metadata table rows are identified by a storageId.
|
|
* We store it also in the main fork so we can link metadata rows
|
|
* with data files.
|
|
*/
|
|
uint64 storageId;
|
|
|
|
uint64 reservedStripeId; /* first unused stripe id */
|
|
uint64 reservedRowNumber; /* first unused row number */
|
|
uint64 reservedOffset; /* first unused byte offset */
|
|
|
|
/*
|
|
* Flag set to true in the init fork. After an unlogged table reset (due
|
|
* to a crash), the init fork will be copied over the main fork. When
|
|
* trying to read an unlogged table, if this flag is set to true, we must
|
|
* clear the metadata for the table (because the actual data is gone,
|
|
* too), and clear the flag. We can cross-check that the table is
|
|
* UNLOGGED, and that the main fork is at the minimum size (no actual
|
|
* data).
|
|
*
|
|
* XXX: Not used yet; reserved field for later support for UNLOGGED.
|
|
*/
|
|
bool unloggedReset;
|
|
} ColumnarMetapage;
|
|
|
|
|
|
/* represents a "physical" block+offset address */
|
|
typedef struct PhysicalAddr
|
|
{
|
|
BlockNumber blockno;
|
|
uint32 offset;
|
|
} PhysicalAddr;
|
|
|
|
|
|
#define COLUMNAR_METAPAGE_BLOCKNO 0
|
|
#define COLUMNAR_EMPTY_BLOCKNO 1
|
|
#define COLUMNAR_INVALID_STRIPE_ID 0
|
|
#define COLUMNAR_FIRST_STRIPE_ID 1
|
|
|
|
|
|
#define OLD_METAPAGE_VERSION_HINT "Use \"VACUUM\" to upgrade the columnar table format " \
|
|
"version or run \"ALTER EXTENSION citus UPDATE\"."
|
|
|
|
|
|
/* only for testing purposes */
|
|
PG_FUNCTION_INFO_V1(test_columnar_storage_write_new_page);
|
|
|
|
|
|
/*
|
|
* Map logical offsets to a physical page and offset where the data is kept.
|
|
*/
|
|
static inline PhysicalAddr
|
|
LogicalToPhysical(uint64 logicalOffset)
|
|
{
|
|
PhysicalAddr addr;
|
|
|
|
addr.blockno = logicalOffset / COLUMNAR_BYTES_PER_PAGE;
|
|
addr.offset = SizeOfPageHeaderData + (logicalOffset % COLUMNAR_BYTES_PER_PAGE);
|
|
|
|
return addr;
|
|
}
|
|
|
|
|
|
/*
|
|
* Map a physical page and offset address to a logical address.
|
|
*/
|
|
static inline uint64
|
|
PhysicalToLogical(PhysicalAddr addr)
|
|
{
|
|
return COLUMNAR_BYTES_PER_PAGE * addr.blockno + addr.offset - SizeOfPageHeaderData;
|
|
}
|
|
|
|
|
|
static void ColumnarOverwriteMetapage(Relation relation,
|
|
ColumnarMetapage columnarMetapage);
|
|
static ColumnarMetapage ColumnarMetapageRead(Relation rel, bool force);
|
|
static void ReadFromBlock(Relation rel, BlockNumber blockno, uint32 offset,
|
|
char *buf, uint32 len, bool force);
|
|
static void WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset,
|
|
char *buf, uint32 len, bool clear);
|
|
static uint64 AlignReservation(uint64 prevReservation);
|
|
static bool ColumnarMetapageIsCurrent(ColumnarMetapage *metapage);
|
|
static bool ColumnarMetapageIsOlder(ColumnarMetapage *metapage);
|
|
static bool ColumnarMetapageIsNewer(ColumnarMetapage *metapage);
|
|
static void ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage);
|
|
|
|
|
|
/*
|
|
* ColumnarStorageInit - initialize a new metapage in an empty relation
|
|
* with the given storageId.
|
|
*
|
|
* Caller must hold AccessExclusiveLock on the relation.
|
|
*/
|
|
void
|
|
ColumnarStorageInit(SMgrRelation srel, uint64 storageId)
|
|
{
|
|
BlockNumber nblocks = smgrnblocks(srel, MAIN_FORKNUM);
|
|
|
|
if (nblocks > 0)
|
|
{
|
|
elog(ERROR,
|
|
"attempted to initialize metapage, but %d pages already exist",
|
|
nblocks);
|
|
}
|
|
|
|
/* create two pages */
|
|
#if PG_VERSION_NUM >= PG_VERSION_16
|
|
PGIOAlignedBlock block;
|
|
#else
|
|
PGAlignedBlock block;
|
|
#endif
|
|
Page page = block.data;
|
|
|
|
/* write metapage */
|
|
PageInit(page, BLCKSZ, 0);
|
|
PageHeader phdr = (PageHeader) page;
|
|
|
|
ColumnarMetapage metapage = { 0 };
|
|
metapage.storageId = storageId;
|
|
metapage.versionMajor = COLUMNAR_VERSION_MAJOR;
|
|
metapage.versionMinor = COLUMNAR_VERSION_MINOR;
|
|
metapage.reservedStripeId = COLUMNAR_FIRST_STRIPE_ID;
|
|
metapage.reservedRowNumber = COLUMNAR_FIRST_ROW_NUMBER;
|
|
metapage.reservedOffset = ColumnarFirstLogicalOffset;
|
|
metapage.unloggedReset = false;
|
|
memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower,
|
|
(char *) &metapage, sizeof(ColumnarMetapage));
|
|
phdr->pd_lower += sizeof(ColumnarMetapage);
|
|
|
|
log_newpage(RelationPhysicalIdentifierBackend_compat(&srel), MAIN_FORKNUM,
|
|
COLUMNAR_METAPAGE_BLOCKNO, page, true);
|
|
PageSetChecksumInplace(page, COLUMNAR_METAPAGE_BLOCKNO);
|
|
smgrextend(srel, MAIN_FORKNUM, COLUMNAR_METAPAGE_BLOCKNO, page, true);
|
|
|
|
/* write empty page */
|
|
PageInit(page, BLCKSZ, 0);
|
|
|
|
log_newpage(RelationPhysicalIdentifierBackend_compat(&srel), MAIN_FORKNUM,
|
|
COLUMNAR_EMPTY_BLOCKNO, page, true);
|
|
PageSetChecksumInplace(page, COLUMNAR_EMPTY_BLOCKNO);
|
|
smgrextend(srel, MAIN_FORKNUM, COLUMNAR_EMPTY_BLOCKNO, page, true);
|
|
|
|
/*
|
|
* An immediate sync is required even if we xlog'd the page, because the
|
|
* write did not go through shared_buffers and therefore a concurrent
|
|
* checkpoint may have moved the redo pointer past our xlog record.
|
|
*/
|
|
smgrimmedsync(srel, MAIN_FORKNUM);
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageUpdateCurrent - update the metapage to the current
|
|
* version. No effect if the version already matches. If 'upgrade' is true,
|
|
* throw an error if metapage version is newer; if 'upgrade' is false, it's a
|
|
* downgrade, so throw an error if the metapage version is older.
|
|
*
|
|
* NB: caller must ensure that metapage already exists, which might not be the
|
|
* case on 10.0.
|
|
*/
|
|
void
|
|
ColumnarStorageUpdateCurrent(Relation rel, bool upgrade, uint64 reservedStripeId,
|
|
uint64 reservedRowNumber, uint64 reservedOffset)
|
|
{
|
|
LockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, true);
|
|
|
|
if (ColumnarMetapageIsCurrent(&metapage))
|
|
{
|
|
/* nothing to do */
|
|
return;
|
|
}
|
|
|
|
if (upgrade && ColumnarMetapageIsNewer(&metapage))
|
|
{
|
|
elog(ERROR, "found newer columnar metapage while upgrading");
|
|
}
|
|
|
|
if (!upgrade && ColumnarMetapageIsOlder(&metapage))
|
|
{
|
|
elog(ERROR, "found older columnar metapage while downgrading");
|
|
}
|
|
|
|
metapage.versionMajor = COLUMNAR_VERSION_MAJOR;
|
|
metapage.versionMinor = COLUMNAR_VERSION_MINOR;
|
|
|
|
/* storageId remains the same */
|
|
metapage.reservedStripeId = reservedStripeId;
|
|
metapage.reservedRowNumber = reservedRowNumber;
|
|
metapage.reservedOffset = reservedOffset;
|
|
|
|
ColumnarOverwriteMetapage(rel, metapage);
|
|
|
|
UnlockRelationForExtension(rel, ExclusiveLock);
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageGetVersionMajor - return major version from the metapage.
|
|
*
|
|
* Throw an error if the metapage is not the current version, unless
|
|
* 'force' is true.
|
|
*/
|
|
uint64
|
|
ColumnarStorageGetVersionMajor(Relation rel, bool force)
|
|
{
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
|
|
|
|
return metapage.versionMajor;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageGetVersionMinor - return minor version from the metapage.
|
|
*
|
|
* Throw an error if the metapage is not the current version, unless
|
|
* 'force' is true.
|
|
*/
|
|
uint64
|
|
ColumnarStorageGetVersionMinor(Relation rel, bool force)
|
|
{
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
|
|
|
|
return metapage.versionMinor;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageGetStorageId - return storage ID from the metapage.
|
|
*
|
|
* Throw an error if the metapage is not the current version, unless
|
|
* 'force' is true.
|
|
*/
|
|
uint64
|
|
ColumnarStorageGetStorageId(Relation rel, bool force)
|
|
{
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
|
|
|
|
return metapage.storageId;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageGetReservedStripeId - return reserved stripe ID from the
|
|
* metapage.
|
|
*
|
|
* Throw an error if the metapage is not the current version, unless
|
|
* 'force' is true.
|
|
*/
|
|
uint64
|
|
ColumnarStorageGetReservedStripeId(Relation rel, bool force)
|
|
{
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
|
|
|
|
return metapage.reservedStripeId;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageGetReservedRowNumber - return reserved row number from the
|
|
* metapage.
|
|
*
|
|
* Throw an error if the metapage is not the current version, unless
|
|
* 'force' is true.
|
|
*/
|
|
uint64
|
|
ColumnarStorageGetReservedRowNumber(Relation rel, bool force)
|
|
{
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
|
|
|
|
return metapage.reservedRowNumber;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageGetReservedOffset - return reserved offset from the metapage.
|
|
*
|
|
* Throw an error if the metapage is not the current version, unless
|
|
* 'force' is true.
|
|
*/
|
|
uint64
|
|
ColumnarStorageGetReservedOffset(Relation rel, bool force)
|
|
{
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
|
|
|
|
return metapage.reservedOffset;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageIsCurrent - return true if metapage exists and is not
|
|
* the current version.
|
|
*/
|
|
bool
|
|
ColumnarStorageIsCurrent(Relation rel)
|
|
{
|
|
BlockNumber nblocks = smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM);
|
|
|
|
if (nblocks < 2)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, true);
|
|
return ColumnarMetapageIsCurrent(&metapage);
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageReserveRowNumber returns reservedRowNumber and advances
|
|
* it for next row number reservation.
|
|
*/
|
|
uint64
|
|
ColumnarStorageReserveRowNumber(Relation rel, uint64 nrows)
|
|
{
|
|
LockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, false);
|
|
|
|
uint64 firstRowNumber = metapage.reservedRowNumber;
|
|
metapage.reservedRowNumber += nrows;
|
|
|
|
ColumnarOverwriteMetapage(rel, metapage);
|
|
|
|
UnlockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
return firstRowNumber;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageReserveStripeId returns stripeId and advances it for next
|
|
* stripeId reservation.
|
|
* Note that this function doesn't handle row number reservation.
|
|
* See ColumnarStorageReserveRowNumber function.
|
|
*/
|
|
uint64
|
|
ColumnarStorageReserveStripeId(Relation rel)
|
|
{
|
|
LockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, false);
|
|
|
|
uint64 stripeId = metapage.reservedStripeId;
|
|
metapage.reservedStripeId++;
|
|
|
|
ColumnarOverwriteMetapage(rel, metapage);
|
|
|
|
UnlockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
return stripeId;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageReserveData - reserve logical data offsets for writing.
|
|
*/
|
|
uint64
|
|
ColumnarStorageReserveData(Relation rel, uint64 amount)
|
|
{
|
|
if (amount == 0)
|
|
{
|
|
return ColumnarInvalidLogicalOffset;
|
|
}
|
|
|
|
LockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, false);
|
|
|
|
uint64 alignedReservation = AlignReservation(metapage.reservedOffset);
|
|
uint64 nextReservation = alignedReservation + amount;
|
|
metapage.reservedOffset = nextReservation;
|
|
|
|
/* write new reservation */
|
|
ColumnarOverwriteMetapage(rel, metapage);
|
|
|
|
/* last used PhysicalAddr of new reservation */
|
|
PhysicalAddr final = LogicalToPhysical(nextReservation - 1);
|
|
|
|
/* extend with new pages */
|
|
BlockNumber nblocks = smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM);
|
|
|
|
while (nblocks <= final.blockno)
|
|
{
|
|
Buffer newBuffer = ReadBuffer(rel, P_NEW);
|
|
Assert(BufferGetBlockNumber(newBuffer) == nblocks);
|
|
ReleaseBuffer(newBuffer);
|
|
nblocks++;
|
|
}
|
|
|
|
UnlockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
return alignedReservation;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageRead - map the logical offset to a block and offset, then
|
|
* read the buffer from multiple blocks if necessary.
|
|
*/
|
|
void
|
|
ColumnarStorageRead(Relation rel, uint64 logicalOffset, char *data, uint32 amount)
|
|
{
|
|
/* if there's no work to do, succeed even with invalid offset */
|
|
if (amount == 0)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if (!ColumnarLogicalOffsetIsValid(logicalOffset))
|
|
{
|
|
elog(ERROR,
|
|
"attempted columnar read on relation %d from invalid logical offset: "
|
|
UINT64_FORMAT,
|
|
rel->rd_id, logicalOffset);
|
|
}
|
|
|
|
uint64 read = 0;
|
|
|
|
while (read < amount)
|
|
{
|
|
PhysicalAddr addr = LogicalToPhysical(logicalOffset + read);
|
|
|
|
uint32 to_read = Min(amount - read, BLCKSZ - addr.offset);
|
|
ReadFromBlock(rel, addr.blockno, addr.offset, data + read, to_read,
|
|
false);
|
|
|
|
read += to_read;
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageWrite - map the logical offset to a block and offset, then
|
|
* write the buffer across multiple blocks if necessary.
|
|
*/
|
|
void
|
|
ColumnarStorageWrite(Relation rel, uint64 logicalOffset, char *data, uint32 amount)
|
|
{
|
|
/* if there's no work to do, succeed even with invalid offset */
|
|
if (amount == 0)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if (!ColumnarLogicalOffsetIsValid(logicalOffset))
|
|
{
|
|
elog(ERROR,
|
|
"attempted columnar write on relation %d to invalid logical offset: "
|
|
UINT64_FORMAT,
|
|
rel->rd_id, logicalOffset);
|
|
}
|
|
|
|
uint64 written = 0;
|
|
|
|
while (written < amount)
|
|
{
|
|
PhysicalAddr addr = LogicalToPhysical(logicalOffset + written);
|
|
|
|
uint64 to_write = Min(amount - written, BLCKSZ - addr.offset);
|
|
WriteToBlock(rel, addr.blockno, addr.offset, data + written, to_write,
|
|
false);
|
|
|
|
written += to_write;
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarStorageTruncate - truncate the columnar storage such that
|
|
* newDataReservation will be the first unused logical offset available. Free
|
|
* pages at the end of the relation.
|
|
*
|
|
* Caller must hold AccessExclusiveLock on the relation.
|
|
*
|
|
* Returns true if pages were truncated; false otherwise.
|
|
*/
|
|
bool
|
|
ColumnarStorageTruncate(Relation rel, uint64 newDataReservation)
|
|
{
|
|
if (!ColumnarLogicalOffsetIsValid(newDataReservation))
|
|
{
|
|
elog(ERROR,
|
|
"attempted to truncate relation %d to invalid logical offset: " UINT64_FORMAT,
|
|
rel->rd_id, newDataReservation);
|
|
}
|
|
|
|
BlockNumber old_rel_pages = smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM);
|
|
if (old_rel_pages == 0)
|
|
{
|
|
/* nothing to do */
|
|
return false;
|
|
}
|
|
|
|
LockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(rel, false);
|
|
|
|
if (metapage.reservedOffset < newDataReservation)
|
|
{
|
|
elog(ERROR,
|
|
"attempted to truncate relation %d to offset " UINT64_FORMAT \
|
|
" which is higher than existing offset " UINT64_FORMAT,
|
|
rel->rd_id, newDataReservation, metapage.reservedOffset);
|
|
}
|
|
|
|
if (metapage.reservedOffset == newDataReservation)
|
|
{
|
|
/* nothing to do */
|
|
UnlockRelationForExtension(rel, ExclusiveLock);
|
|
return false;
|
|
}
|
|
|
|
metapage.reservedOffset = newDataReservation;
|
|
|
|
/* write new reservation */
|
|
ColumnarOverwriteMetapage(rel, metapage);
|
|
|
|
UnlockRelationForExtension(rel, ExclusiveLock);
|
|
|
|
PhysicalAddr final = LogicalToPhysical(newDataReservation - 1);
|
|
BlockNumber new_rel_pages = final.blockno + 1;
|
|
Assert(new_rel_pages <= old_rel_pages);
|
|
|
|
/*
|
|
* Truncate the storage. Note that RelationTruncate() takes care of
|
|
* Write Ahead Logging.
|
|
*/
|
|
if (new_rel_pages < old_rel_pages)
|
|
{
|
|
RelationTruncate(rel, new_rel_pages);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarOverwriteMetapage writes given columnarMetapage back to metapage
|
|
* for given relation.
|
|
*/
|
|
static void
|
|
ColumnarOverwriteMetapage(Relation relation, ColumnarMetapage columnarMetapage)
|
|
{
|
|
/* clear metapage because we are overwriting */
|
|
bool clear = true;
|
|
WriteToBlock(relation, COLUMNAR_METAPAGE_BLOCKNO, SizeOfPageHeaderData,
|
|
(char *) &columnarMetapage, sizeof(ColumnarMetapage), clear);
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarMetapageRead - read the current contents of the metapage. Error if
|
|
* it does not exist. Throw an error if the metapage is not the current
|
|
* version, unless 'force' is true.
|
|
*
|
|
* NB: it's safe to read a different version of a metapage because we
|
|
* guarantee that fields will only be added and existing fields will never be
|
|
* changed. However, it's important that we don't depend on new fields being
|
|
* set properly when we read an old metapage; an old metapage should only be
|
|
* read for the purposes of upgrading or error checking.
|
|
*/
|
|
static ColumnarMetapage
|
|
ColumnarMetapageRead(Relation rel, bool force)
|
|
{
|
|
BlockNumber nblocks = smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM);
|
|
if (nblocks == 0)
|
|
{
|
|
/*
|
|
* We only expect this to happen when upgrading citus.so. This is because,
|
|
* in current version of columnar, we immediately create the metapage
|
|
* for columnar tables, i.e right after creating the table.
|
|
* However in older versions, we were creating metapages lazily, i.e
|
|
* when ingesting data to columnar table.
|
|
*/
|
|
ereport(ERROR, (errmsg("columnar metapage for relation \"%s\" does not exist",
|
|
RelationGetRelationName(rel)),
|
|
errhint(OLD_METAPAGE_VERSION_HINT)));
|
|
}
|
|
|
|
/*
|
|
* Regardless of "force" parameter, always force read metapage block.
|
|
* We will check metapage version in ColumnarMetapageCheckVersion
|
|
* depending on "force".
|
|
*/
|
|
bool forceReadBlock = true;
|
|
ColumnarMetapage metapage;
|
|
ReadFromBlock(rel, COLUMNAR_METAPAGE_BLOCKNO, SizeOfPageHeaderData,
|
|
(char *) &metapage, sizeof(ColumnarMetapage), forceReadBlock);
|
|
|
|
if (!force)
|
|
{
|
|
ColumnarMetapageCheckVersion(rel, &metapage);
|
|
}
|
|
|
|
return metapage;
|
|
}
|
|
|
|
|
|
/*
|
|
* ReadFromBlock - read bytes from a page at the given offset. If 'force' is
|
|
* true, don't check pd_lower; useful when reading a metapage of unknown
|
|
* version.
|
|
*/
|
|
static void
|
|
ReadFromBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf,
|
|
uint32 len, bool force)
|
|
{
|
|
Buffer buffer = ReadBuffer(rel, blockno);
|
|
LockBuffer(buffer, BUFFER_LOCK_SHARE);
|
|
Page page = BufferGetPage(buffer);
|
|
PageHeader phdr = (PageHeader) page;
|
|
|
|
if (BLCKSZ < offset + len || (!force && (phdr->pd_lower < offset + len)))
|
|
{
|
|
elog(ERROR,
|
|
"attempt to read columnar data of length %d from offset %d of block %d of relation %d",
|
|
len, offset, blockno, rel->rd_id);
|
|
}
|
|
|
|
memcpy_s(buf, len, page + offset, len);
|
|
UnlockReleaseBuffer(buffer);
|
|
}
|
|
|
|
|
|
/*
|
|
* WriteToBlock - append data to a block, initializing if necessary, and emit
|
|
* WAL. If 'clear' is true, always clear the data on the page and reinitialize
|
|
* it first, and offset must be SizeOfPageHeaderData. Otherwise, offset must
|
|
* be equal to pd_lower and pd_lower will be set to the end of the written
|
|
* data.
|
|
*/
|
|
static void
|
|
WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf,
|
|
uint32 len, bool clear)
|
|
{
|
|
Buffer buffer = ReadBuffer(rel, blockno);
|
|
GenericXLogState *state = GenericXLogStart(rel);
|
|
|
|
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
|
|
|
Page page = GenericXLogRegisterBuffer(state, buffer, GENERIC_XLOG_FULL_IMAGE);
|
|
|
|
PageHeader phdr = (PageHeader) page;
|
|
if (PageIsNew(page) || clear)
|
|
{
|
|
PageInit(page, BLCKSZ, 0);
|
|
}
|
|
|
|
if (phdr->pd_lower < offset || phdr->pd_upper - offset < len)
|
|
{
|
|
elog(ERROR,
|
|
"attempt to write columnar data of length %d to offset %d of block %d of relation %d",
|
|
len, offset, blockno, rel->rd_id);
|
|
}
|
|
|
|
/*
|
|
* After a transaction has been rolled-back, we might be
|
|
* over-writing the rolledback write, so phdr->pd_lower can be
|
|
* different from addr.offset.
|
|
*
|
|
* We reset pd_lower to reset the rolledback write.
|
|
*
|
|
* Given that we always align page reservation to the next page as of
|
|
* 10.2, having such a disk page is only possible if write operaion
|
|
* failed in an older version of columnar, but now user attempts writing
|
|
* to that table in version >= 10.2.
|
|
*/
|
|
if (phdr->pd_lower > offset)
|
|
{
|
|
ereport(DEBUG4, (errmsg("overwriting page %u", blockno),
|
|
errdetail("This can happen after a roll-back.")));
|
|
phdr->pd_lower = offset;
|
|
}
|
|
|
|
memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, buf, len);
|
|
phdr->pd_lower += len;
|
|
|
|
GenericXLogFinish(state);
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
}
|
|
|
|
|
|
/*
|
|
* AlignReservation - given an unused logical byte offset, align it so that it
|
|
* falls at the start of a page.
|
|
*
|
|
* XXX: Reconsider whether we want/need to do this at all.
|
|
*/
|
|
static uint64
|
|
AlignReservation(uint64 prevReservation)
|
|
{
|
|
PhysicalAddr prevAddr = LogicalToPhysical(prevReservation);
|
|
uint64 alignedReservation = prevReservation;
|
|
|
|
if (prevAddr.offset != SizeOfPageHeaderData)
|
|
{
|
|
/* not aligned; align on beginning of next page */
|
|
PhysicalAddr initial = { 0 };
|
|
initial.blockno = prevAddr.blockno + 1;
|
|
initial.offset = SizeOfPageHeaderData;
|
|
alignedReservation = PhysicalToLogical(initial);
|
|
}
|
|
|
|
Assert(alignedReservation >= prevReservation);
|
|
return alignedReservation;
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarMetapageIsCurrent - is the metapage at the latest version?
|
|
*/
|
|
static bool
|
|
ColumnarMetapageIsCurrent(ColumnarMetapage *metapage)
|
|
{
|
|
return (metapage->versionMajor == COLUMNAR_VERSION_MAJOR &&
|
|
metapage->versionMinor == COLUMNAR_VERSION_MINOR);
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarMetapageIsOlder - is the metapage older than the current version?
|
|
*/
|
|
static bool
|
|
ColumnarMetapageIsOlder(ColumnarMetapage *metapage)
|
|
{
|
|
return (metapage->versionMajor < COLUMNAR_VERSION_MAJOR ||
|
|
(metapage->versionMajor == COLUMNAR_VERSION_MAJOR &&
|
|
(int) metapage->versionMinor < (int) COLUMNAR_VERSION_MINOR));
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarMetapageIsNewer - is the metapage newer than the current version?
|
|
*/
|
|
static bool
|
|
ColumnarMetapageIsNewer(ColumnarMetapage *metapage)
|
|
{
|
|
return (metapage->versionMajor > COLUMNAR_VERSION_MAJOR ||
|
|
(metapage->versionMajor == COLUMNAR_VERSION_MAJOR &&
|
|
metapage->versionMinor > COLUMNAR_VERSION_MINOR));
|
|
}
|
|
|
|
|
|
/*
|
|
* ColumnarMetapageCheckVersion - throw an error if accessing old
|
|
* version of metapage.
|
|
*/
|
|
static void
|
|
ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage)
|
|
{
|
|
if (!ColumnarMetapageIsCurrent(metapage))
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg(
|
|
"attempted to access relation \"%s\", which uses an older columnar format",
|
|
RelationGetRelationName(rel)),
|
|
errdetail(
|
|
"Columnar format version %d.%d is required, \"%s\" has version %d.%d.",
|
|
COLUMNAR_VERSION_MAJOR, COLUMNAR_VERSION_MINOR,
|
|
RelationGetRelationName(rel),
|
|
metapage->versionMajor, metapage->versionMinor),
|
|
errhint(OLD_METAPAGE_VERSION_HINT)));
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* test_columnar_storage_write_new_page is a UDF only used for testing
|
|
* purposes. It could make more sense to define this in columnar_debug.c,
|
|
* but the storage layer doesn't expose ColumnarMetapage to any other files,
|
|
* so we define it here.
|
|
*/
|
|
Datum
|
|
test_columnar_storage_write_new_page(PG_FUNCTION_ARGS)
|
|
{
|
|
Oid relationId = PG_GETARG_OID(0);
|
|
|
|
Relation relation = relation_open(relationId, AccessShareLock);
|
|
|
|
/*
|
|
* Allocate a new page, write some data to there, and set reserved offset
|
|
* to the start of that page. That way, for a subsequent write operation,
|
|
* storage layer would try to overwrite the page that we allocated here.
|
|
*/
|
|
uint64 newPageOffset = ColumnarStorageGetReservedOffset(relation, false);
|
|
|
|
ColumnarStorageReserveData(relation, 100);
|
|
ColumnarStorageWrite(relation, newPageOffset, "foo_bar", 8);
|
|
|
|
ColumnarMetapage metapage = ColumnarMetapageRead(relation, false);
|
|
metapage.reservedOffset = newPageOffset;
|
|
ColumnarOverwriteMetapage(relation, metapage);
|
|
|
|
relation_close(relation, AccessShareLock);
|
|
|
|
PG_RETURN_VOID();
|
|
}
|