diff --git a/src/backend/columnar/columnar_customscan.c b/src/backend/columnar/columnar_customscan.c index 4866c81ad..88760c654 100644 --- a/src/backend/columnar/columnar_customscan.c +++ b/src/backend/columnar/columnar_customscan.c @@ -27,6 +27,7 @@ #include "columnar/columnar_customscan.h" #include "columnar/columnar_metadata.h" #include "columnar/columnar_tableam.h" +#include "distributed/listutils.h" typedef struct ColumnarScanPath { @@ -50,8 +51,13 @@ typedef struct ColumnarScanState } ColumnarScanState; +typedef bool (*PathPredicate)(Path *path); + + static void ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte); +static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate); +static bool IsNotIndexPath(Path *path); static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); static Cost ColumnarScanCost(RangeTblEntry *rte); @@ -137,18 +143,6 @@ columnar_customscan_init() } -static void -clear_paths(RelOptInfo *rel) -{ - rel->pathlist = NIL; - rel->partial_pathlist = NIL; - rel->cheapest_startup_path = NULL; - rel->cheapest_total_path = NULL; - rel->cheapest_unique_path = NULL; - rel->cheapest_parameterized_paths = NIL; -} - - static void ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte) @@ -188,8 +182,13 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti, ereport(DEBUG1, (errmsg("pathlist hook for columnar table am"))); - /* we propose a new path that will be the only path for scanning this relation */ - clear_paths(rel); + /* + * TODO: Since we don't have a proper costing model for + * ColumnarCustomScan, we remove other paths to force postgres to + * use ColumnarCustomScan. Note that we still keep index paths + * since they might still be useful. + */ + RemovePathsByPredicate(rel, IsNotIndexPath); add_path(rel, customPath); } } @@ -197,6 +196,38 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti, } +/* + * RemovePathsByPredicate removes the paths for which removePathPredicate + * evaluates to true from the pathlist of the given rel. + */ +static void +RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate) +{ + List *filteredPathList = NIL; + + Path *path = NULL; + foreach_ptr(path, rel->pathlist) + { + if (!removePathPredicate(path)) + { + filteredPathList = lappend(filteredPathList, path); + } + } + + rel->pathlist = filteredPathList; +} + + +/* + * IsNotIndexPath returns true if the given path is not an IndexPath. 
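 + * It is passed to RemovePathsByPredicate from ColumnarSetRelPathlistHook so + * that every path other than index paths is dropped before the columnar + * custom scan path is added.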
+ */ +static bool +IsNotIndexPath(Path *path) +{ + return !IsA(path, IndexPath); +} + + static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { diff --git a/src/backend/columnar/columnar_debug.c b/src/backend/columnar/columnar_debug.c index 2a772e7da..5b62a3b56 100644 --- a/src/backend/columnar/columnar_debug.c +++ b/src/backend/columnar/columnar_debug.c @@ -12,6 +12,7 @@ #include "pg_config.h" #include "access/nbtree.h" +#include "access/table.h" #include "catalog/pg_am.h" #include "catalog/pg_type.h" #include "distributed/pg_version_constants.h" @@ -25,11 +26,13 @@ #include "utils/tuplestore.h" #include "columnar/columnar.h" +#include "columnar/columnar_storage.h" #include "columnar/columnar_version_compat.h" static void MemoryContextTotals(MemoryContext context, MemoryContextCounters *counters); PG_FUNCTION_INFO_V1(columnar_store_memory_stats); +PG_FUNCTION_INFO_V1(columnar_storage_info); /* @@ -72,6 +75,74 @@ columnar_store_memory_stats(PG_FUNCTION_ARGS) } +/* + * columnar_storage_info - UDF to return internal storage info for a columnar relation. + * + * DDL: + * CREATE OR REPLACE FUNCTION columnar_storage_info( + * rel regclass, + * version_major OUT int4, + * version_minor OUT int4, + * storage_id OUT int8, + * reserved_stripe_id OUT int8, + * reserved_row_number OUT int8, + * reserved_offset OUT int8) + * STRICT + * LANGUAGE c AS 'MODULE_PATHNAME', 'columnar_storage_info'; + */ +Datum +columnar_storage_info(PG_FUNCTION_ARGS) +{ +#define STORAGE_INFO_NATTS 6 + Oid relid = PG_GETARG_OID(0); + TupleDesc tupdesc; + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + { + elog(ERROR, "return type must be a row type"); + } + + if (tupdesc->natts != STORAGE_INFO_NATTS) + { + elog(ERROR, "return type must have %d columns", STORAGE_INFO_NATTS); + } + + Relation rel = table_open(relid, AccessShareLock); + if (!IsColumnarTableAmTable(relid)) + { + ereport(ERROR, (errmsg("table \"%s\" is not a columnar table", + RelationGetRelationName(rel)))); + } + + RelationOpenSmgr(rel); + + Datum values[STORAGE_INFO_NATTS] = { 0 }; + bool nulls[STORAGE_INFO_NATTS] = { 0 }; + + /* + * Pass force = true so that we can inspect metapages that are not the + * current version. + * + * NB: ensure the order and number of attributes correspond to DDL + * declaration. + */ + values[0] = Int32GetDatum(ColumnarStorageGetVersionMajor(rel, true)); + values[1] = Int32GetDatum(ColumnarStorageGetVersionMinor(rel, true)); + values[2] = Int64GetDatum(ColumnarStorageGetStorageId(rel, true)); + values[3] = Int64GetDatum(ColumnarStorageGetReservedStripeId(rel, true)); + values[4] = Int64GetDatum(ColumnarStorageGetReservedRowNumber(rel, true)); + values[5] = Int64GetDatum(ColumnarStorageGetReservedOffset(rel, true)); + + /* release lock */ + table_close(rel, AccessShareLock); + + HeapTuple tuple = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); +} + + /* * MemoryContextTotals adds stats of the given memory context and its * subtree to the given counters. diff --git a/src/backend/columnar/columnar_metadata.c b/src/backend/columnar/columnar_metadata.c index b6aab8922..0ebb533c7 100644 --- a/src/backend/columnar/columnar_metadata.c +++ b/src/backend/columnar/columnar_metadata.c @@ -1,8 +1,19 @@ /*------------------------------------------------------------------------- * - * columnar_metadata_tables.c + * columnar_metadata.c * - * Copyright (c), Citus Data, Inc. 
+ * Copyright (c) Citus Data, Inc. + * + * Manages metadata for columnar relations in separate, shared metadata tables + * in the "columnar" schema. + * + * * holds basic stripe information including data size and row counts + * * holds basic chunk and chunk group information like data offsets and + * min/max values (used for Chunk Group Filtering) + * * useful for fast VACUUM operations (e.g. reporting with VACUUM VERBOSE) + * * useful for stats/costing + * * maps logical row numbers to stripe IDs + * * TODO: visibility information * *------------------------------------------------------------------------- */ @@ -14,7 +25,9 @@ #include "citus_version.h" #include "columnar/columnar.h" +#include "columnar/columnar_storage.h" #include "columnar/columnar_version_compat.h" +#include "distributed/listutils.h" #include #include "access/heapam.h" @@ -30,7 +43,6 @@ #include "commands/sequence.h" #include "commands/trigger.h" #include "distributed/metadata_cache.h" -#include "distributed/resource_lock.h" #include "executor/executor.h" #include "executor/spi.h" #include "miscadmin.h" @@ -48,28 +60,6 @@ #include "utils/relfilenodemap.h" -/* - * Content of the first page in main fork, which stores metadata at file - * level. - */ -typedef struct ColumnarMetapage -{ - /* - * Store version of file format used, so we can detect files from - * previous versions if we change file format. - */ - int versionMajor; - int versionMinor; - - /* - * Each of the metadata table rows are identified by a storageId. - * We store it also in the main fork so we can link metadata rows - * with data files. - */ - uint64 storageId; -} ColumnarMetapage; - - typedef struct { Relation rel; @@ -80,14 +70,14 @@ static void InsertStripeMetadataRow(uint64 storageId, StripeMetadata *stripe); static void GetHighestUsedAddressAndId(uint64 storageId, uint64 *highestUsedAddress, uint64 *highestUsedId); -static void LockForStripeReservation(Relation rel, LOCKMODE mode); -static void UnlockForStripeReservation(Relation rel, LOCKMODE mode); static List * ReadDataFileStripeList(uint64 storageId, Snapshot snapshot); +static StripeMetadata * BuildStripeMetadata(Datum *datumArray); static uint32 * ReadChunkGroupRowCounts(uint64 storageId, uint64 stripe, uint32 chunkGroupCount); static Oid ColumnarStorageIdSequenceRelationId(void); static Oid ColumnarStripeRelationId(void); -static Oid ColumnarStripeIndexRelationId(void); +static Oid ColumnarStripePKeyIndexRelationId(void); +static Oid ColumnarStripeFirstRowNumberIndexRelationId(void); static Oid ColumnarOptionsRelationId(void); static Oid ColumnarOptionsIndexRegclass(void); static Oid ColumnarChunkRelationId(void); @@ -95,6 +85,8 @@ static Oid ColumnarChunkGroupRelationId(void); static Oid ColumnarChunkIndexRelationId(void); static Oid ColumnarChunkGroupIndexRelationId(void); static Oid ColumnarNamespaceId(void); +static uint64 LookupStorageId(RelFileNode relfilenode); +static uint64 GetHighestUsedFirstRowNumber(uint64 storageId); static void DeleteStorageFromColumnarMetadataTable(Oid metadataTableId, AttrNumber storageIdAtrrNumber, Oid storageIdIndexId, @@ -107,8 +99,6 @@ static void FinishModifyRelation(ModifyState *state); static EState * create_estate_for_relation(Relation rel); static bytea * DatumToBytea(Datum value, Form_pg_attribute attrForm); static Datum ByteaToDatum(bytea *bytes, Form_pg_attribute attrForm); -static ColumnarMetapage * InitMetapage(Relation relation); -static ColumnarMetapage * ReadMetapage(RelFileNode relfilenode, bool missingOk); static bool 
WriteColumnarOptions(Oid regclass, ColumnarOptions *options, bool overwrite); PG_FUNCTION_INFO_V1(columnar_relation_storageid); @@ -140,7 +130,7 @@ typedef FormData_columnar_options *Form_columnar_options; /* constants for columnar.stripe */ -#define Natts_columnar_stripe 8 +#define Natts_columnar_stripe 9 #define Anum_columnar_stripe_storageid 1 #define Anum_columnar_stripe_stripe 2 #define Anum_columnar_stripe_file_offset 3 @@ -149,6 +139,7 @@ typedef FormData_columnar_options *Form_columnar_options; #define Anum_columnar_stripe_chunk_row_count 6 #define Anum_columnar_stripe_row_count 7 #define Anum_columnar_stripe_chunk_count 8 +#define Anum_columnar_stripe_first_row_number 9 /* constants for columnar.chunk_group */ #define Natts_columnar_chunkgroup 4 @@ -423,7 +414,7 @@ SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe, StripeSkipList *chunk uint32 chunkIndex = 0; uint32 columnCount = chunkList->columnCount; - ColumnarMetapage *metapage = ReadMetapage(relfilenode, false); + uint64 storageId = LookupStorageId(relfilenode); Oid columnarChunkOid = ColumnarChunkRelationId(); Relation columnarChunk = table_open(columnarChunkOid, RowExclusiveLock); ModifyState *modifyState = StartModifyRelation(columnarChunk); @@ -436,7 +427,7 @@ SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe, StripeSkipList *chunk &chunkList->chunkSkipNodeArray[columnIndex][chunkIndex]; Datum values[Natts_columnar_chunk] = { - UInt64GetDatum(metapage->storageId), + UInt64GetDatum(storageId), Int64GetDatum(stripe), Int32GetDatum(columnIndex + 1), Int32GetDatum(chunkIndex), @@ -487,7 +478,7 @@ void SaveChunkGroups(RelFileNode relfilenode, uint64 stripe, List *chunkGroupRowCounts) { - ColumnarMetapage *metapage = ReadMetapage(relfilenode, false); + uint64 storageId = LookupStorageId(relfilenode); Oid columnarChunkGroupOid = ColumnarChunkGroupRelationId(); Relation columnarChunkGroup = table_open(columnarChunkGroupOid, RowExclusiveLock); ModifyState *modifyState = StartModifyRelation(columnarChunkGroup); @@ -499,7 +490,7 @@ SaveChunkGroups(RelFileNode relfilenode, uint64 stripe, { int64 rowCount = lfirst_int(lc); Datum values[Natts_columnar_chunkgroup] = { - UInt64GetDatum(metapage->storageId), + UInt64GetDatum(storageId), Int64GetDatum(stripe), Int32GetDatum(chunkId), Int64GetDatum(rowCount) @@ -530,14 +521,14 @@ ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, TupleDesc tupleDescri uint32 columnCount = tupleDescriptor->natts; ScanKeyData scanKey[2]; - ColumnarMetapage *metapage = ReadMetapage(relfilenode, false); + uint64 storageId = LookupStorageId(relfilenode); Oid columnarChunkOid = ColumnarChunkRelationId(); Relation columnarChunk = table_open(columnarChunkOid, AccessShareLock); Relation index = index_open(ColumnarChunkIndexRelationId(), AccessShareLock); ScanKeyInit(&scanKey[0], Anum_columnar_chunk_storageid, - BTEqualStrategyNumber, F_OIDEQ, UInt64GetDatum(metapage->storageId)); + BTEqualStrategyNumber, F_OIDEQ, UInt64GetDatum(storageId)); ScanKeyInit(&scanKey[1], Anum_columnar_chunk_stripe, BTEqualStrategyNumber, F_OIDEQ, Int32GetDatum(stripe)); @@ -624,12 +615,99 @@ ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, TupleDesc tupleDescri table_close(columnarChunk, AccessShareLock); chunkList->chunkGroupRowCounts = - ReadChunkGroupRowCounts(metapage->storageId, stripe, chunkCount); + ReadChunkGroupRowCounts(storageId, stripe, chunkCount); return chunkList; } +/* + * FindStripeByRowNumber returns StripeMetadata for the stripe that has the + * row with rowNumber by doing backward 
index scan on + * stripe_first_row_number_idx. If no such row exists, then returns NULL. + */ +StripeMetadata * +FindStripeByRowNumber(Relation relation, uint64 rowNumber, Snapshot snapshot) +{ + StripeMetadata *foundStripeMetadata = NULL; + + uint64 storageId = ColumnarStorageGetStorageId(relation, false); + ScanKeyData scanKey[2]; + ScanKeyInit(&scanKey[0], Anum_columnar_stripe_storageid, + BTEqualStrategyNumber, F_OIDEQ, Int32GetDatum(storageId)); + ScanKeyInit(&scanKey[1], Anum_columnar_stripe_first_row_number, + BTLessEqualStrategyNumber, F_INT8LE, UInt64GetDatum(rowNumber)); + + Relation columnarStripes = table_open(ColumnarStripeRelationId(), AccessShareLock); + Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(), + AccessShareLock); + SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index, + snapshot, 2, + scanKey); + + HeapTuple heapTuple = systable_getnext_ordered(scanDescriptor, BackwardScanDirection); + if (HeapTupleIsValid(heapTuple)) + { + TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes); + Datum datumArray[Natts_columnar_stripe]; + bool isNullArray[Natts_columnar_stripe]; + heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray); + + StripeMetadata *stripeMetadata = BuildStripeMetadata(datumArray); + if (rowNumber < stripeMetadata->firstRowNumber + stripeMetadata->rowCount) + { + foundStripeMetadata = stripeMetadata; + } + } + + systable_endscan_ordered(scanDescriptor); + index_close(index, AccessShareLock); + table_close(columnarStripes, AccessShareLock); + + return foundStripeMetadata; +} + + +/* + * FindStripeWithHighestRowNumber returns StripeMetadata for the stripe that + * has the row with highest rowNumber by doing backward index scan on + * stripe_first_row_number_idx. If given relation is empty, then returns NULL. + */ +StripeMetadata * +FindStripeWithHighestRowNumber(Relation relation, Snapshot snapshot) +{ + StripeMetadata *stripeWithHighestRowNumber = NULL; + + uint64 storageId = ColumnarStorageGetStorageId(relation, false); + ScanKeyData scanKey[1]; + ScanKeyInit(&scanKey[0], Anum_columnar_stripe_storageid, + BTEqualStrategyNumber, F_OIDEQ, Int32GetDatum(storageId)); + + Relation columnarStripes = table_open(ColumnarStripeRelationId(), AccessShareLock); + Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(), + AccessShareLock); + SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index, + snapshot, 1, scanKey); + + HeapTuple heapTuple = systable_getnext_ordered(scanDescriptor, BackwardScanDirection); + if (HeapTupleIsValid(heapTuple)) + { + TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes); + Datum datumArray[Natts_columnar_stripe]; + bool isNullArray[Natts_columnar_stripe]; + heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray); + + stripeWithHighestRowNumber = BuildStripeMetadata(datumArray); + } + + systable_endscan_ordered(scanDescriptor); + index_close(index, AccessShareLock); + table_close(columnarStripes, AccessShareLock); + + return stripeWithHighestRowNumber; +} + + /* * ReadChunkGroupRowCounts returns an array of row counts of chunk groups for the * given stripe. 
@@ -704,7 +782,8 @@ InsertStripeMetadataRow(uint64 storageId, StripeMetadata *stripe) Int32GetDatum(stripe->columnCount), Int32GetDatum(stripe->chunkGroupRowCount), Int64GetDatum(stripe->rowCount), - Int32GetDatum(stripe->chunkCount) + Int32GetDatum(stripe->chunkCount), + UInt64GetDatum(stripe->firstRowNumber) }; Oid columnarStripesOid = ColumnarStripeRelationId(); @@ -729,15 +808,9 @@ InsertStripeMetadataRow(uint64 storageId, StripeMetadata *stripe) List * StripesForRelfilenode(RelFileNode relfilenode) { - ColumnarMetapage *metapage = ReadMetapage(relfilenode, true); - if (metapage == NULL) - { - /* empty relation */ - return NIL; - } + uint64 storageId = LookupStorageId(relfilenode); - - return ReadDataFileStripeList(metapage->storageId, GetTransactionSnapshot()); + return ReadDataFileStripeList(storageId, GetTransactionSnapshot()); } @@ -752,17 +825,11 @@ StripesForRelfilenode(RelFileNode relfilenode) uint64 GetHighestUsedAddress(RelFileNode relfilenode) { + uint64 storageId = LookupStorageId(relfilenode); + uint64 highestUsedAddress = 0; uint64 highestUsedId = 0; - ColumnarMetapage *metapage = ReadMetapage(relfilenode, true); - - /* empty data file? */ - if (metapage == NULL) - { - return 0; - } - - GetHighestUsedAddressAndId(metapage->storageId, &highestUsedAddress, &highestUsedId); + GetHighestUsedAddressAndId(storageId, &highestUsedAddress, &highestUsedId); return highestUsedAddress; } @@ -799,35 +866,6 @@ GetHighestUsedAddressAndId(uint64 storageId, } -/* - * LockForStripeReservation acquires a lock for stripe reservation. - */ -static void -LockForStripeReservation(Relation rel, LOCKMODE mode) -{ - /* - * We use an advisory lock here so we can easily detect these kind of - * locks in IsProcessWaitingForSafeOperations() and don't include them - * in the lock graph. - */ - LOCKTAG tag; - SET_LOCKTAG_COLUMNAR_STRIPE_RESERVATION(tag, rel); - LockAcquire(&tag, mode, false, false); -} - - -/* - * UnlockForStripeReservation releases the stripe reservation lock. - */ -static void -UnlockForStripeReservation(Relation rel, LOCKMODE mode) -{ - LOCKTAG tag; - SET_LOCKTAG_COLUMNAR_STRIPE_RESERVATION(tag, rel); - LockRelease(&tag, mode, false); -} - - /* * ReserveStripe reserves and stripe of given size for the given relation, * and inserts it into columnar.stripe. It is guaranteed that concurrent @@ -836,50 +874,15 @@ UnlockForStripeReservation(Relation rel, LOCKMODE mode) StripeMetadata ReserveStripe(Relation rel, uint64 sizeBytes, uint64 rowCount, uint64 columnCount, - uint64 chunkCount, uint64 chunkGroupRowCount) + uint64 chunkCount, uint64 chunkGroupRowCount, + uint64 stripeFirstRowNumber) { StripeMetadata stripe = { 0 }; - uint64 currLogicalHigh = 0; - uint64 highestId = 0; - /* - * We take ExclusiveLock here, so two space reservations conflict. - */ - LOCKMODE lockMode = ExclusiveLock; - LockForStripeReservation(rel, lockMode); + uint64 storageId = ColumnarStorageGetStorageId(rel, false); - RelFileNode relfilenode = rel->rd_node; - - /* - * If this is the first stripe for this relation, initialize the - * metapage, otherwise use the previously initialized metapage. 
- */ - ColumnarMetapage *metapage = ReadMetapage(relfilenode, true); - if (metapage == NULL) - { - metapage = InitMetapage(rel); - } - - GetHighestUsedAddressAndId(metapage->storageId, &currLogicalHigh, &highestId); - SmgrAddr currSmgrHigh = logical_to_smgr(currLogicalHigh); - - SmgrAddr resSmgrStart = next_block_start(currSmgrHigh); - uint64 resLogicalStart = smgr_to_logical(resSmgrStart); - - uint64 resLogicalEnd = resLogicalStart + sizeBytes - 1; - SmgrAddr resSmgrEnd = logical_to_smgr(resLogicalEnd); - - RelationOpenSmgr(rel); - uint64 nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); - - while (resSmgrEnd.blockno >= nblocks) - { - Buffer newBuffer = ReadBuffer(rel, P_NEW); - ReleaseBuffer(newBuffer); - nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); - } - - RelationCloseSmgr(rel); + uint64 stripeId = ColumnarStorageReserveStripe(rel); + uint64 resLogicalStart = ColumnarStorageReserveData(rel, sizeBytes); stripe.fileOffset = resLogicalStart; stripe.dataLength = sizeBytes; @@ -887,11 +890,10 @@ ReserveStripe(Relation rel, uint64 sizeBytes, stripe.chunkGroupRowCount = chunkGroupRowCount; stripe.columnCount = columnCount; stripe.rowCount = rowCount; - stripe.id = highestId + 1; + stripe.id = stripeId; + stripe.firstRowNumber = stripeFirstRowNumber; - InsertStripeMetadataRow(metapage->storageId, &stripe); - - UnlockForStripeReservation(rel, lockMode); + InsertStripeMetadataRow(storageId, &stripe); return stripe; } @@ -914,7 +916,8 @@ ReadDataFileStripeList(uint64 storageId, Snapshot snapshot) Oid columnarStripesOid = ColumnarStripeRelationId(); Relation columnarStripes = table_open(columnarStripesOid, AccessShareLock); - Relation index = index_open(ColumnarStripeIndexRelationId(), AccessShareLock); + Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(), + AccessShareLock); TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes); SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index, @@ -927,22 +930,7 @@ ReadDataFileStripeList(uint64 storageId, Snapshot snapshot) bool isNullArray[Natts_columnar_stripe]; heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray); - - StripeMetadata *stripeMetadata = palloc0(sizeof(StripeMetadata)); - stripeMetadata->id = DatumGetInt64(datumArray[Anum_columnar_stripe_stripe - 1]); - stripeMetadata->fileOffset = DatumGetInt64( - datumArray[Anum_columnar_stripe_file_offset - 1]); - stripeMetadata->dataLength = DatumGetInt64( - datumArray[Anum_columnar_stripe_data_length - 1]); - stripeMetadata->columnCount = DatumGetInt32( - datumArray[Anum_columnar_stripe_column_count - 1]); - stripeMetadata->chunkCount = DatumGetInt32( - datumArray[Anum_columnar_stripe_chunk_count - 1]); - stripeMetadata->chunkGroupRowCount = DatumGetInt32( - datumArray[Anum_columnar_stripe_chunk_row_count - 1]); - stripeMetadata->rowCount = DatumGetInt64( - datumArray[Anum_columnar_stripe_row_count - 1]); - + StripeMetadata *stripeMetadata = BuildStripeMetadata(datumArray); stripeMetadataList = lappend(stripeMetadataList, stripeMetadata); } @@ -954,6 +942,32 @@ ReadDataFileStripeList(uint64 storageId, Snapshot snapshot) } +/* + * BuildStripeMetadata builds a StripeMetadata object from given datumArray. 
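 + * The datum array is expected to be deformed from a columnar.stripe heap + * tuple, so it must contain Natts_columnar_stripe values in catalog column + * order.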
+ */ +static StripeMetadata * +BuildStripeMetadata(Datum *datumArray) +{ + StripeMetadata *stripeMetadata = palloc0(sizeof(StripeMetadata)); + stripeMetadata->id = DatumGetInt64(datumArray[Anum_columnar_stripe_stripe - 1]); + stripeMetadata->fileOffset = DatumGetInt64( + datumArray[Anum_columnar_stripe_file_offset - 1]); + stripeMetadata->dataLength = DatumGetInt64( + datumArray[Anum_columnar_stripe_data_length - 1]); + stripeMetadata->columnCount = DatumGetInt32( + datumArray[Anum_columnar_stripe_column_count - 1]); + stripeMetadata->chunkCount = DatumGetInt32( + datumArray[Anum_columnar_stripe_chunk_count - 1]); + stripeMetadata->chunkGroupRowCount = DatumGetInt32( + datumArray[Anum_columnar_stripe_chunk_row_count - 1]); + stripeMetadata->rowCount = DatumGetInt64( + datumArray[Anum_columnar_stripe_row_count - 1]); + stripeMetadata->firstRowNumber = DatumGetUInt64( + datumArray[Anum_columnar_stripe_first_row_number - 1]); + return stripeMetadata; +} + + /* * DeleteMetadataRows removes the rows with given relfilenode from columnar * metadata tables. @@ -970,28 +984,20 @@ DeleteMetadataRows(RelFileNode relfilenode) return; } - ColumnarMetapage *metapage = ReadMetapage(relfilenode, true); - if (metapage == NULL) - { - /* - * No data has been written to this storage yet, so there is no - * associated metadata yet. - */ - return; - } + uint64 storageId = LookupStorageId(relfilenode); DeleteStorageFromColumnarMetadataTable(ColumnarStripeRelationId(), Anum_columnar_stripe_storageid, - ColumnarStripeIndexRelationId(), - metapage->storageId); + ColumnarStripePKeyIndexRelationId(), + storageId); DeleteStorageFromColumnarMetadataTable(ColumnarChunkGroupRelationId(), Anum_columnar_chunkgroup_storageid, ColumnarChunkGroupIndexRelationId(), - metapage->storageId); + storageId); DeleteStorageFromColumnarMetadataTable(ColumnarChunkRelationId(), Anum_columnar_chunk_storageid, ColumnarChunkIndexRelationId(), - metapage->storageId); + storageId); } @@ -1226,16 +1232,28 @@ ColumnarStripeRelationId(void) /* - * ColumnarStripeIndexRelationId returns relation id of columnar.stripe_pkey. + * ColumnarStripePKeyIndexRelationId returns relation id of columnar.stripe_pkey. * TODO: should we cache this similar to citus? */ static Oid -ColumnarStripeIndexRelationId(void) +ColumnarStripePKeyIndexRelationId(void) { return get_relname_relid("stripe_pkey", ColumnarNamespaceId()); } +/* + * ColumnarStripeFirstRowNumberIndexRelationId returns relation id of + * columnar.stripe_first_row_number_idx. + * TODO: should we cache this similar to citus? + */ +static Oid +ColumnarStripeFirstRowNumberIndexRelationId(void) +{ + return get_relname_relid("stripe_first_row_number_idx", ColumnarNamespaceId()); +} + + /* * ColumnarOptionsRelationId returns relation id of columnar.options. */ @@ -1312,75 +1330,31 @@ ColumnarNamespaceId(void) /* - * ReadMetapage reads metapage for the given relfilenode. It returns - * false if the relation doesn't have a meta page yet. + * LookupStorageId reads the storage metapage to find the storage ID for the + * given relfilenode. Errors out if the metapage has not been created yet. 
*/ -static ColumnarMetapage * -ReadMetapage(RelFileNode relfilenode, bool missingOk) +static uint64 +LookupStorageId(RelFileNode relfilenode) { - StringInfo metapageBuffer = NULL; Oid relationId = RelidByRelfilenode(relfilenode.spcNode, relfilenode.relNode); - if (OidIsValid(relationId)) - { - Relation relation = relation_open(relationId, NoLock); - RelationOpenSmgr(relation); - int nblocks = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM); - RelationCloseSmgr(relation); + Relation relation = relation_open(relationId, AccessShareLock); + uint64 storageId = ColumnarStorageGetStorageId(relation, false); + table_close(relation, AccessShareLock); - if (nblocks != 0) - { - metapageBuffer = ReadFromSmgr(relation, 0, sizeof(ColumnarMetapage)); - } - - relation_close(relation, NoLock); - } - - if (metapageBuffer == NULL) - { - if (!missingOk) - { - elog(ERROR, "columnar metapage was not found"); - } - - return NULL; - } - - ColumnarMetapage *metapage = palloc0(sizeof(ColumnarMetapage)); - memcpy_s((void *) metapage, sizeof(ColumnarMetapage), - metapageBuffer->data, sizeof(ColumnarMetapage)); - - return metapage; + return storageId; } /* - * InitMetapage initializes metapage for the given relation. + * ColumnarMetadataNewStorageId - create a new, unique storage id and return + * it. */ -static ColumnarMetapage * -InitMetapage(Relation relation) +uint64 +ColumnarMetadataNewStorageId() { - /* - * If we init metapage during upgrade, we might override the - * pre-upgrade storage id which will render pre-upgrade data - * invisible. - */ - Assert(!IsBinaryUpgrade); - ColumnarMetapage *metapage = palloc0(sizeof(ColumnarMetapage)); - - metapage->storageId = nextval_internal(ColumnarStorageIdSequenceRelationId(), false); - metapage->versionMajor = COLUMNAR_VERSION_MAJOR; - metapage->versionMinor = COLUMNAR_VERSION_MINOR; - - /* create the first block */ - Buffer newBuffer = ReadBuffer(relation, P_NEW); - ReleaseBuffer(newBuffer); - - Assert(sizeof(ColumnarMetapage) <= BLCKSZ - SizeOfPageHeaderData); - WriteToSmgr(relation, 0, (char *) metapage, sizeof(ColumnarMetapage)); - - return metapage; + return nextval_internal(ColumnarStorageIdSequenceRelationId(), false); } @@ -1391,20 +1365,85 @@ InitMetapage(Relation relation) Datum columnar_relation_storageid(PG_FUNCTION_ARGS) { - uint64 storageId = -1; - Oid relationId = PG_GETARG_OID(0); Relation relation = relation_open(relationId, AccessShareLock); - if (IsColumnarTableAmTable(relationId)) + if (!IsColumnarTableAmTable(relationId)) { - ColumnarMetapage *metadata = ReadMetapage(relation->rd_node, true); - if (metadata != NULL) - { - storageId = metadata->storageId; - } + elog(ERROR, "relation \"%s\" is not a columnar table", + RelationGetRelationName(relation)); } + uint64 storageId = ColumnarStorageGetStorageId(relation, false); + relation_close(relation, AccessShareLock); PG_RETURN_INT64(storageId); } + + +/* + * ColumnarStorageUpdateIfNeeded - upgrade columnar storage to the current version by + * using information from the metadata tables. 
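 + * If the relation has no metapage yet (fewer than two blocks), a fresh one is + * initialized with a newly allocated storage id; otherwise the reserved stripe + * id, row number and offset are recomputed from the existing stripe metadata.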
+ */ +void +ColumnarStorageUpdateIfNeeded(Relation rel, bool isUpgrade) +{ + if (ColumnarStorageIsCurrent(rel)) + { + return; + } + + RelationOpenSmgr(rel); + BlockNumber nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); + if (nblocks < 2) + { + ColumnarStorageInit(rel->rd_smgr, ColumnarMetadataNewStorageId()); + return; + } + + uint64 storageId = ColumnarStorageGetStorageId(rel, true); + + uint64 highestId; + uint64 highestOffset; + GetHighestUsedAddressAndId(storageId, &highestOffset, &highestId); + + uint64 reservedStripeId = highestId + 1; + uint64 reservedOffset = highestOffset + 1; + uint64 reservedRowNumber = GetHighestUsedFirstRowNumber(storageId) + 1; + ColumnarStorageUpdateCurrent(rel, isUpgrade, reservedStripeId, + reservedRowNumber, reservedOffset); +} + + +/* + * GetHighestUsedFirstRowNumber returns the highest used first_row_number + * for given storageId. Returns COLUMNAR_INVALID_ROW_NUMBER if storage with + * storageId has no stripes. + * Note that normally we would use ColumnarStorageGetReservedRowNumber + * to decide that. However, this function is designed to be used when + * building the metapage itself during upgrades. + */ +static uint64 +GetHighestUsedFirstRowNumber(uint64 storageId) +{ + List *stripeMetadataList = ReadDataFileStripeList(storageId, + GetTransactionSnapshot()); + if (list_length(stripeMetadataList) == 0) + { + return COLUMNAR_INVALID_ROW_NUMBER; + } + + /* XXX: Better to have an invalid value for StripeMetadata.rowCount too */ + uint64 stripeRowCount = -1; + uint64 highestFirstRowNumber = COLUMNAR_INVALID_ROW_NUMBER; + + StripeMetadata *stripeMetadata = NULL; + foreach_ptr(stripeMetadata, stripeMetadataList) + { + highestFirstRowNumber = Max(highestFirstRowNumber, + stripeMetadata->firstRowNumber); + stripeRowCount = stripeMetadata->rowCount; + } + + return highestFirstRowNumber + stripeRowCount - 1; +} diff --git a/src/backend/columnar/columnar_reader.c b/src/backend/columnar/columnar_reader.c index a4939b763..0f1a1767f 100644 --- a/src/backend/columnar/columnar_reader.c +++ b/src/backend/columnar/columnar_reader.c @@ -34,6 +34,8 @@ #include "utils/rel.h" #include "columnar/columnar.h" +#include "columnar/columnar_storage.h" +#include "columnar/columnar_tableam.h" #include "columnar/columnar_version_compat.h" typedef struct ChunkGroupReadState @@ -84,6 +86,14 @@ struct ColumnarReadState /* static function declarations */ static MemoryContext CreateStripeReadMemoryContext(void); +static void ReadStripeRowByRowNumber(StripeReadState *stripeReadState, + StripeMetadata *stripeMetadata, + uint64 rowNumber, Datum *columnValues, + bool *columnNulls); +static void ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState, + StripeMetadata *stripeMetadata, + uint64 stripeRowOffset, Datum *columnValues, + bool *columnNulls); static bool StripeReadInProgress(ColumnarReadState *readState); static bool HasUnreadStripe(ColumnarReadState *readState); static StripeReadState * BeginStripeRead(StripeMetadata *stripeMetadata, Relation rel, @@ -194,11 +204,12 @@ CreateStripeReadMemoryContext() /* * ColumnarReadNextRow tries to read a row from the columnar table. On success, it sets - * column values and nulls, and returns true. If there are no more rows to read, - * the function returns false. + * column values, column nulls and rowNumber (if passed to be non-NULL), and returns true. + * If there are no more rows to read, the function returns false. 
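 + * The reported rowNumber is derived from the current stripe's firstRowNumber + * plus the offset of the returned row within that stripe.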
*/ bool -ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *columnNulls) +ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *columnNulls, + uint64 *rowNumber) { while (true) { @@ -226,6 +237,14 @@ ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *col continue; } + if (rowNumber) + { + StripeMetadata *stripeMetadata = list_nth(readState->stripeList, + readState->currentStripe); + *rowNumber = stripeMetadata->firstRowNumber + + readState->stripeReadState->currentRow - 1; + } + return true; } @@ -233,6 +252,104 @@ ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *col } +/* + * ColumnarReadRowByRowNumber reads row with rowNumber from given relation + * into columnValues and columnNulls, and returns true. If no such row + * exists, then returns false. + */ +bool +ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber, + List *neededColumnList, Datum *columnValues, + bool *columnNulls, Snapshot snapshot) +{ + StripeMetadata *stripeMetadata = FindStripeByRowNumber(relation, rowNumber, snapshot); + if (stripeMetadata == NULL) + { + /* no such row exists */ + return false; + } + + TupleDesc relationTupleDesc = RelationGetDescr(relation); + List *whereClauseList = NIL; + List *whereClauseVars = NIL; + MemoryContext stripeReadContext = CreateStripeReadMemoryContext(); + StripeReadState *stripeReadState = BeginStripeRead(stripeMetadata, + relation, + relationTupleDesc, + neededColumnList, + whereClauseList, + whereClauseVars, + stripeReadContext); + + ReadStripeRowByRowNumber(stripeReadState, stripeMetadata, rowNumber, + columnValues, columnNulls); + + EndStripeRead(stripeReadState); + MemoryContextReset(stripeReadContext); + + return true; +} + + +/* + * ReadStripeRowByRowNumber reads row with rowNumber from given + * stripeReadState into columnValues and columnNulls. + * Errors out if no such row exists in the stripe being read. + */ +static void +ReadStripeRowByRowNumber(StripeReadState *stripeReadState, + StripeMetadata *stripeMetadata, + uint64 rowNumber, Datum *columnValues, + bool *columnNulls) +{ + if (rowNumber < stripeMetadata->firstRowNumber) + { + /* not expected but be on the safe side */ + ereport(ERROR, (errmsg("row offset cannot be negative"))); + } + + /* find the exact chunk group to be read */ + uint64 stripeRowOffset = rowNumber - stripeMetadata->firstRowNumber; + stripeReadState->chunkGroupIndex = stripeRowOffset / + stripeMetadata->chunkGroupRowCount; + stripeReadState->chunkGroupReadState = BeginChunkGroupRead( + stripeReadState->stripeBuffers, + stripeReadState->chunkGroupIndex, + stripeReadState->tupleDescriptor, + stripeReadState->projectedColumnList, + stripeReadState->stripeReadContext); + + ReadChunkGroupRowByRowOffset(stripeReadState->chunkGroupReadState, + stripeMetadata, stripeRowOffset, + columnValues, columnNulls); + + EndChunkGroupRead(stripeReadState->chunkGroupReadState); + stripeReadState->chunkGroupReadState = NULL; +} + + +/* + * ReadChunkGroupRowByRowOffset reads row with stripeRowOffset from given + * chunkGroupReadState into columnValues and columnNulls. + * Errors out if no such row exists in the chunk group being read. 
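 + * The chunk group itself must already have been chosen by the caller (see + * ReadStripeRowByRowNumber); only the row offset within that chunk group is + * computed here.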
+ */ +static void +ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState, + StripeMetadata *stripeMetadata, + uint64 stripeRowOffset, Datum *columnValues, + bool *columnNulls) +{ + /* set the exact row number to be read from given chunk roup */ + chunkGroupReadState->currentRow = stripeRowOffset % + stripeMetadata->chunkGroupRowCount; + if (!ReadChunkGroupNextRow(chunkGroupReadState, columnValues, columnNulls)) + { + /* not expected but be on the safe side */ + ereport(ERROR, (errmsg("could not find the row in stripe"))); + } +} + + /* * StripeReadInProgress returns true if we already started reading a stripe. */ @@ -667,8 +784,12 @@ LoadColumnBuffers(Relation relation, ColumnChunkSkipNode *chunkSkipNodeArray, { ColumnChunkSkipNode *chunkSkipNode = &chunkSkipNodeArray[chunkIndex]; uint64 existsOffset = stripeOffset + chunkSkipNode->existsChunkOffset; - StringInfo rawExistsBuffer = ReadFromSmgr(relation, existsOffset, - chunkSkipNode->existsLength); + StringInfo rawExistsBuffer = makeStringInfo(); + + enlargeStringInfo(rawExistsBuffer, chunkSkipNode->existsLength); + rawExistsBuffer->len = chunkSkipNode->existsLength; + ColumnarStorageRead(relation, existsOffset, rawExistsBuffer->data, + chunkSkipNode->existsLength); chunkBuffersArray[chunkIndex]->existsBuffer = rawExistsBuffer; } @@ -679,8 +800,12 @@ LoadColumnBuffers(Relation relation, ColumnChunkSkipNode *chunkSkipNodeArray, ColumnChunkSkipNode *chunkSkipNode = &chunkSkipNodeArray[chunkIndex]; CompressionType compressionType = chunkSkipNode->valueCompressionType; uint64 valueOffset = stripeOffset + chunkSkipNode->valueChunkOffset; - StringInfo rawValueBuffer = ReadFromSmgr(relation, valueOffset, - chunkSkipNode->valueLength); + StringInfo rawValueBuffer = makeStringInfo(); + + enlargeStringInfo(rawValueBuffer, chunkSkipNode->valueLength); + rawValueBuffer->len = chunkSkipNode->valueLength; + ColumnarStorageRead(relation, valueOffset, rawValueBuffer->data, + chunkSkipNode->valueLength); chunkBuffersArray[chunkIndex]->valueBuffer = rawValueBuffer; chunkBuffersArray[chunkIndex]->valueCompressionType = compressionType; @@ -1269,30 +1394,3 @@ ColumnDefaultValue(TupleConstr *tupleConstraints, Form_pg_attribute attributeFor "does not evaluate to constant value"))); } } - - -StringInfo -ReadFromSmgr(Relation rel, uint64 offset, uint32 size) -{ - StringInfo resultBuffer = makeStringInfo(); - uint64 read = 0; - - enlargeStringInfo(resultBuffer, size); - resultBuffer->len = size; - - while (read < size) - { - SmgrAddr addr = logical_to_smgr(offset + read); - - Buffer buffer = ReadBuffer(rel, addr.blockno); - Page page = BufferGetPage(buffer); - PageHeader phdr = (PageHeader) page; - - uint32 to_read = Min(size - read, phdr->pd_upper - addr.offset); - memcpy_s(resultBuffer->data + read, size - read, page + addr.offset, to_read); - ReleaseBuffer(buffer); - read += to_read; - } - - return resultBuffer; -} diff --git a/src/backend/columnar/columnar_storage.c b/src/backend/columnar/columnar_storage.c new file mode 100644 index 000000000..aba79303f --- /dev/null +++ b/src/backend/columnar/columnar_storage.c @@ -0,0 +1,809 @@ +/*------------------------------------------------------------------------- + * + * columnar_storage.c + * + * Copyright (c) Citus Data, Inc. + * + * Low-level storage layer for columnar. + * - Translates columnar read/write operations on logical offsets into operations on pages/blocks. + * - Emits WAL. + * - Reads/writes the columnar metapage. + * - Reserves data offsets, stripe numbers, and row offsets. 
+ * - Truncation. + * + * Higher-level columnar operations deal with logical offsets and large + * contiguous buffers of data that need to be stored. But the buffer manager + * and WAL depend on formatted pages with headers, so these large buffers need + * to be written across many pages. This module translates the contiguous + * buffers into individual block reads/writes, and performs WAL when + * necessary. + * + * Storage layout: a metapage in block 0, followed by an empty page in block + * 1, followed by logical data starting at the first byte after the page + * header in block 2 (having logical offset ColumnarFirstLogicalOffset). (XXX: + * Block 1 is left empty for no particular reason. Reconsider?). A columnar + * table should always have at least 2 blocks. + * + * Reservation is done with a relation extension lock, and designed for + * concurrency, so the callers only need an ordinary lock on the + * relation. Initializing the metapage or truncating the relation require that + * the caller holds an AccessExclusiveLock. (XXX: New reservations of data are + * aligned onto a new page for no particular reason. Reconsider?). + * + *------------------------------------------------------------------------- + */ + + +#include "postgres.h" + +#include "safe_lib.h" + +#include "catalog/storage.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" + +#include "columnar/columnar.h" +#include "columnar/columnar_storage.h" + + +/* + * Content of the first page in main fork, which stores metadata at file + * level. + */ +typedef struct ColumnarMetapage +{ + /* + * Store version of file format used, so we can detect files from + * previous versions if we change file format. + */ + uint32 versionMajor; + uint32 versionMinor; + + /* + * Each of the metadata table rows are identified by a storageId. + * We store it also in the main fork so we can link metadata rows + * with data files. + */ + uint64 storageId; + + uint64 reservedStripeId; /* first unused stripe id */ + uint64 reservedRowNumber; /* first unused row number */ + uint64 reservedOffset; /* first unused byte offset */ +} ColumnarMetapage; + + +/* represents a "physical" block+offset address */ +typedef struct PhysicalAddr +{ + BlockNumber blockno; + uint32 offset; +} PhysicalAddr; + + +#define COLUMNAR_METAPAGE_BLOCKNO 0 +#define COLUMNAR_EMPTY_BLOCKNO 1 +#define COLUMNAR_INVALID_STRIPE_ID 0 +#define COLUMNAR_FIRST_STRIPE_ID 1 + + +#define OLD_METAPAGE_VERSION_HINT "Use \"VACUUM\" to upgrade the columnar table format " \ + "version or run \"ALTER EXTENSION citus UPDATE\"." + + +/* + * Map logical offsets to a physical page and offset where the data is kept. + */ +static inline PhysicalAddr +LogicalToPhysical(uint64 logicalOffset) +{ + PhysicalAddr addr; + + addr.blockno = logicalOffset / COLUMNAR_BYTES_PER_PAGE; + addr.offset = SizeOfPageHeaderData + (logicalOffset % COLUMNAR_BYTES_PER_PAGE); + + return addr; +} + + +/* + * Map a physical page and offset address to a logical address. 
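 + * This is the inverse of LogicalToPhysical; for example, logical offset 0 + * maps to the first byte after the page header of block 0, and the first + * usable byte of block 2 maps back to logical offset + * 2 * COLUMNAR_BYTES_PER_PAGE, which per the layout described above is + * ColumnarFirstLogicalOffset.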
+ */ +static inline uint64 +PhysicalToLogical(PhysicalAddr addr) +{ + return COLUMNAR_BYTES_PER_PAGE * addr.blockno + addr.offset - SizeOfPageHeaderData; +} + + +static void ColumnarOverwriteMetapage(Relation relation, + ColumnarMetapage columnarMetapage); +static ColumnarMetapage ColumnarMetapageRead(Relation rel, bool force); +static void ReadFromBlock(Relation rel, BlockNumber blockno, uint32 offset, + char *buf, uint32 len, bool force); +static void WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset, + char *buf, uint32 len, bool clear); +static uint64 AlignReservation(uint64 prevReservation); +static bool ColumnarMetapageIsCurrent(ColumnarMetapage *metapage); +static bool ColumnarMetapageIsOlder(ColumnarMetapage *metapage); +static bool ColumnarMetapageIsNewer(ColumnarMetapage *metapage); +static void ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage); + + +/* + * ColumnarStorageInit - initialize a new metapage in an empty relation + * with the given storageId. + * + * Caller must hold AccessExclusiveLock on the relation. + */ +void +ColumnarStorageInit(SMgrRelation srel, uint64 storageId) +{ + BlockNumber nblocks = smgrnblocks(srel, MAIN_FORKNUM); + + if (nblocks > 0) + { + elog(ERROR, + "attempted to initialize metapage, but %d pages already exist", + nblocks); + } + + /* create two pages */ + PGAlignedBlock block; + Page page = block.data; + + /* write metapage */ + PageInit(page, BLCKSZ, 0); + PageHeader phdr = (PageHeader) page; + + ColumnarMetapage metapage = { 0 }; + metapage.storageId = storageId; + metapage.versionMajor = COLUMNAR_VERSION_MAJOR; + metapage.versionMinor = COLUMNAR_VERSION_MINOR; + metapage.reservedStripeId = COLUMNAR_FIRST_STRIPE_ID; + metapage.reservedRowNumber = COLUMNAR_FIRST_ROW_NUMBER; + metapage.reservedOffset = ColumnarFirstLogicalOffset; + memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, + (char *) &metapage, sizeof(ColumnarMetapage)); + phdr->pd_lower += sizeof(ColumnarMetapage); + + PageSetChecksumInplace(page, COLUMNAR_METAPAGE_BLOCKNO); + smgrwrite(srel, MAIN_FORKNUM, COLUMNAR_METAPAGE_BLOCKNO, page, true); + log_newpage(&srel->smgr_rnode.node, MAIN_FORKNUM, + COLUMNAR_METAPAGE_BLOCKNO, page, true); + + /* write empty page */ + PageInit(page, BLCKSZ, 0); + + PageSetChecksumInplace(page, COLUMNAR_EMPTY_BLOCKNO); + smgrwrite(srel, MAIN_FORKNUM, COLUMNAR_EMPTY_BLOCKNO, page, true); + log_newpage(&srel->smgr_rnode.node, MAIN_FORKNUM, + COLUMNAR_EMPTY_BLOCKNO, page, true); + + /* + * An immediate sync is required even if we xlog'd the page, because the + * write did not go through shared_buffers and therefore a concurrent + * checkpoint may have moved the redo pointer past our xlog record. + */ + smgrimmedsync(srel, MAIN_FORKNUM); +} + + +/* + * ColumnarStorageUpdateCurrent - update the metapage to the current + * version. No effect if the version already matches. If 'upgrade' is true, + * throw an error if metapage version is newer; if 'upgrade' is false, it's a + * downgrade, so throw an error if the metapage version is older. + * + * NB: caller must ensure that metapage already exists, which might not be the + * case on 10.0. 
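 + * The given reservedStripeId, reservedRowNumber and reservedOffset are written + * to the metapage as-is; see ColumnarStorageUpdateIfNeeded for how they are + * derived from the metadata tables.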
+ */ +void +ColumnarStorageUpdateCurrent(Relation rel, bool upgrade, uint64 reservedStripeId, + uint64 reservedRowNumber, uint64 reservedOffset) +{ + LockRelationForExtension(rel, ExclusiveLock); + + ColumnarMetapage metapage = ColumnarMetapageRead(rel, true); + + if (ColumnarMetapageIsCurrent(&metapage)) + { + /* nothing to do */ + return; + } + + if (upgrade && ColumnarMetapageIsNewer(&metapage)) + { + elog(ERROR, "found newer columnar metapage while upgrading"); + } + + if (!upgrade && ColumnarMetapageIsOlder(&metapage)) + { + elog(ERROR, "found older columnar metapage while downgrading"); + } + + metapage.versionMajor = COLUMNAR_VERSION_MAJOR; + metapage.versionMinor = COLUMNAR_VERSION_MINOR; + + /* storageId remains the same */ + metapage.reservedStripeId = reservedStripeId; + metapage.reservedRowNumber = reservedRowNumber; + metapage.reservedOffset = reservedOffset; + + ColumnarOverwriteMetapage(rel, metapage); + + UnlockRelationForExtension(rel, ExclusiveLock); +} + + +/* + * ColumnarStorageGetVersionMajor - return major version from the metapage. + * + * Throw an error if the metapage is not the current version, unless + * 'force' is true. + */ +uint64 +ColumnarStorageGetVersionMajor(Relation rel, bool force) +{ + ColumnarMetapage metapage = ColumnarMetapageRead(rel, force); + + return metapage.versionMajor; +} + + +/* + * ColumnarStorageGetVersionMinor - return minor version from the metapage. + * + * Throw an error if the metapage is not the current version, unless + * 'force' is true. + */ +uint64 +ColumnarStorageGetVersionMinor(Relation rel, bool force) +{ + ColumnarMetapage metapage = ColumnarMetapageRead(rel, force); + + return metapage.versionMinor; +} + + +/* + * ColumnarStorageGetStorageId - return storage ID from the metapage. + * + * Throw an error if the metapage is not the current version, unless + * 'force' is true. + */ +uint64 +ColumnarStorageGetStorageId(Relation rel, bool force) +{ + ColumnarMetapage metapage = ColumnarMetapageRead(rel, force); + + return metapage.storageId; +} + + +/* + * ColumnarStorageGetReservedStripeId - return reserved stripe ID from the + * metapage. + * + * Throw an error if the metapage is not the current version, unless + * 'force' is true. + */ +uint64 +ColumnarStorageGetReservedStripeId(Relation rel, bool force) +{ + ColumnarMetapage metapage = ColumnarMetapageRead(rel, force); + + return metapage.reservedStripeId; +} + + +/* + * ColumnarStorageGetReservedRowNumber - return reserved row number from the + * metapage. + * + * Throw an error if the metapage is not the current version, unless + * 'force' is true. + */ +uint64 +ColumnarStorageGetReservedRowNumber(Relation rel, bool force) +{ + ColumnarMetapage metapage = ColumnarMetapageRead(rel, force); + + return metapage.reservedRowNumber; +} + + +/* + * ColumnarStorageGetReservedOffset - return reserved offset from the metapage. + * + * Throw an error if the metapage is not the current version, unless + * 'force' is true. + */ +uint64 +ColumnarStorageGetReservedOffset(Relation rel, bool force) +{ + ColumnarMetapage metapage = ColumnarMetapageRead(rel, force); + + return metapage.reservedOffset; +} + + +/* + * ColumnarStorageIsCurrent - return true if the metapage exists and is at + * the current version. 
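 + * Unlike the getters above, this never errors out on an older metapage; it + * simply reports whether an upgrade is needed.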
+ */ +bool +ColumnarStorageIsCurrent(Relation rel) +{ + RelationOpenSmgr(rel); + BlockNumber nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); + + if (nblocks < 2) + { + return false; + } + + ColumnarMetapage metapage = ColumnarMetapageRead(rel, true); + return ColumnarMetapageIsCurrent(&metapage); +} + + +/* + * ColumnarStorageReserveRowNumber returns reservedRowNumber and advances + * it for next row number reservation. + */ +uint64 +ColumnarStorageReserveRowNumber(Relation rel, uint64 nrows) +{ + LockRelationForExtension(rel, ExclusiveLock); + + ColumnarMetapage metapage = ColumnarMetapageRead(rel, false); + + uint64 firstRowNumber = metapage.reservedRowNumber; + metapage.reservedRowNumber += nrows; + + ColumnarOverwriteMetapage(rel, metapage); + + UnlockRelationForExtension(rel, ExclusiveLock); + + return firstRowNumber; +} + + +/* + * ColumnarStorageReserveStripe returns stripeId and advances it for next + * stripeId reservation. + * Note that this function doesn't handle row number reservation. + * This is because, unlike stripeId reservation, we immediately reserve + * row number during writes, not when flushing stripes to disk. + * See ColumnarStorageReserveRowNumber function. + */ +uint64 +ColumnarStorageReserveStripe(Relation rel) +{ + LockRelationForExtension(rel, ExclusiveLock); + + ColumnarMetapage metapage = ColumnarMetapageRead(rel, false); + + uint64 stripeId = metapage.reservedStripeId; + metapage.reservedStripeId++; + + ColumnarOverwriteMetapage(rel, metapage); + + UnlockRelationForExtension(rel, ExclusiveLock); + + return stripeId; +} + + +/* + * ColumnarStorageReserveData - reserve logical data offsets for writing. + */ +uint64 +ColumnarStorageReserveData(Relation rel, uint64 amount) +{ + if (amount == 0) + { + return ColumnarInvalidLogicalOffset; + } + + LockRelationForExtension(rel, ExclusiveLock); + + ColumnarMetapage metapage = ColumnarMetapageRead(rel, false); + + uint64 alignedReservation = AlignReservation(metapage.reservedOffset); + uint64 nextReservation = alignedReservation + amount; + metapage.reservedOffset = nextReservation; + + /* write new reservation */ + ColumnarOverwriteMetapage(rel, metapage); + + /* last used PhysicalAddr of new reservation */ + PhysicalAddr final = LogicalToPhysical(nextReservation - 1); + + /* extend with new pages */ + RelationOpenSmgr(rel); + BlockNumber nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); + + while (nblocks <= final.blockno) + { + Buffer newBuffer = ReadBuffer(rel, P_NEW); + Assert(BufferGetBlockNumber(newBuffer) == nblocks); + ReleaseBuffer(newBuffer); + nblocks++; + } + + UnlockRelationForExtension(rel, ExclusiveLock); + + return alignedReservation; +} + + +/* + * ColumnarStorageRead - map the logical offset to a block and offset, then + * read the buffer from multiple blocks if necessary. 
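 + * The destination buffer must be at least 'amount' bytes long; reads that + * span page boundaries are split into per-block ReadFromBlock calls.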
+ */ +void +ColumnarStorageRead(Relation rel, uint64 logicalOffset, char *data, uint32 amount) +{ + /* if there's no work to do, succeed even with invalid offset */ + if (amount == 0) + { + return; + } + + if (!ColumnarLogicalOffsetIsValid(logicalOffset)) + { + elog(ERROR, + "attempted columnar read on relation %d from invalid logical offset: " + UINT64_FORMAT, + rel->rd_id, logicalOffset); + } + + uint64 read = 0; + + while (read < amount) + { + PhysicalAddr addr = LogicalToPhysical(logicalOffset + read); + + uint32 to_read = Min(amount - read, BLCKSZ - addr.offset); + ReadFromBlock(rel, addr.blockno, addr.offset, data + read, to_read, + false); + + read += to_read; + } +} + + +/* + * ColumnarStorageWrite - map the logical offset to a block and offset, then + * write the buffer across multiple blocks if necessary. + */ +void +ColumnarStorageWrite(Relation rel, uint64 logicalOffset, char *data, uint32 amount) +{ + /* if there's no work to do, succeed even with invalid offset */ + if (amount == 0) + { + return; + } + + if (!ColumnarLogicalOffsetIsValid(logicalOffset)) + { + elog(ERROR, + "attempted columnar write on relation %d to invalid logical offset: " + UINT64_FORMAT, + rel->rd_id, logicalOffset); + } + + uint64 written = 0; + + while (written < amount) + { + PhysicalAddr addr = LogicalToPhysical(logicalOffset + written); + + uint64 to_write = Min(amount - written, BLCKSZ - addr.offset); + WriteToBlock(rel, addr.blockno, addr.offset, data + written, to_write, + false); + + written += to_write; + } +} + + +/* + * ColumnarStorageTruncate - truncate the columnar storage such that + * newDataReservation will be the first unused logical offset available. Free + * pages at the end of the relation. + * + * Caller must hold AccessExclusiveLock on the relation. + * + * Returns true if pages were truncated; false otherwise. + */ +bool +ColumnarStorageTruncate(Relation rel, uint64 newDataReservation) +{ + if (!ColumnarLogicalOffsetIsValid(newDataReservation)) + { + elog(ERROR, + "attempted to truncate relation %d to invalid logical offset: " UINT64_FORMAT, + rel->rd_id, newDataReservation); + } + + RelationOpenSmgr(rel); + BlockNumber old_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); + if (old_rel_pages == 0) + { + /* nothing to do */ + return false; + } + + LockRelationForExtension(rel, ExclusiveLock); + + ColumnarMetapage metapage = ColumnarMetapageRead(rel, false); + + if (metapage.reservedOffset < newDataReservation) + { + elog(ERROR, + "attempted to truncate relation %d to offset " UINT64_FORMAT \ + " which is higher than existing offset " UINT64_FORMAT, + rel->rd_id, newDataReservation, metapage.reservedOffset); + } + + if (metapage.reservedOffset == newDataReservation) + { + /* nothing to do */ + UnlockRelationForExtension(rel, ExclusiveLock); + return false; + } + + metapage.reservedOffset = newDataReservation; + + /* write new reservation */ + ColumnarOverwriteMetapage(rel, metapage); + + UnlockRelationForExtension(rel, ExclusiveLock); + + PhysicalAddr final = LogicalToPhysical(newDataReservation - 1); + BlockNumber new_rel_pages = final.blockno + 1; + Assert(new_rel_pages <= old_rel_pages); + + /* + * Truncate the storage. Note that RelationTruncate() takes care of + * Write Ahead Logging. + */ + if (new_rel_pages < old_rel_pages) + { + RelationTruncate(rel, new_rel_pages); + return true; + } + + return false; +} + + +/* + * ColumnarOverwriteMetapage writes given columnarMetapage back to metapage + * for given relation. 
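 + * The metapage block is reinitialized on every write (clear = true), so the + * whole struct is always written starting right after the page header.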
+ */ +static void +ColumnarOverwriteMetapage(Relation relation, ColumnarMetapage columnarMetapage) +{ + /* clear metapage because we are overwriting */ + bool clear = true; + WriteToBlock(relation, COLUMNAR_METAPAGE_BLOCKNO, SizeOfPageHeaderData, + (char *) &columnarMetapage, sizeof(ColumnarMetapage), clear); +} + + +/* + * ColumnarMetapageRead - read the current contents of the metapage. Error if + * it does not exist. Throw an error if the metapage is not the current + * version, unless 'force' is true. + * + * NB: it's safe to read a different version of a metapage because we + * guarantee that fields will only be added and existing fields will never be + * changed. However, it's important that we don't depend on new fields being + * set properly when we read an old metapage; an old metapage should only be + * read for the purposes of upgrading or error checking. + */ +static ColumnarMetapage +ColumnarMetapageRead(Relation rel, bool force) +{ + RelationOpenSmgr(rel); + BlockNumber nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); + if (nblocks == 0) + { + /* + * We only expect this to happen when upgrading citus.so. This is because, + * in current version of columnar, we immediately create the metapage + * for columnar tables, i.e right after creating the table. + * However in older versions, we were creating metapages lazily, i.e + * when ingesting data to columnar table. + */ + ereport(ERROR, (errmsg("columnar metapage for relation \"%s\" does not exist", + RelationGetRelationName(rel)), + errhint(OLD_METAPAGE_VERSION_HINT))); + } + + /* + * Regardless of "force" parameter, always force read metapage block. + * We will check metapage version in ColumnarMetapageCheckVersion + * depending on "force". + */ + bool forceReadBlock = true; + ColumnarMetapage metapage; + ReadFromBlock(rel, COLUMNAR_METAPAGE_BLOCKNO, SizeOfPageHeaderData, + (char *) &metapage, sizeof(ColumnarMetapage), forceReadBlock); + + if (!force) + { + ColumnarMetapageCheckVersion(rel, &metapage); + } + + return metapage; +} + + +/* + * ReadFromBlock - read bytes from a page at the given offset. If 'force' is + * true, don't check pd_lower; useful when reading a metapage of unknown + * version. + */ +static void +ReadFromBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf, + uint32 len, bool force) +{ + Buffer buffer = ReadBuffer(rel, blockno); + Page page = BufferGetPage(buffer); + PageHeader phdr = (PageHeader) page; + + if (BLCKSZ < offset + len || (!force && (phdr->pd_lower < offset + len))) + { + elog(ERROR, + "attempt to read columnar data of length %d from offset %d of block %d of relation %d", + len, offset, blockno, rel->rd_id); + } + + memcpy_s(buf, len, page + offset, len); + ReleaseBuffer(buffer); +} + + +/* + * WriteToBlock - append data to a block, initializing if necessary, and emit + * WAL. If 'clear' is true, always clear the data on the page and reinitialize + * it first, and offset must be SizeOfPageHeaderData. Otherwise, offset must + * be equal to pd_lower and pd_lower will be set to the end of the written + * data. 
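 + * When the relation needs WAL, the write is logged as a forced full-page + * image of the buffer.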
+ */ +static void +WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf, + uint32 len, bool clear) +{ + Buffer buffer = ReadBuffer(rel, blockno); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + Page page = BufferGetPage(buffer); + PageHeader phdr = (PageHeader) page; + if (PageIsNew(page) || clear) + { + PageInit(page, BLCKSZ, 0); + } + + if (phdr->pd_lower != offset || phdr->pd_upper - offset < len) + { + elog(ERROR, + "attempt to write columnar data of length %d to offset %d of block %d of relation %d", + len, offset, blockno, rel->rd_id); + } + + START_CRIT_SECTION(); + + memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, buf, len); + phdr->pd_lower += len; + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(rel)) + { + XLogBeginInsert(); + + /* + * Since columnar will mostly write whole pages we force the transmission of the + * whole image in the buffer + */ + XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE); + + XLogRecPtr recptr = XLogInsert(RM_GENERIC_ID, 0); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buffer); +} + + +/* + * AlignReservation - given an unused logical byte offset, align it so that it + * falls at the start of a page. + * + * XXX: Reconsider whether we want/need to do this at all. + */ +static uint64 +AlignReservation(uint64 prevReservation) +{ + PhysicalAddr prevAddr = LogicalToPhysical(prevReservation); + uint64 alignedReservation = prevReservation; + + if (prevAddr.offset != SizeOfPageHeaderData) + { + /* not aligned; align on beginning of next page */ + PhysicalAddr initial = { 0 }; + initial.blockno = prevAddr.blockno + 1; + initial.offset = SizeOfPageHeaderData; + alignedReservation = PhysicalToLogical(initial); + } + + Assert(alignedReservation >= prevReservation); + return alignedReservation; +} + + +/* + * ColumnarMetapageIsCurrent - is the metapage at the latest version? + */ +static bool +ColumnarMetapageIsCurrent(ColumnarMetapage *metapage) +{ + return (metapage->versionMajor == COLUMNAR_VERSION_MAJOR && + metapage->versionMinor == COLUMNAR_VERSION_MINOR); +} + + +/* + * ColumnarMetapageIsOlder - is the metapage older than the current version? + */ +static bool +ColumnarMetapageIsOlder(ColumnarMetapage *metapage) +{ + return (metapage->versionMajor < COLUMNAR_VERSION_MAJOR || + (metapage->versionMajor == COLUMNAR_VERSION_MAJOR && + (int) metapage->versionMinor < (int) COLUMNAR_VERSION_MINOR)); +} + + +/* + * ColumnarMetapageIsNewer - is the metapage newer than the current version? + */ +static bool +ColumnarMetapageIsNewer(ColumnarMetapage *metapage) +{ + return (metapage->versionMajor > COLUMNAR_VERSION_MAJOR || + (metapage->versionMajor == COLUMNAR_VERSION_MAJOR && + metapage->versionMinor > COLUMNAR_VERSION_MINOR)); +} + + +/* + * ColumnarMetapageCheckVersion - throw an error if accessing old + * version of metapage. 
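
A minimal standalone sketch of the alignment rule in AlignReservation, assuming the same 8 kB block and 24-byte page header defaults as the earlier sketch: a reservation that already falls on the start of a page's usable area is kept, anything else is bumped to the first usable byte of the next page.

#include <stdio.h>
#include <stdint.h>

#define BLCKSZ 8192
#define PAGE_HEADER_SIZE 24
#define BYTES_PER_PAGE (BLCKSZ - PAGE_HEADER_SIZE)

static uint64_t
align_reservation(uint64_t prevReservation)
{
	/* logical offsets that are multiples of the usable page size start a page */
	if (prevReservation % BYTES_PER_PAGE == 0)
	{
		return prevReservation;
	}

	/* otherwise round up to the first usable byte of the next page */
	return (prevReservation / BYTES_PER_PAGE + 1) * BYTES_PER_PAGE;
}

int
main(void)
{
	uint64_t aligned = 2 * BYTES_PER_PAGE;			/* 16336, already page-aligned */
	uint64_t unaligned = 2 * BYTES_PER_PAGE + 64;	/* 16400, mid-page */

	printf("%llu -> %llu\n", (unsigned long long) aligned,
		   (unsigned long long) align_reservation(aligned));	/* 16336 -> 16336 */
	printf("%llu -> %llu\n", (unsigned long long) unaligned,
		   (unsigned long long) align_reservation(unaligned));	/* 16400 -> 24504 */
	return 0;
}
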
+ */ +static void +ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage) +{ + if (!ColumnarMetapageIsCurrent(metapage)) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg( + "attempted to access relation \"%s\", which uses an older columnar format", + RelationGetRelationName(rel)), + errdetail( + "Columnar format version %d.%d is required, \"%s\" has version %d.%d.", + COLUMNAR_VERSION_MAJOR, COLUMNAR_VERSION_MINOR, + RelationGetRelationName(rel), + metapage->versionMajor, metapage->versionMinor), + errhint(OLD_METAPAGE_VERSION_HINT))); + } +} diff --git a/src/backend/columnar/columnar_tableam.c b/src/backend/columnar/columnar_tableam.c index 74dba0cc6..0e2099fb5 100644 --- a/src/backend/columnar/columnar_tableam.c +++ b/src/backend/columnar/columnar_tableam.c @@ -51,10 +51,12 @@ #include "columnar/columnar.h" #include "columnar/columnar_customscan.h" +#include "columnar/columnar_storage.h" #include "columnar/columnar_tableam.h" #include "columnar/columnar_version_compat.h" #include "distributed/commands.h" #include "distributed/commands/utility_hook.h" +#include "distributed/listutils.h" #include "distributed/metadata_cache.h" /* @@ -81,12 +83,6 @@ typedef struct ColumnarScanDescData MemoryContext scanContext; Bitmapset *attr_needed; List *scanQual; - - /* - * ANALYZE requires an item pointer for sorting. We keep track of row - * number so we can construct an item pointer based on that. - */ - uint64 rowNumber; } ColumnarScanDescData; typedef struct ColumnarScanDescData *ColumnarScanDesc; @@ -115,6 +111,21 @@ static void TruncateColumnar(Relation rel, int elevel); static HeapTuple ColumnarSlotCopyHeapTuple(TupleTableSlot *slot); static void ColumnarCheckLogicalReplication(Relation rel); static Datum * detoast_values(TupleDesc tupleDesc, Datum *orig_values, bool *isnull); +static ItemPointerData row_number_to_tid(uint64 rowNumber); +static uint64 tid_to_row_number(ItemPointerData tid); +static void ErrorIfInvalidRowNumber(uint64 rowNumber); +static void ColumnarReportTotalVirtualBlocks(Relation relation, Snapshot snapshot, + int progressArrIndex); +static BlockNumber ColumnarGetNumberOfVirtualBlocks(Relation relation, Snapshot snapshot); +static ItemPointerData ColumnarGetHighestItemPointer(Relation relation, + Snapshot snapshot); +static double ColumnarReadRowsIntoIndex(TableScanDesc scan, + Relation indexRelation, + IndexInfo *indexInfo, + bool progress, + IndexBuildCallback indexCallback, + void *indexCallbackState, + EState *estate, ExprState *predicate); /* Custom tuple slot ops used for columnar. Initialized in columnar_tableam_init(). */ static TupleTableSlotOps TTSOpsColumnar; @@ -264,8 +275,9 @@ columnar_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlo ExecClearTuple(slot); + uint64 rowNumber; bool nextRowFound = ColumnarReadNextRow(scan->cs_readState, slot->tts_values, - slot->tts_isnull); + slot->tts_isnull, &rowNumber); if (!nextRowFound) { @@ -274,65 +286,130 @@ columnar_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlo ExecStoreVirtualTuple(slot); - /* - * Set slot's item pointer block & offset to non-zero. These are - * used just for sorting in acquire_sample_rows(), so rowNumber - * is good enough. See ColumnarSlotCopyHeapTuple for more info. - * - * offset is 16-bits, so use the first 15 bits for offset and - * rest as block number. 
- */ - ItemPointerSetBlockNumber(&(slot->tts_tid), scan->rowNumber / (32 * 1024) + 1); - ItemPointerSetOffsetNumber(&(slot->tts_tid), scan->rowNumber % (32 * 1024) + 1); - - scan->rowNumber++; + slot->tts_tid = row_number_to_tid(rowNumber); return true; } +/* + * row_number_to_tid maps given rowNumber to ItemPointerData. + */ +static ItemPointerData +row_number_to_tid(uint64 rowNumber) +{ + ErrorIfInvalidRowNumber(rowNumber); + + ItemPointerData tid = { 0 }; + ItemPointerSetBlockNumber(&tid, rowNumber / VALID_ITEMPOINTER_OFFSETS); + ItemPointerSetOffsetNumber(&tid, rowNumber % VALID_ITEMPOINTER_OFFSETS + + FirstOffsetNumber); + return tid; +} + + +/* + * tid_to_row_number maps given ItemPointerData to rowNumber. + */ +static uint64 +tid_to_row_number(ItemPointerData tid) +{ + uint64 rowNumber = ItemPointerGetBlockNumber(&tid) * VALID_ITEMPOINTER_OFFSETS + + ItemPointerGetOffsetNumber(&tid) - FirstOffsetNumber; + + ErrorIfInvalidRowNumber(rowNumber); + + return rowNumber; +} + + +/* + * ErrorIfInvalidRowNumber errors out if given rowNumber is invalid. + */ +static void +ErrorIfInvalidRowNumber(uint64 rowNumber) +{ + if (rowNumber == COLUMNAR_INVALID_ROW_NUMBER) + { + /* not expected but be on the safe side */ + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected row number for columnar table"))); + } + else if (rowNumber > COLUMNAR_MAX_ROW_NUMBER) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("columnar tables can't have row numbers " + "greater than " UINT64_FORMAT, + (uint64) COLUMNAR_MAX_ROW_NUMBER), + errhint("Consider using VACUUM FULL for your table"))); + } +} + + static Size columnar_parallelscan_estimate(Relation rel) { - elog(ERROR, "columnar_parallelscan_estimate not implemented"); + return sizeof(ParallelBlockTableScanDescData); } static Size columnar_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan) { - elog(ERROR, "columnar_parallelscan_initialize not implemented"); + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + + bpscan->base.phs_relid = RelationGetRelid(rel); + bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel); + bpscan->base.phs_syncscan = synchronize_seqscans && + !RelationUsesLocalBuffers(rel) && + bpscan->phs_nblocks > NBuffers / 4; + SpinLockInit(&bpscan->phs_mutex); + bpscan->phs_startblock = InvalidBlockNumber; + pg_atomic_init_u64(&bpscan->phs_nallocated, 0); + + return sizeof(ParallelBlockTableScanDescData); } static void columnar_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) { - elog(ERROR, "columnar_parallelscan_reinitialize not implemented"); + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + pg_atomic_write_u64(&bpscan->phs_nallocated, 0); } static IndexFetchTableData * columnar_index_fetch_begin(Relation rel) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + Oid relfilenode = rel->rd_node.relNode; + if (PendingWritesInUpperTransactions(relfilenode, GetCurrentSubTransactionId())) + { + /* XXX: maybe we can just flush the data and continue */ + elog(ERROR, "cannot read from index when there is unflushed data in " + "upper transactions"); + } + + FlushWriteStateForRelfilenode(relfilenode, GetCurrentSubTransactionId()); + + IndexFetchTableData *scan = palloc0(sizeof(IndexFetchTableData)); + scan->rel = rel; + return scan; } static void columnar_index_fetch_reset(IndexFetchTableData *scan) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - 
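
A minimal standalone round trip of the row number to ItemPointer mapping above, assuming the default 8 kB block size where MaxHeapTuplesPerPage is 291 and FirstOffsetNumber is 1; the TidSketch struct and the constants are local stand-ins, not the PostgreSQL definitions.

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

#define VALID_ITEMPOINTER_OFFSETS 291ULL	/* MaxHeapTuplesPerPage - FirstOffsetNumber + 1 */
#define FIRST_OFFSET_NUMBER 1ULL

typedef struct
{
	uint32_t blockNumber;
	uint16_t offsetNumber;
} TidSketch;

static TidSketch
row_number_to_tid(uint64_t rowNumber)
{
	TidSketch tid;
	tid.blockNumber = rowNumber / VALID_ITEMPOINTER_OFFSETS;
	tid.offsetNumber = rowNumber % VALID_ITEMPOINTER_OFFSETS + FIRST_OFFSET_NUMBER;
	return tid;
}

static uint64_t
tid_to_row_number(TidSketch tid)
{
	return (uint64_t) tid.blockNumber * VALID_ITEMPOINTER_OFFSETS +
		   tid.offsetNumber - FIRST_OFFSET_NUMBER;
}

int
main(void)
{
	/* offsets produced are always in [1, 291], so OffsetNumberIsValid holds */
	for (uint64_t rowNumber = 1; rowNumber <= 1000; rowNumber++)
	{
		TidSketch tid = row_number_to_tid(rowNumber);
		assert(tid.offsetNumber >= 1 && tid.offsetNumber <= 291);
		assert(tid_to_row_number(tid) == rowNumber);
	}

	TidSketch tid = row_number_to_tid(150001);
	printf("row number 150001 -> (%u, %u)\n",
		   tid.blockNumber, tid.offsetNumber);	/* (515, 137) */
	return 0;
}
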
errmsg("indexes not supported for columnar tables"))); + /* no-op */ } static void columnar_index_fetch_end(IndexFetchTableData *scan) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + columnar_index_fetch_reset(scan); + pfree(scan); } @@ -343,8 +420,37 @@ columnar_index_fetch_tuple(struct IndexFetchTableData *scan, TupleTableSlot *slot, bool *call_again, bool *all_dead) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + /* no HOT chains are possible in columnar, directly set it to false */ + *call_again = false; + + /* + * No dead tuples are possible in columnar, set it to false if it's + * passed to be non-NULL. + */ + if (all_dead) + { + *all_dead = false; + } + + ExecClearTuple(slot); + + /* we need all columns */ + int natts = scan->rel->rd_att->natts; + Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1); + TupleDesc relationTupleDesc = RelationGetDescr(scan->rel); + List *relationColumnList = NeededColumnsList(relationTupleDesc, attr_needed); + uint64 rowNumber = tid_to_row_number(*tid); + if (!ColumnarReadRowByRowNumber(scan->rel, rowNumber, relationColumnList, + slot->tts_values, slot->tts_isnull, snapshot)) + { + return false; + } + + slot->tts_tableOid = RelationGetRelid(scan->rel); + slot->tts_tid = *tid; + ExecStoreVirtualTuple(slot); + + return true; } @@ -411,7 +517,8 @@ columnar_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, Datum *values = detoast_values(slot->tts_tupleDescriptor, slot->tts_values, slot->tts_isnull); - ColumnarWriteRow(writeState, values, slot->tts_isnull); + uint64 writtenRowNumber = ColumnarWriteRow(writeState, values, slot->tts_isnull); + slot->tts_tid = row_number_to_tid(writtenRowNumber); MemoryContextSwitchTo(oldContext); MemoryContextReset(ColumnarWritePerTupleContext(writeState)); @@ -457,7 +564,10 @@ columnar_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, Datum *values = detoast_values(tupleSlot->tts_tupleDescriptor, tupleSlot->tts_values, tupleSlot->tts_isnull); - ColumnarWriteRow(writeState, values, tupleSlot->tts_isnull); + uint64 writtenRowNumber = ColumnarWriteRow(writeState, values, + tupleSlot->tts_isnull); + tupleSlot->tts_tid = row_number_to_tid(writtenRowNumber); + MemoryContextReset(ColumnarWritePerTupleContext(writeState)); } @@ -516,17 +626,24 @@ columnar_relation_set_new_filenode(Relation rel, errmsg("unlogged columnar tables are not supported"))); } - Oid oldRelfilenode = rel->rd_node.relNode; + /* + * If existing and new relfilenode are different, that means the existing + * storage was dropped and we also need to clean up the metadata and + * state. If they are equal, this is a new relation object and we don't + * need to clean anything. 
+ */ + if (rel->rd_node.relNode != newrnode->relNode) + { + MarkRelfilenodeDropped(rel->rd_node.relNode, GetCurrentSubTransactionId()); - MarkRelfilenodeDropped(oldRelfilenode, GetCurrentSubTransactionId()); - - /* delete old relfilenode metadata */ - DeleteMetadataRows(rel->rd_node); + DeleteMetadataRows(rel->rd_node); + } *freezeXid = RecentXmin; *minmulti = GetOldestMultiXactId(); SMgrRelation srel = RelationCreateStorage(*newrnode, persistence); + ColumnarStorageInit(srel, ColumnarMetadataNewStorageId()); InitColumnarOptions(rel->rd_id); smgrclose(srel); @@ -554,7 +671,9 @@ columnar_relation_nontransactional_truncate(Relation rel) */ RelationTruncate(rel, 0); - /* we will lazily initialize new metadata in first stripe reservation */ + uint64 storageId = ColumnarMetadataNewStorageId(); + RelationOpenSmgr(rel); + ColumnarStorageInit(rel->rd_smgr, storageId); } @@ -588,7 +707,8 @@ columnar_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, if (OldIndex != NULL || use_sort) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + errmsg("clustering columnar tables using indexes is " + "not supported"))); } /* @@ -619,7 +739,8 @@ columnar_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, *num_tuples = 0; - while (ColumnarReadNextRow(readState, values, nulls)) + /* we don't need to know rowNumber here */ + while (ColumnarReadNextRow(readState, values, nulls, NULL)) { ColumnarWriteRow(writeState, values, nulls); (*num_tuples)++; @@ -667,6 +788,14 @@ static void columnar_vacuum_rel(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy) { + /* + * If metapage version of relation is older, then we hint users to VACUUM + * the relation in ColumnarMetapageCheckVersion. So if needed, upgrade + * the metapage before doing anything. + */ + bool isUpgrade = true; + ColumnarStorageUpdateIfNeeded(rel, isUpgrade); + int elevel = (params->options & VACOPT_VERBOSE) ? INFO : DEBUG2; /* this should have been resolved by vacuum.c until now */ @@ -840,34 +969,25 @@ TruncateColumnar(Relation rel, int elevel) return; } - RelationOpenSmgr(rel); - BlockNumber old_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); - RelationCloseSmgr(rel); - /* * Due to the AccessExclusive lock there's no danger that * new stripes be added beyond highestPhysicalAddress while * we're truncating. */ - SmgrAddr highestPhysicalAddress = - logical_to_smgr(GetHighestUsedAddress(rel->rd_node)); + uint64 newDataReservation = Max(GetHighestUsedAddress(rel->rd_node) + 1, + ColumnarFirstLogicalOffset); - /* - * Unlock and return if truncation won't reduce data file's size. - */ - BlockNumber new_rel_pages = Min(old_rel_pages, - highestPhysicalAddress.blockno + 1); - if (new_rel_pages == old_rel_pages) + RelationOpenSmgr(rel); + BlockNumber old_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); + + if (!ColumnarStorageTruncate(rel, newDataReservation)) { UnlockRelation(rel, AccessExclusiveLock); return; } - /* - * Truncate the storage. Note that RelationTruncate() takes care of - * Write Ahead Logging. - */ - RelationTruncate(rel, new_rel_pages); + RelationOpenSmgr(rel); + BlockNumber new_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); /* * We can release the exclusive lock as soon as we have truncated. 
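
A minimal standalone sketch of how TruncateColumnar and ColumnarStorageTruncate together decide the new relation size, with hypothetical input values and the same 8 kB block and 24-byte page header assumptions as the earlier sketches.

#include <stdio.h>
#include <stdint.h>

#define BLCKSZ 8192
#define PAGE_HEADER_SIZE 24
#define BYTES_PER_PAGE (BLCKSZ - PAGE_HEADER_SIZE)
#define FIRST_LOGICAL_OFFSET ((uint64_t) 2 * BYTES_PER_PAGE)	/* 16336 */

int
main(void)
{
	uint64_t oldRelPages = 40;				/* hypothetical smgrnblocks() result */
	uint64_t highestUsedAddress = 100000;	/* hypothetical end of the last live stripe */

	/* first unused logical offset, never below the first valid offset */
	uint64_t newDataReservation = highestUsedAddress + 1;
	if (newDataReservation < FIRST_LOGICAL_OFFSET)
	{
		newDataReservation = FIRST_LOGICAL_OFFSET;
	}

	/* block holding the last reserved byte, as in ColumnarStorageTruncate */
	uint64_t newRelPages = (newDataReservation - 1) / BYTES_PER_PAGE + 1;

	if (newRelPages < oldRelPages)
	{
		/* prints: truncate from 40 to 13 pages */
		printf("truncate from %llu to %llu pages\n",
			   (unsigned long long) oldRelPages,
			   (unsigned long long) newRelPages);
	}
	else
	{
		printf("nothing to truncate\n");
	}
	return 0;
}
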
@@ -964,7 +1084,7 @@ columnar_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, static double -columnar_index_build_range_scan(Relation heapRelation, +columnar_index_build_range_scan(Relation columnarRelation, Relation indexRelation, IndexInfo *indexInfo, bool allow_sync, @@ -976,8 +1096,278 @@ columnar_index_build_range_scan(Relation heapRelation, void *callback_state, TableScanDesc scan) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + if (start_blockno != 0 || numblocks != InvalidBlockNumber) + { + /* + * Columnar utility hook already errors out for BRIN indexes on columnar + * tables, but be on the safe side. + */ + ereport(ERROR, (errmsg("BRIN indexes on columnar tables are not supported"))); + } + + if (indexInfo->ii_Concurrent) + { + /* we already don't allow CONCURRENTLY syntax but be on the safe side */ + ereport(ERROR, (errmsg("concurrent index builds are not supported " + "for columnar tables"))); + } + + if (scan) + { + /* + * Scan is initialized iff postgres decided to build the index using + * parallel workers. In this case, we simply return for parallel + * workers since we don't support parallel scan on columnar tables. + */ + if (IsBackgroundWorker) + { + ereport(DEBUG4, (errmsg("ignoring parallel worker when building " + "index since parallel scan on columnar " + "tables is not supported"))); + return 0; + } + + ereport(NOTICE, (errmsg("falling back to serial index build since " + "parallel scan on columnar tables is not " + "supported"))); + } + + /* + * In a normal index build, we use SnapshotAny to retrieve all tuples. In + * a concurrent build or during bootstrap, we take a regular MVCC snapshot + * and index whatever's live according to that. + */ + TransactionId OldestXmin = InvalidTransactionId; + + /* + * We already don't allow concurrent index builds so ii_Concurrent + * will always be false, but let's keep the code close to heapAM. + */ + if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) + { + /* ignore lazy VACUUM's */ + OldestXmin = GetOldestXmin(columnarRelation, PROCARRAY_FLAGS_VACUUM); + } + + Snapshot snapshot = { 0 }; + bool snapshotRegisteredByUs = false; + if (!scan) + { + /* + * For serial index build, we begin our own scan. We may also need to + * register a snapshot whose lifetime is under our direct control. + */ + if (!TransactionIdIsValid(OldestXmin)) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + snapshotRegisteredByUs = true; + } + else + { + snapshot = SnapshotAny; + } + + int nkeys = 0; + ScanKeyData *scanKey = NULL; + bool allowAccessStrategy = true; + scan = table_beginscan_strat(columnarRelation, snapshot, nkeys, scanKey, + allowAccessStrategy, allow_sync); + } + else + { + /* + * For parallel index build, we don't register/unregister own snapshot + * since snapshot is taken from parallel scan. Note that even if we + * don't support parallel index builds, we still continue building the + * index via the main backend and we should still rely on the snapshot + * provided by parallel scan. + */ + snapshot = scan->rs_snapshot; + } + + if (progress) + { + ColumnarReportTotalVirtualBlocks(columnarRelation, snapshot, + PROGRESS_SCAN_BLOCKS_TOTAL); + } + + /* + * Set up execution state for predicate, if any. + * Note that this is only useful for partial indexes. 
+ */ + EState *estate = CreateExecutorState(); + ExprContext *econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = table_slot_create(columnarRelation, NULL); + ExprState *predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + double reltuples = ColumnarReadRowsIntoIndex(scan, indexRelation, indexInfo, + progress, callback, callback_state, + estate, predicate); + table_endscan(scan); + + if (progress) + { + /* report the last "virtual" block as "done" */ + ColumnarReportTotalVirtualBlocks(columnarRelation, snapshot, + PROGRESS_SCAN_BLOCKS_DONE); + } + + if (snapshotRegisteredByUs) + { + UnregisterSnapshot(snapshot); + } + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(estate); + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; + + return reltuples; +} + + +/* + * ColumnarReportTotalVirtualBlocks reports progress for index build based on + * number of "virtual" blocks that given relation has. + * "progressArrIndex" argument determines which entry in st_progress_param + * array should be updated. In this case, we only expect PROGRESS_SCAN_BLOCKS_TOTAL + * or PROGRESS_SCAN_BLOCKS_DONE to specify whether we want to report calculated + * number of blocks as "done" or as "total" number of "virtual" blocks to scan. + */ +static void +ColumnarReportTotalVirtualBlocks(Relation relation, Snapshot snapshot, + int progressArrIndex) +{ + /* + * Indeed, columnar tables might have gaps between row numbers, e.g + * due to aborted transactions etc. Also, ItemPointer BlockNumber's + * for columnar tables don't actually correspond to actual disk blocks + * as in heapAM. For this reason, we call them as "virtual" blocks. At + * the moment, we believe it is better to report our progress based on + * this "virtual" block concept instead of doing nothing. + */ + Assert(progressArrIndex == PROGRESS_SCAN_BLOCKS_TOTAL || + progressArrIndex == PROGRESS_SCAN_BLOCKS_DONE); + BlockNumber nvirtualBlocks = + ColumnarGetNumberOfVirtualBlocks(relation, snapshot); + pgstat_progress_update_param(progressArrIndex, nvirtualBlocks); +} + + +/* + * ColumnarGetNumberOfVirtualBlocks returns total number of "virtual" blocks + * that given columnar table has based on based on ItemPointer BlockNumber's. + */ +static BlockNumber +ColumnarGetNumberOfVirtualBlocks(Relation relation, Snapshot snapshot) +{ + ItemPointerData highestItemPointer = + ColumnarGetHighestItemPointer(relation, snapshot); + if (!ItemPointerIsValid(&highestItemPointer)) + { + /* table is empty according to our snapshot */ + return 0; + } + + /* + * Since BlockNumber is 0-based, increment it by 1 to find the total + * number of "virtual" blocks. + */ + return ItemPointerGetBlockNumber(&highestItemPointer) + 1; +} + + +/* + * ColumnarGetHighestItemPointer returns ItemPointerData for the tuple with + * highest tid for given relation. + * If given relation is empty, then returns invalid item pointer. 
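
A minimal standalone sketch of the "virtual" block count used for progress reporting, assuming the 291 offsets-per-block mapping from the earlier sketch; the highest visible row number is a hypothetical input.

#include <stdio.h>
#include <stdint.h>

#define VALID_ITEMPOINTER_OFFSETS 291ULL

int
main(void)
{
	/* hypothetical: the last stripe starts at row number 1 and holds 150000 rows */
	uint64_t highestRowNumber = 150000;

	/* block number of the highest tid, as in row_number_to_tid */
	uint64_t highestBlockNumber = highestRowNumber / VALID_ITEMPOINTER_OFFSETS;

	/* BlockNumber is 0-based, so the scan covers highest + 1 "virtual" blocks */
	printf("virtual blocks to scan: %llu\n",
		   (unsigned long long) (highestBlockNumber + 1));	/* 516 */
	return 0;
}
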
+ */ +static ItemPointerData +ColumnarGetHighestItemPointer(Relation relation, Snapshot snapshot) +{ + StripeMetadata *stripeWithHighestRowNumber = + FindStripeWithHighestRowNumber(relation, snapshot); + if (stripeWithHighestRowNumber == NULL) + { + /* table is empty according to our snapshot */ + ItemPointerData invalidItemPtr; + ItemPointerSetInvalid(&invalidItemPtr); + return invalidItemPtr; + } + + uint64 highestRowNumber = stripeWithHighestRowNumber->firstRowNumber + + stripeWithHighestRowNumber->rowCount - 1; + return row_number_to_tid(highestRowNumber); +} + + +/* + * ColumnarReadRowsIntoIndex builds indexRelation tuples by reading the + * actual relation based on given "scan" and returns number of tuples + * scanned to build the indexRelation. + */ +static double +ColumnarReadRowsIntoIndex(TableScanDesc scan, Relation indexRelation, + IndexInfo *indexInfo, bool progress, + IndexBuildCallback indexCallback, + void *indexCallbackState, EState *estate, + ExprState *predicate) +{ + double reltuples = 0; + + BlockNumber lastReportedBlockNumber = InvalidBlockNumber; + + ExprContext *econtext = GetPerTupleExprContext(estate); + TupleTableSlot *slot = econtext->ecxt_scantuple; + while (columnar_getnextslot(scan, ForwardScanDirection, slot)) + { + CHECK_FOR_INTERRUPTS(); + + BlockNumber currentBlockNumber = ItemPointerGetBlockNumber(&slot->tts_tid); + if (progress && lastReportedBlockNumber != currentBlockNumber) + { + /* + * columnar_getnextslot guarantees that returned tuple will + * always have a greater ItemPointer than the ones we fetched + * before, so we directly use BlockNumber to report our progress. + */ + Assert(lastReportedBlockNumber == InvalidBlockNumber || + currentBlockNumber >= lastReportedBlockNumber); + pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, + currentBlockNumber); + lastReportedBlockNumber = currentBlockNumber; + } + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + if (predicate != NULL && !ExecQual(predicate, econtext)) + { + /* for partial indexes, discard tuples that don't satisfy the predicate */ + continue; + } + + Datum indexValues[INDEX_MAX_KEYS]; + bool indexNulls[INDEX_MAX_KEYS]; + FormIndexDatum(indexInfo, slot, estate, indexValues, indexNulls); + + ItemPointerData itemPointerData = slot->tts_tid; + + /* currently, columnar tables can't have dead tuples */ + bool tupleIsAlive = true; +#if PG_VERSION_NUM >= PG_VERSION_13 + indexCallback(indexRelation, &itemPointerData, indexValues, indexNulls, + tupleIsAlive, indexCallbackState); +#else + HeapTuple scanTuple = ExecCopySlotHeapTuple(slot); + scanTuple->t_self = itemPointerData; + indexCallback(indexRelation, scanTuple, indexValues, indexNulls, + tupleIsAlive, indexCallbackState); +#endif + + reltuples++; + } + + return reltuples; } @@ -988,8 +1378,15 @@ columnar_index_validate_scan(Relation heapRelation, Snapshot snapshot, ValidateIndexState *state) { + /* + * This is only called for concurrent index builds, + * see table_index_validate_scan. + * Note that we already error out for concurrent index + * builds in utility hook but be on the safe side. + */ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + errmsg("concurrent index builds are not supported for " + "columnar tables"))); } @@ -1179,13 +1576,7 @@ ColumnarSlotCopyHeapTuple(TupleTableSlot *slot) slot->tts_values, slot->tts_isnull); - /* - * We need to set item pointer, since implementation of ANALYZE - * requires it. 
See the qsort in acquire_sample_rows() and - * also compare_rows in backend/commands/analyze.c. - * - * slot->tts_tid is filled in columnar_getnextslot. - */ + /* slot->tts_tid is filled in columnar_getnextslot */ tuple->t_self = slot->tts_tid; return tuple; @@ -1315,24 +1706,34 @@ ColumnarProcessUtility(PlannedStmt *pstmt, { IndexStmt *indexStmt = (IndexStmt *) parsetree; - /* - * We should reject CREATE INDEX CONCURRENTLY before DefineIndex() is - * called. Erroring in callbacks called from DefineIndex() will create - * the index and mark it as INVALID, which will cause segfault during - * inserts. - */ - if (indexStmt->concurrent) + Relation rel = relation_openrv(indexStmt->relation, + GetCreateIndexRelationLockMode(indexStmt)); + if (rel->rd_tableam == GetColumnarTableAmRoutine()) { - Relation rel = relation_openrv(indexStmt->relation, - ShareUpdateExclusiveLock); - if (rel->rd_tableam == GetColumnarTableAmRoutine()) + /* + * We should reject CREATE INDEX CONCURRENTLY before DefineIndex() is + * called. Erroring in callbacks called from DefineIndex() will create + * the index and mark it as INVALID, which will cause segfault during + * inserts. + */ + if (indexStmt->concurrent) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + errmsg("concurrent index commands are not " + "supported for columnar tables"))); } - RelationClose(rel); + /* for now, we don't support index access methods other than btree & hash */ + if (strncmp(indexStmt->accessMethod, "btree", NAMEDATALEN) != 0 && + strncmp(indexStmt->accessMethod, "hash", NAMEDATALEN) != 0) + { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only btree and hash indexes are supported on " + "columnar tables "))); + } } + + RelationClose(rel); } PrevProcessUtilityHook(pstmt, queryString, context, @@ -1374,6 +1775,17 @@ static const TableAmRoutine columnar_am_methods = { .scan_rescan = columnar_rescan, .scan_getnextslot = columnar_getnextslot, + /* + * Postgres calls following three callbacks during index builds, if it + * decides to use parallel workers when building the index. On the other + * hand, we don't support parallel scans on columnar tables but we also + * want to fallback to serial index build. For this reason, we both skip + * parallel workers in columnar_index_build_range_scan and also provide + * basic implementations for those callbacks based on their corresponding + * implementations in heapAM. + * Note that for regular query plans, we already ignore parallel paths via + * ColumnarSetRelPathlistHook. + */ .parallelscan_estimate = columnar_parallelscan_estimate, .parallelscan_initialize = columnar_parallelscan_initialize, .parallelscan_reinitialize = columnar_parallelscan_reinitialize, @@ -1840,3 +2252,75 @@ alter_columnar_table_reset(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + + +/* + * upgrade_columnar_storage - upgrade columnar storage to the current + * version. + * + * DDL: + * CREATE OR REPLACE FUNCTION upgrade_columnar_storage(rel regclass) + * RETURNS VOID + * STRICT + * LANGUAGE c AS 'MODULE_PATHNAME', 'upgrade_columnar_storage'; + */ +PG_FUNCTION_INFO_V1(upgrade_columnar_storage); +Datum +upgrade_columnar_storage(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + + /* + * ACCESS EXCLUSIVE LOCK is not required by the low-level routines, so we + * can take only an ACCESS SHARE LOCK. But all access to non-current + * columnar tables will fail anyway, so it's better to take ACCESS + * EXLUSIVE LOCK now. 
+ */ + Relation rel = table_open(relid, AccessExclusiveLock); + if (!IsColumnarTableAmTable(relid)) + { + ereport(ERROR, (errmsg("table %s is not a columnar table", + quote_identifier(RelationGetRelationName(rel))))); + } + + ColumnarStorageUpdateIfNeeded(rel, true); + + table_close(rel, AccessExclusiveLock); + PG_RETURN_VOID(); +} + + +/* + * downgrade_columnar_storage - downgrade columnar storage to the + * current version. + * + * DDL: + * CREATE OR REPLACE FUNCTION downgrade_columnar_storage(rel regclass) + * RETURNS VOID + * STRICT + * LANGUAGE c AS 'MODULE_PATHNAME', 'downgrade_columnar_storage'; + */ +PG_FUNCTION_INFO_V1(downgrade_columnar_storage); +Datum +downgrade_columnar_storage(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + + /* + * ACCESS EXCLUSIVE LOCK is not required by the low-level routines, so we + * can take only an ACCESS SHARE LOCK. But all access to non-current + * columnar tables will fail anyway, so it's better to take ACCESS + * EXLUSIVE LOCK now. + */ + Relation rel = table_open(relid, AccessExclusiveLock); + if (!IsColumnarTableAmTable(relid)) + { + ereport(ERROR, (errmsg("table %s is not a columnar table", + quote_identifier(RelationGetRelationName(rel))))); + } + + ColumnarStorageUpdateIfNeeded(rel, false); + + table_close(rel, AccessExclusiveLock); + PG_RETURN_VOID(); +} diff --git a/src/backend/columnar/columnar_writer.c b/src/backend/columnar/columnar_writer.c index 386c4781d..cafef2381 100644 --- a/src/backend/columnar/columnar_writer.c +++ b/src/backend/columnar/columnar_writer.c @@ -30,6 +30,7 @@ #include "utils/relfilenodemap.h" #include "columnar/columnar.h" +#include "columnar/columnar_storage.h" #include "columnar/columnar_version_compat.h" struct ColumnarWriteState @@ -42,6 +43,7 @@ struct ColumnarWriteState MemoryContext perTupleContext; StripeBuffers *stripeBuffers; StripeSkipList *stripeSkipList; + uint64 stripeFirstRowNumber; ColumnarOptions options; ChunkData *chunkData; @@ -128,6 +130,7 @@ ColumnarBeginWrite(RelFileNode relfilenode, writeState->comparisonFunctionArray = comparisonFunctionArray; writeState->stripeBuffers = NULL; writeState->stripeSkipList = NULL; + writeState->stripeFirstRowNumber = COLUMNAR_INVALID_ROW_NUMBER; writeState->stripeWriteContext = stripeWriteContext; writeState->chunkData = chunkData; writeState->compressionBuffer = NULL; @@ -146,8 +149,10 @@ ColumnarBeginWrite(RelFileNode relfilenode, * corresponding skip nodes. Then, whole chunk data is compressed at every * rowChunkCount insertion. Then, if row count exceeds stripeMaxRowCount, we flush * the stripe, and add its metadata to the table footer. + * + * Returns the "row number" assigned to written row. */ -void +uint64 ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *columnNulls) { uint32 columnIndex = 0; @@ -169,6 +174,14 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu writeState->stripeSkipList = stripeSkipList; writeState->compressionBuffer = makeStringInfo(); + Oid relationId = RelidByRelfilenode(writeState->relfilenode.spcNode, + writeState->relfilenode.relNode); + Relation relation = relation_open(relationId, NoLock); + writeState->stripeFirstRowNumber = + ColumnarStorageReserveRowNumber(relation, + options->stripeRowCount); + relation_close(relation, NoLock); + /* * serializedValueBuffer lives in stripe write memory context so it needs to be * initialized when the stripe is created. 
@@ -225,6 +238,7 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu SerializeChunkData(writeState, chunkIndex, chunkRowCount); } + uint64 writtenRowNumber = writeState->stripeFirstRowNumber + stripeBuffers->rowCount; stripeBuffers->rowCount++; if (stripeBuffers->rowCount >= options->stripeRowCount) { @@ -232,6 +246,8 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu } MemoryContextSwitchTo(oldContext); + + return writtenRowNumber; } @@ -351,80 +367,6 @@ CreateEmptyStripeSkipList(uint32 stripeMaxRowCount, uint32 chunkRowCount, } -void -WriteToSmgr(Relation rel, uint64 logicalOffset, char *data, uint32 dataLength) -{ - uint64 remaining = dataLength; - Buffer buffer; - - while (remaining > 0) - { - SmgrAddr addr = logical_to_smgr(logicalOffset); - - RelationOpenSmgr(rel); - BlockNumber nblocks PG_USED_FOR_ASSERTS_ONLY = - smgrnblocks(rel->rd_smgr, MAIN_FORKNUM); - Assert(addr.blockno < nblocks); - RelationCloseSmgr(rel); - - buffer = ReadBuffer(rel, addr.blockno); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - Page page = BufferGetPage(buffer); - PageHeader phdr = (PageHeader) page; - if (PageIsNew(page)) - { - PageInit(page, BLCKSZ, 0); - } - - /* - * After a transaction has been rolled-back, we might be - * over-writing the rolledback write, so phdr->pd_lower can be - * different from addr.offset. - * - * We reset pd_lower to reset the rolledback write. - */ - if (phdr->pd_lower > addr.offset) - { - ereport(DEBUG1, (errmsg("over-writing page %u", addr.blockno), - errdetail("This can happen after a roll-back."))); - phdr->pd_lower = addr.offset; - } - Assert(phdr->pd_lower == addr.offset); - - START_CRIT_SECTION(); - - uint64 to_write = Min(phdr->pd_upper - phdr->pd_lower, remaining); - memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, data, to_write); - phdr->pd_lower += to_write; - - MarkBufferDirty(buffer); - - if (RelationNeedsWAL(rel)) - { - XLogBeginInsert(); - - /* - * Since columnar will mostly write whole pages we force the transmission of the - * whole image in the buffer - */ - XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE); - - XLogRecPtr recptr = XLogInsert(RM_GENERIC_ID, 0); - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - UnlockReleaseBuffer(buffer); - - data += to_write; - remaining -= to_write; - logicalOffset += to_write; - } -} - - /* * FlushStripe flushes current stripe data into the file. The function first ensures * the last data chunk for each column is properly serialized and compressed. 
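
A minimal standalone sketch of row number assignment on the write path: a whole stripe's worth of row numbers is reserved up front and each written row takes the next one, so a stripe that flushes with fewer rows than reserved leaves a gap. reserve_row_numbers below is a local stand-in for ColumnarStorageReserveRowNumber, and the 150000 stripe size is the assumed default stripe_row_count.

#include <stdio.h>
#include <stdint.h>

#define STRIPE_ROW_COUNT 150000ULL	/* assumed default stripe_row_count */

static uint64_t nextRowNumber = 1;	/* COLUMNAR_FIRST_ROW_NUMBER */

/* local stand-in for ColumnarStorageReserveRowNumber(): reserves nrows numbers */
static uint64_t
reserve_row_numbers(uint64_t nrows)
{
	uint64_t first = nextRowNumber;
	nextRowNumber += nrows;
	return first;
}

int
main(void)
{
	/* first stripe: a whole stripe is reserved, but only 3 rows get written */
	uint64_t stripeFirstRowNumber = reserve_row_numbers(STRIPE_ROW_COUNT);
	for (uint64_t rowCount = 0; rowCount < 3; rowCount++)
	{
		uint64_t writtenRowNumber = stripeFirstRowNumber + rowCount;
		printf("written row number %llu\n", (unsigned long long) writtenRowNumber);
	}

	/* second stripe starts at 150001, leaving 4..150000 as a gap */
	printf("next stripe starts at %llu\n",
		   (unsigned long long) reserve_row_numbers(STRIPE_ROW_COUNT));
	return 0;
}
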
Then, @@ -502,7 +444,7 @@ FlushStripe(ColumnarWriteState *writeState) stripeMetadata = ReserveStripe(relation, stripeSize, stripeRowCount, columnCount, chunkCount, - chunkRowCount); + chunkRowCount, writeState->stripeFirstRowNumber); uint64 currentFileOffset = stripeMetadata.fileOffset; @@ -527,8 +469,8 @@ FlushStripe(ColumnarWriteState *writeState) columnBuffers->chunkBuffersArray[chunkIndex]; StringInfo existsBuffer = chunkBuffers->existsBuffer; - WriteToSmgr(relation, currentFileOffset, - existsBuffer->data, existsBuffer->len); + ColumnarStorageWrite(relation, currentFileOffset, + existsBuffer->data, existsBuffer->len); currentFileOffset += existsBuffer->len; } @@ -538,8 +480,8 @@ FlushStripe(ColumnarWriteState *writeState) columnBuffers->chunkBuffersArray[chunkIndex]; StringInfo valueBuffer = chunkBuffers->valueBuffer; - WriteToSmgr(relation, currentFileOffset, - valueBuffer->data, valueBuffer->len); + ColumnarStorageWrite(relation, currentFileOffset, + valueBuffer->data, valueBuffer->len); currentFileOffset += valueBuffer->len; } } diff --git a/src/backend/columnar/sql/columnar--10.1-1--10.2-1.sql b/src/backend/columnar/sql/columnar--10.1-1--10.2-1.sql new file mode 100644 index 000000000..0666d2f54 --- /dev/null +++ b/src/backend/columnar/sql/columnar--10.1-1--10.2-1.sql @@ -0,0 +1,32 @@ +/* columnar--10.1-1--10.2-1.sql */ + +-- For a proper mapping between tid & (stripe, row_num), add a new column to +-- columnar.stripe and define a BTREE index on this column. +-- Also include storage_id column for per-relation scans. +ALTER TABLE columnar.stripe ADD COLUMN first_row_number bigint; +CREATE INDEX stripe_first_row_number_idx ON columnar.stripe USING BTREE(storage_id, first_row_number); + +-- Populate first_row_number column of columnar.stripe table. +-- +-- For simplicity, we calculate MAX(row_count) value across all the stripes +-- of all the columanar tables and then use it to populate first_row_number +-- column. This would introduce some gaps however we are okay with that since +-- it's already the case with regular INSERT/COPY's. 
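
The DO block below backfills first_row_number as COLUMNAR_FIRST_ROW_NUMBER + (stripe_num - 1) * max_row_count. A minimal standalone sketch of that arithmetic, with a hypothetical max_row_count, shows the resulting (possibly gappy) per-stripe numbering.

#include <stdio.h>
#include <stdint.h>

#define COLUMNAR_FIRST_ROW_NUMBER 1ULL

int
main(void)
{
	/* hypothetical MAX(row_count) over columnar.stripe */
	uint64_t max_row_count = 150000;

	for (uint64_t stripe_num = 1; stripe_num <= 3; stripe_num++)
	{
		uint64_t first_row_number = COLUMNAR_FIRST_ROW_NUMBER +
									(stripe_num - 1) * max_row_count;

		/* prints 1, 150001, 300001 */
		printf("stripe %llu -> first_row_number %llu\n",
			   (unsigned long long) stripe_num,
			   (unsigned long long) first_row_number);
	}
	return 0;
}
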
+DO $$ +DECLARE + max_row_count bigint; + -- this should be equal to columnar_storage.h/COLUMNAR_FIRST_ROW_NUMBER + COLUMNAR_FIRST_ROW_NUMBER constant bigint := 1; +BEGIN + SELECT MAX(row_count) INTO max_row_count FROM columnar.stripe; + UPDATE columnar.stripe SET first_row_number = COLUMNAR_FIRST_ROW_NUMBER + + (stripe_num - 1) * max_row_count; +END; +$$; + +#include "udfs/upgrade_columnar_storage/10.2-1.sql" +#include "udfs/downgrade_columnar_storage/10.2-1.sql" + +-- upgrade storage for all columnar relations +SELECT citus_internal.upgrade_columnar_storage(c.oid) FROM pg_class c, pg_am a + WHERE c.relam = a.oid AND amname = 'columnar'; diff --git a/src/backend/columnar/sql/downgrades/columnar--10.2-1--10.1-1.sql b/src/backend/columnar/sql/downgrades/columnar--10.2-1--10.1-1.sql new file mode 100644 index 000000000..7f39cce35 --- /dev/null +++ b/src/backend/columnar/sql/downgrades/columnar--10.2-1--10.1-1.sql @@ -0,0 +1,12 @@ +/* columnar--10.2-1--10.1-1.sql */ + +-- downgrade storage for all columnar relations +SELECT citus_internal.downgrade_columnar_storage(c.oid) FROM pg_class c, pg_am a + WHERE c.relam = a.oid AND amname = 'columnar'; + +DROP FUNCTION citus_internal.upgrade_columnar_storage(regclass); +DROP FUNCTION citus_internal.downgrade_columnar_storage(regclass); + +-- drop "first_row_number" column and the index defined on it +DROP INDEX columnar.stripe_first_row_number_idx; +ALTER TABLE columnar.stripe DROP COLUMN first_row_number; diff --git a/src/backend/columnar/sql/udfs/downgrade_columnar_storage/10.2-1.sql b/src/backend/columnar/sql/udfs/downgrade_columnar_storage/10.2-1.sql new file mode 100644 index 000000000..b41549d3d --- /dev/null +++ b/src/backend/columnar/sql/udfs/downgrade_columnar_storage/10.2-1.sql @@ -0,0 +1,7 @@ +CREATE OR REPLACE FUNCTION citus_internal.downgrade_columnar_storage(rel regclass) + RETURNS VOID + STRICT + LANGUAGE c AS 'MODULE_PATHNAME', $$downgrade_columnar_storage$$; + +COMMENT ON FUNCTION citus_internal.downgrade_columnar_storage(regclass) + IS 'function to downgrade the columnar storage, if necessary'; diff --git a/src/backend/columnar/sql/udfs/downgrade_columnar_storage/latest.sql b/src/backend/columnar/sql/udfs/downgrade_columnar_storage/latest.sql new file mode 100644 index 000000000..b41549d3d --- /dev/null +++ b/src/backend/columnar/sql/udfs/downgrade_columnar_storage/latest.sql @@ -0,0 +1,7 @@ +CREATE OR REPLACE FUNCTION citus_internal.downgrade_columnar_storage(rel regclass) + RETURNS VOID + STRICT + LANGUAGE c AS 'MODULE_PATHNAME', $$downgrade_columnar_storage$$; + +COMMENT ON FUNCTION citus_internal.downgrade_columnar_storage(regclass) + IS 'function to downgrade the columnar storage, if necessary'; diff --git a/src/backend/columnar/sql/udfs/upgrade_columnar_storage/10.2-1.sql b/src/backend/columnar/sql/udfs/upgrade_columnar_storage/10.2-1.sql new file mode 100644 index 000000000..a82bcc758 --- /dev/null +++ b/src/backend/columnar/sql/udfs/upgrade_columnar_storage/10.2-1.sql @@ -0,0 +1,7 @@ +CREATE OR REPLACE FUNCTION citus_internal.upgrade_columnar_storage(rel regclass) + RETURNS VOID + STRICT + LANGUAGE c AS 'MODULE_PATHNAME', $$upgrade_columnar_storage$$; + +COMMENT ON FUNCTION citus_internal.upgrade_columnar_storage(regclass) + IS 'function to upgrade the columnar storage, if necessary'; diff --git a/src/backend/columnar/sql/udfs/upgrade_columnar_storage/latest.sql b/src/backend/columnar/sql/udfs/upgrade_columnar_storage/latest.sql new file mode 100644 index 000000000..a82bcc758 --- /dev/null +++ 
b/src/backend/columnar/sql/udfs/upgrade_columnar_storage/latest.sql @@ -0,0 +1,7 @@ +CREATE OR REPLACE FUNCTION citus_internal.upgrade_columnar_storage(rel regclass) + RETURNS VOID + STRICT + LANGUAGE c AS 'MODULE_PATHNAME', $$upgrade_columnar_storage$$; + +COMMENT ON FUNCTION citus_internal.upgrade_columnar_storage(regclass) + IS 'function to upgrade the columnar storage, if necessary'; diff --git a/src/backend/distributed/commands/index.c b/src/backend/distributed/commands/index.c index c4cbc453d..54fa057c5 100644 --- a/src/backend/distributed/commands/index.c +++ b/src/backend/distributed/commands/index.c @@ -62,7 +62,6 @@ static List * GenerateIndexParameters(IndexStmt *createIndexStatement); static DDLJob * GenerateCreateIndexDDLJob(IndexStmt *createIndexStatement, const char *createIndexCommand); static Oid CreateIndexStmtGetRelationId(IndexStmt *createIndexStatement); -static LOCKMODE GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement); static List * CreateIndexTaskList(IndexStmt *indexStmt); static List * CreateReindexTaskList(Oid relationId, ReindexStmt *reindexStmt); static void RangeVarCallbackForDropIndex(const RangeVar *rel, Oid relOid, Oid oldRelOid, @@ -503,7 +502,7 @@ CreateIndexStmtGetRelationId(IndexStmt *createIndexStatement) * GetCreateIndexRelationLockMode returns required lock mode to open the * relation that given CREATE INDEX command operates on. */ -static LOCKMODE +LOCKMODE GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement) { if (createIndexStatement->concurrent) diff --git a/src/backend/distributed/sql/citus--10.1-1--10.2-1.sql b/src/backend/distributed/sql/citus--10.1-1--10.2-1.sql index d11c3ac18..f927237a3 100644 --- a/src/backend/distributed/sql/citus--10.1-1--10.2-1.sql +++ b/src/backend/distributed/sql/citus--10.1-1--10.2-1.sql @@ -1,4 +1,3 @@ -- citus--10.1-1--10.2-1 --- bump version to 10.2-1 - +#include "../../columnar/sql/columnar--10.1-1--10.2-1.sql" diff --git a/src/backend/distributed/sql/downgrades/citus--10.2-1--10.1-1.sql b/src/backend/distributed/sql/downgrades/citus--10.2-1--10.1-1.sql index 9b9bb7471..ddf167860 100644 --- a/src/backend/distributed/sql/downgrades/citus--10.2-1--10.1-1.sql +++ b/src/backend/distributed/sql/downgrades/citus--10.2-1--10.1-1.sql @@ -1,2 +1,3 @@ -- citus--10.2-1--10.1-1 --- this is an empty downgrade path since citus--10.1-1--10.2-1.sql is empty for now + +#include "../../../columnar/sql/downgrades/columnar--10.2-1--10.1-1.sql" diff --git a/src/backend/distributed/transaction/lock_graph.c b/src/backend/distributed/transaction/lock_graph.c index 8bd2150a4..aed021ae0 100644 --- a/src/backend/distributed/transaction/lock_graph.c +++ b/src/backend/distributed/transaction/lock_graph.c @@ -25,7 +25,6 @@ #include "distributed/lock_graph.h" #include "distributed/metadata_cache.h" #include "distributed/remote_commands.h" -#include "distributed/resource_lock.h" #include "distributed/tuplestore.h" #include "storage/proc.h" #include "utils/builtins.h" @@ -472,18 +471,9 @@ IsProcessWaitingForSafeOperations(PGPROC *proc) PROCLOCK *waitProcLock = proc->waitProcLock; LOCK *waitLock = waitProcLock->tag.myLock; - /* - * Stripe reservation locks are temporary & don't hold until end of - * transaction, so we shouldn't include them in the lock graph. 
- */ - bool stripeReservationLock = - waitLock->tag.locktag_type == LOCKTAG_ADVISORY && - waitLock->tag.locktag_field4 == ADV_LOCKTAG_CLASS_COLUMNAR_STRIPE_RESERVATION; - return waitLock->tag.locktag_type == LOCKTAG_RELATION_EXTEND || waitLock->tag.locktag_type == LOCKTAG_PAGE || - waitLock->tag.locktag_type == LOCKTAG_SPECULATIVE_TOKEN || - stripeReservationLock; + waitLock->tag.locktag_type == LOCKTAG_SPECULATIVE_TOKEN; } diff --git a/src/include/columnar/columnar.h b/src/include/columnar/columnar.h index e6101cf18..e4770acc2 100644 --- a/src/include/columnar/columnar.h +++ b/src/include/columnar/columnar.h @@ -39,8 +39,8 @@ #define COMPRESSION_LEVEL_MAX 19 /* Columnar file signature */ -#define COLUMNAR_VERSION_MAJOR 1 -#define COLUMNAR_VERSION_MINOR 7 +#define COLUMNAR_VERSION_MAJOR 2 +#define COLUMNAR_VERSION_MINOR 0 /* miscellaneous defines */ #define COLUMNAR_TUPLE_COST_MULTIPLIER 10 @@ -201,8 +201,8 @@ extern CompressionType ParseCompressionType(const char *compressionTypeString); extern ColumnarWriteState * ColumnarBeginWrite(RelFileNode relfilenode, ColumnarOptions options, TupleDesc tupleDescriptor); -extern void ColumnarWriteRow(ColumnarWriteState *state, Datum *columnValues, - bool *columnNulls); +extern uint64 ColumnarWriteRow(ColumnarWriteState *state, Datum *columnValues, + bool *columnNulls); extern void ColumnarFlushPendingWrites(ColumnarWriteState *state); extern void ColumnarEndWrite(ColumnarWriteState *state); extern bool ContainsPendingWrites(ColumnarWriteState *state); @@ -214,8 +214,11 @@ extern ColumnarReadState * ColumnarBeginRead(Relation relation, List *projectedColumnList, List *qualConditions); extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues, - bool *columnNulls); + bool *columnNulls, uint64 *rowNumber); extern void ColumnarRescan(ColumnarReadState *readState); +extern bool ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber, + List *neededColumnList, Datum *columnValues, + bool *columnNulls, Snapshot snapshot); extern void ColumnarEndRead(ColumnarReadState *state); extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state); @@ -233,17 +236,16 @@ extern void InitColumnarOptions(Oid regclass); extern void SetColumnarOptions(Oid regclass, ColumnarOptions *options); extern bool DeleteColumnarTableOptions(Oid regclass, bool missingOk); extern bool ReadColumnarOptions(Oid regclass, ColumnarOptions *options); -extern void WriteToSmgr(Relation relation, uint64 logicalOffset, - char *data, uint32 dataLength); -extern StringInfo ReadFromSmgr(Relation rel, uint64 offset, uint32 size); extern bool IsColumnarTableAmTable(Oid relationId); /* columnar_metadata_tables.c */ extern void DeleteMetadataRows(RelFileNode relfilenode); +extern uint64 ColumnarMetadataNewStorageId(void); extern uint64 GetHighestUsedAddress(RelFileNode relfilenode); extern StripeMetadata ReserveStripe(Relation rel, uint64 size, uint64 rowCount, uint64 columnCount, - uint64 chunkCount, uint64 chunkGroupRowCount); + uint64 chunkCount, uint64 chunkGroupRowCount, + uint64 stripeFirstRowNumber); extern void SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe, StripeSkipList *stripeSkipList, TupleDesc tupleDescriptor); @@ -252,6 +254,10 @@ extern void SaveChunkGroups(RelFileNode relfilenode, uint64 stripe, extern StripeSkipList * ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, TupleDesc tupleDescriptor, uint32 chunkCount); +extern StripeMetadata * FindStripeByRowNumber(Relation relation, uint64 rowNumber, + Snapshot snapshot); +extern 
StripeMetadata * FindStripeWithHighestRowNumber(Relation relation, + Snapshot snapshot); extern Datum columnar_relation_storageid(PG_FUNCTION_ARGS); @@ -271,51 +277,5 @@ extern bool PendingWritesInUpperTransactions(Oid relfilenode, SubTransactionId currentSubXid); extern MemoryContext GetWriteContextForDebug(void); -typedef struct SmgrAddr -{ - BlockNumber blockno; - uint32 offset; -} SmgrAddr; - -/* - * Map logical offsets (as tracked in the metadata) to a physical page and - * offset where the data is kept. - */ -static inline SmgrAddr -logical_to_smgr(uint64 logicalOffset) -{ - SmgrAddr addr; - - addr.blockno = logicalOffset / COLUMNAR_BYTES_PER_PAGE; - addr.offset = SizeOfPageHeaderData + (logicalOffset % COLUMNAR_BYTES_PER_PAGE); - - return addr; -} - - -/* - * Map a physical page adnd offset address to a logical address. - */ -static inline uint64 -smgr_to_logical(SmgrAddr addr) -{ - return COLUMNAR_BYTES_PER_PAGE * addr.blockno + addr.offset - SizeOfPageHeaderData; -} - - -/* - * Get the first usable address of next block. - */ -static inline SmgrAddr -next_block_start(SmgrAddr addr) -{ - SmgrAddr result = { - .blockno = addr.blockno + 1, - .offset = SizeOfPageHeaderData - }; - - return result; -} - #endif /* COLUMNAR_H */ diff --git a/src/include/columnar/columnar_metadata.h b/src/include/columnar/columnar_metadata.h index 26b3effcf..b3a9f1414 100644 --- a/src/include/columnar/columnar_metadata.h +++ b/src/include/columnar/columnar_metadata.h @@ -25,8 +25,10 @@ typedef struct StripeMetadata uint32 chunkGroupRowCount; uint64 rowCount; uint64 id; + uint64 firstRowNumber; } StripeMetadata; extern List * StripesForRelfilenode(RelFileNode relfilenode); +extern void ColumnarStorageUpdateIfNeeded(Relation rel, bool isUpgrade); #endif /* COLUMNAR_METADATA_H */ diff --git a/src/include/columnar/columnar_storage.h b/src/include/columnar/columnar_storage.h new file mode 100644 index 000000000..a8ac01432 --- /dev/null +++ b/src/include/columnar/columnar_storage.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------- + * + * columnar_storage.h + * + * Type and function declarations for storage of columnar data in blocks. + * + * Copyright (c) Citus Data, Inc. + * + *------------------------------------------------------------------------- + */ + +#ifndef COLUMNAR_STORAGE_H +#define COLUMNAR_STORAGE_H + +#include "postgres.h" + +#include "storage/smgr.h" +#include "utils/rel.h" + +#include "columnar/columnar_tableam.h" + + +#define COLUMNAR_INVALID_ROW_NUMBER ((uint64) 0) +#define COLUMNAR_FIRST_ROW_NUMBER ((uint64) 1) +#define COLUMNAR_MAX_ROW_NUMBER ((uint64) \ + (COLUMNAR_FIRST_ROW_NUMBER + \ + VALID_ITEMPOINTER_OFFSETS * \ + VALID_BLOCKNUMBERS)) + + +/* + * Logical offsets never fall on the first two physical pages. See + * comments in columnar_storage.c. 
+ */ +#define ColumnarInvalidLogicalOffset 0 +#define ColumnarFirstLogicalOffset ((BLCKSZ - SizeOfPageHeaderData) * 2) +#define ColumnarLogicalOffsetIsValid(X) ((X) >= ColumnarFirstLogicalOffset) + + +extern void ColumnarStorageInit(SMgrRelation srel, uint64 storageId); +extern bool ColumnarStorageIsCurrent(Relation rel); +extern void ColumnarStorageUpdateCurrent(Relation rel, bool upgrade, + uint64 reservedStripeId, + uint64 reservedRowNumber, + uint64 reservedOffset); + +extern uint64 ColumnarStorageGetVersionMajor(Relation rel, bool force); +extern uint64 ColumnarStorageGetVersionMinor(Relation rel, bool force); +extern uint64 ColumnarStorageGetStorageId(Relation rel, bool force); +extern uint64 ColumnarStorageGetReservedStripeId(Relation rel, bool force); +extern uint64 ColumnarStorageGetReservedRowNumber(Relation rel, bool force); +extern uint64 ColumnarStorageGetReservedOffset(Relation rel, bool force); + +extern uint64 ColumnarStorageReserveData(Relation rel, uint64 amount); +extern uint64 ColumnarStorageReserveRowNumber(Relation rel, uint64 nrows); +extern uint64 ColumnarStorageReserveStripe(Relation rel); + +extern void ColumnarStorageRead(Relation rel, uint64 logicalOffset, + char *data, uint32 amount); +extern void ColumnarStorageWrite(Relation rel, uint64 logicalOffset, + char *data, uint32 amount); +extern bool ColumnarStorageTruncate(Relation rel, uint64 newDataReservation); + +#endif /* COLUMNAR_STORAGE_H */ diff --git a/src/include/columnar/columnar_tableam.h b/src/include/columnar/columnar_tableam.h index e93efa551..2809c6439 100644 --- a/src/include/columnar/columnar_tableam.h +++ b/src/include/columnar/columnar_tableam.h @@ -8,6 +8,40 @@ #include "distributed/coordinator_protocol.h" + +/* + * Number of valid ItemPointer Offset's for "row number" <> "ItemPointer" + * mapping. + * + * Postgres has some asserts calling either ItemPointerIsValid or + * OffsetNumberIsValid. That constraints itemPointer.offsetNumber + * for columnar tables to the following interval: + * [FirstOffsetNumber, MaxOffsetNumber]. + * + * However, bitmap scan logic assumes that itemPointer.offsetNumber cannot + * be larger than MaxHeapTuplesPerPage (see tbm_add_tuples). + * + * For this reason, we restrict itemPointer.offsetNumber + * to the following interval: [FirstOffsetNumber, MaxHeapTuplesPerPage]. + */ +#define VALID_ITEMPOINTER_OFFSETS \ + ((uint64) (MaxHeapTuplesPerPage - FirstOffsetNumber + 1)) + +/* + * Number of valid ItemPointer BlockNumber's for "row number" <> "ItemPointer" + * mapping. + * + * Similar to VALID_ITEMPOINTER_OFFSETS, due to asserts around + * itemPointer.blockNumber, we can only use values upto and including + * MaxBlockNumber. + * Note that postgres doesn't restrict blockNumber to a lower boundary. + * + * For this reason, we restrict itemPointer.blockNumber + * to the following interval: [0, MaxBlockNumber]. 
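
A minimal standalone sketch of the row number ceiling implied by these macros, assuming the default 8 kB block size (MaxHeapTuplesPerPage 291, FirstOffsetNumber 1, MaxBlockNumber 0xFFFFFFFE); VALID_BLOCKNUMBERS, defined just below, is MaxBlockNumber + 1, and COLUMNAR_MAX_ROW_NUMBER is taken from columnar_storage.h above.

#include <stdio.h>
#include <stdint.h>

#define MAX_HEAP_TUPLES_PER_PAGE 291ULL	/* assumed value for 8 kB BLCKSZ */
#define FIRST_OFFSET_NUMBER 1ULL
#define MAX_BLOCK_NUMBER 0xFFFFFFFEULL	/* InvalidBlockNumber - 1 */

int
main(void)
{
	uint64_t validOffsets = MAX_HEAP_TUPLES_PER_PAGE - FIRST_OFFSET_NUMBER + 1;
	uint64_t validBlocks = MAX_BLOCK_NUMBER + 1;

	/* COLUMNAR_MAX_ROW_NUMBER as defined in columnar_storage.h */
	uint64_t maxRowNumber = 1 + validOffsets * validBlocks;

	printf("valid offsets per virtual block: %llu\n",
		   (unsigned long long) validOffsets);
	printf("max row number: %llu (about 1.25e12)\n",
		   (unsigned long long) maxRowNumber);
	return 0;
}
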
+ */ +#define VALID_BLOCKNUMBERS ((uint64) (MaxBlockNumber + 1)) + + const TableAmRoutine * GetColumnarTableAmRoutine(void); extern void columnar_tableam_init(void); extern void columnar_tableam_finish(void); diff --git a/src/include/distributed/commands.h b/src/include/distributed/commands.h index 8556f9836..1965e7725 100644 --- a/src/include/distributed/commands.h +++ b/src/include/distributed/commands.h @@ -275,6 +275,7 @@ extern char * ChooseIndexName(const char *tabname, Oid namespaceId, bool primary, bool isconstraint); extern char * ChooseIndexNameAddition(List *colnames); extern List * ChooseIndexColumnNames(List *indexElems); +extern LOCKMODE GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement); extern List * PreprocessReindexStmt(Node *ReindexStatement, const char *ReindexCommand, ProcessUtilityContext processUtilityContext); diff --git a/src/include/distributed/resource_lock.h b/src/include/distributed/resource_lock.h index 5db9403b4..7241831dd 100644 --- a/src/include/distributed/resource_lock.h +++ b/src/include/distributed/resource_lock.h @@ -39,10 +39,7 @@ typedef enum AdvisoryLocktagClass ADV_LOCKTAG_CLASS_CITUS_REBALANCE_COLOCATION = 7, ADV_LOCKTAG_CLASS_CITUS_COLOCATED_SHARDS_METADATA = 8, ADV_LOCKTAG_CLASS_CITUS_OPERATIONS = 9, - ADV_LOCKTAG_CLASS_CITUS_PLACEMENT_CLEANUP = 10, - - /* Columnar lock types */ - ADV_LOCKTAG_CLASS_COLUMNAR_STRIPE_RESERVATION = 11 + ADV_LOCKTAG_CLASS_CITUS_PLACEMENT_CLEANUP = 10 } AdvisoryLocktagClass; /* CitusOperations has constants for citus operations */ @@ -102,13 +99,6 @@ typedef enum CitusOperations (uint32) operationId, \ ADV_LOCKTAG_CLASS_CITUS_OPERATIONS) -#define SET_LOCKTAG_COLUMNAR_STRIPE_RESERVATION(tag, relation) \ - SET_LOCKTAG_ADVISORY(tag, \ - relation->rd_lockInfo.lockRelId.dbId, \ - relation->rd_lockInfo.lockRelId.relId, \ - 0, \ - ADV_LOCKTAG_CLASS_COLUMNAR_STRIPE_RESERVATION) - /* reuse advisory lock, but with different, unused field 4 (10) * Also it has the database hardcoded to MyDatabaseId, to ensure the locks * are local to each database */ diff --git a/src/test/regress/after_citus_upgrade_coord_schedule b/src/test/regress/after_citus_upgrade_coord_schedule index 4610bf535..49b4e73d9 100644 --- a/src/test/regress/after_citus_upgrade_coord_schedule +++ b/src/test/regress/after_citus_upgrade_coord_schedule @@ -3,3 +3,4 @@ test: upgrade_basic_after test: upgrade_partition_constraints_after test: upgrade_pg_dist_object_test_after +test: upgrade_columnar_metapage_after diff --git a/src/test/regress/before_citus_upgrade_coord_schedule b/src/test/regress/before_citus_upgrade_coord_schedule index 9d015838b..006217e8a 100644 --- a/src/test/regress/before_citus_upgrade_coord_schedule +++ b/src/test/regress/before_citus_upgrade_coord_schedule @@ -3,3 +3,4 @@ test: upgrade_basic_before test: upgrade_partition_constraints_before test: upgrade_pg_dist_object_test_before +test: upgrade_columnar_metapage_before diff --git a/src/test/regress/columnar_schedule b/src/test/regress/columnar_schedule index 8c25f1384..218b04c4a 100644 --- a/src/test/regress/columnar_schedule +++ b/src/test/regress/columnar_schedule @@ -4,7 +4,7 @@ test: multi_test_catalog_views test: columnar_create test: columnar_load -test: columnar_query +test: columnar_query columnar_first_row_number test: columnar_analyze test: columnar_data_types test: columnar_drop diff --git a/src/test/regress/expected/columnar_alter.out b/src/test/regress/expected/columnar_alter.out index 0c22f0046..4706fa386 100644 --- a/src/test/regress/expected/columnar_alter.out +++ 
b/src/test/regress/expected/columnar_alter.out @@ -12,6 +12,14 @@ WITH sample_data AS (VALUES INSERT INTO test_alter_table SELECT * FROM sample_data; -- drop a column ALTER TABLE test_alter_table DROP COLUMN a; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_alter_table'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 2 | 150001 | 16402 +(1 row) + -- test analyze ANALYZE test_alter_table; -- verify select queries run as expected @@ -59,6 +67,14 @@ SELECT * FROM test_alter_table; 3 | 5 | 8 (5 rows) +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_alter_table'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 4 | 450001 | 32724 +(1 row) + -- add a fixed-length column with default value ALTER TABLE test_alter_table ADD COLUMN e int default 3; SELECT * from test_alter_table; @@ -83,6 +99,14 @@ SELECT * from test_alter_table; 1 | 2 | 4 | 8 (6 rows) +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_alter_table'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 5 | 600001 | 40906 +(1 row) + -- add a variable-length column with default value ALTER TABLE test_alter_table ADD COLUMN f text DEFAULT 'TEXT ME'; SELECT * from test_alter_table; @@ -231,7 +255,6 @@ insert into atacc1 values(1); alter table atacc1 add column b float8 not null default random(), add primary key(a); -ERROR: indexes not supported for columnar tables -- Add a generate column with an expression value create table test_gen_ex (x int) using columnar; INSERT INTO test_gen_ex VALUES (1), (2), (3); @@ -366,30 +389,30 @@ SELECT * FROM products ORDER BY 1; 3 | pen | 2 (3 rows) --- Add a UNIQUE constraint (should fail) -CREATE TABLE products_fail ( +-- Add a UNIQUE constraint +CREATE TABLE products_unique ( product_no integer UNIQUE, name text, price numeric ) USING columnar; -ERROR: indexes not supported for columnar tables ALTER TABLE products ADD COLUMN store_id text UNIQUE; -ERROR: indexes not supported for columnar tables --- Add a PRIMARY KEY constraint (should fail) -CREATE TABLE products_fail ( +-- Add a PRIMARY KEY constraint +CREATE TABLE products_primary ( product_no integer PRIMARY KEY, name text, price numeric ) USING columnar; -ERROR: indexes not supported for columnar tables -ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY; -ERROR: indexes not supported for columnar tables +BEGIN; + ALTER TABLE products DROP COLUMN store_id; + ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY; +ERROR: column "store_id" contains null values +ROLLBACK; -- Add an EXCLUSION constraint (should fail) CREATE TABLE circles ( c circle, EXCLUDE USING gist (c WITH &&) ) USING columnar; -ERROR: indexes not supported for columnar tables +ERROR: only btree and hash indexes are supported on columnar tables -- Row level security CREATE TABLE public.row_level_security_col (id int, pgUser CHARACTER VARYING) USING columnar; CREATE USER user1; diff --git 
a/src/test/regress/expected/columnar_create.out b/src/test/regress/expected/columnar_create.out index 5630ee811..704c0d932 100644 --- a/src/test/regress/expected/columnar_create.out +++ b/src/test/regress/expected/columnar_create.out @@ -5,11 +5,14 @@ CREATE TABLE contestant (handle TEXT, birthdate DATE, rating INT, percentile FLOAT, country CHAR(3), achievements TEXT[]) USING columnar; --- should fail +SELECT alter_columnar_table_set('contestant', compression => 'none'); + alter_columnar_table_set +--------------------------------------------------------------------- + +(1 row) + CREATE INDEX contestant_idx on contestant(handle); -ERROR: indexes not supported for columnar tables --- Create compressed table with automatically determined file path --- COMPRESSED +-- Create zstd compressed table CREATE TABLE contestant_compressed (handle TEXT, birthdate DATE, rating INT, percentile FLOAT, country CHAR(3), achievements TEXT[]) USING columnar; diff --git a/src/test/regress/expected/columnar_empty.out b/src/test/regress/expected/columnar_empty.out index 9329581e2..728f60f12 100644 --- a/src/test/regress/expected/columnar_empty.out +++ b/src/test/regress/expected/columnar_empty.out @@ -52,6 +52,23 @@ select count(*) from t_compressed; 0 (1 row) +-- check storage +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t_compressed'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 1 | 1 | 16336 +(1 row) + +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t_uncompressed'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 1 | 1 | 16336 +(1 row) + -- explain explain (costs off, summary off, timing off) select * from t_uncompressed; QUERY PLAN @@ -68,16 +85,16 @@ explain (costs off, summary off, timing off) select * from t_compressed; -- vacuum vacuum verbose t_compressed; INFO: statistics for "t_compressed": -storage id: -1 -total file size: 0, total data size: 0 +storage id: xxxxx +total file size: 16384, total data size: 0 compression rate: 1.00x total row count: 0, stripe count: 0, average rows per stripe: 0 chunk count: 0, containing data for dropped columns: 0 vacuum verbose t_uncompressed; INFO: statistics for "t_uncompressed": -storage id: -1 -total file size: 0, total data size: 0 +storage id: xxxxx +total file size: 16384, total data size: 0 compression rate: 1.00x total row count: 0, stripe count: 0, average rows per stripe: 0 chunk count: 0, containing data for dropped columns: 0 @@ -85,6 +102,23 @@ chunk count: 0, containing data for dropped columns: 0 -- vacuum full vacuum full t_compressed; vacuum full t_uncompressed; +-- check storage +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t_compressed'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 1 | 1 | 16336 +(1 row) + +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from 
columnar_test_helpers.columnar_storage_info('t_uncompressed'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 1 | 1 | 16336 +(1 row) + -- analyze analyze t_uncompressed; analyze t_compressed; @@ -94,6 +128,23 @@ truncate t_compressed; -- alter type alter table t_uncompressed alter column a type text; alter table t_compressed alter column a type text; +-- check storage +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t_compressed'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 1 | 1 | 16336 +(1 row) + +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t_uncompressed'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 1 | 1 | 16336 +(1 row) + -- verify cost of scanning an empty table is zero, not NaN explain table t_uncompressed; QUERY PLAN diff --git a/src/test/regress/expected/columnar_first_row_number.out b/src/test/regress/expected/columnar_first_row_number.out new file mode 100644 index 000000000..87c1d9508 --- /dev/null +++ b/src/test/regress/expected/columnar_first_row_number.out @@ -0,0 +1,56 @@ +CREATE SCHEMA columnar_first_row_number; +SET search_path tO columnar_first_row_number; +CREATE TABLE col_table_1 (a int) USING columnar; +INSERT INTO col_table_1 SELECT i FROM generate_series(1, 10) i; +BEGIN; + -- we don't use same first_row_number even if the xact is rollback'ed + INSERT INTO col_table_1 SELECT i FROM generate_series(1, 11) i; +ROLLBACK; +INSERT INTO col_table_1 SELECT i FROM generate_series(1, 12) i; +SELECT alter_columnar_table_set('col_table_1', stripe_row_limit => 1000); + alter_columnar_table_set +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO col_table_1 SELECT i FROM generate_series(1, 2350) i; +SELECT row_count, first_row_number FROM columnar.stripe a +WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass) +ORDER BY stripe_num; + row_count | first_row_number +--------------------------------------------------------------------- + 10 | 1 + 12 | 300001 + 1000 | 450001 + 1000 | 451001 + 350 | 452001 +(5 rows) + +VACUUM FULL col_table_1; +-- show that we properly update first_row_number after VACUUM FULL +SELECT row_count, first_row_number FROM columnar.stripe a +WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass) +ORDER BY stripe_num; + row_count | first_row_number +--------------------------------------------------------------------- + 1000 | 1 + 1000 | 1001 + 372 | 2001 +(3 rows) + +TRUNCATE col_table_1; +BEGIN; + INSERT INTO col_table_1 SELECT i FROM generate_series(1, 16) i; + INSERT INTO col_table_1 SELECT i FROM generate_series(1, 16) i; +COMMIT; +-- show that we start with first_row_number=1 after TRUNCATE +SELECT row_count, first_row_number FROM columnar.stripe a +WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass) +ORDER BY stripe_num; + row_count | first_row_number 
+--------------------------------------------------------------------- + 32 | 1 +(1 row) + +SET client_min_messages TO ERROR; +DROP SCHEMA columnar_first_row_number CASCADE; diff --git a/src/test/regress/expected/columnar_indexes.out b/src/test/regress/expected/columnar_indexes.out index bd1c41802..9edc0900d 100644 --- a/src/test/regress/expected/columnar_indexes.out +++ b/src/test/regress/expected/columnar_indexes.out @@ -10,7 +10,7 @@ SET search_path tO columnar_indexes, public; -- create table t(a int, b int) using columnar; create index CONCURRENTLY t_idx on t(a, b); -ERROR: indexes not supported for columnar tables +ERROR: concurrent index commands are not supported for columnar tables \d t Table "columnar_indexes.t" Column | Type | Collation | Nullable | Default @@ -32,16 +32,15 @@ SELECT * FROM t; 1 | 2 (1 row) --- create index without the concurrent option. We should --- error out during index creation. create index t_idx on t(a, b); -ERROR: indexes not supported for columnar tables \d t Table "columnar_indexes.t" Column | Type | Collation | Nullable | Default --------------------------------------------------------------------- a | integer | | | b | integer | | | +Indexes: + "t_idx" btree (a, b) explain insert into t values (1, 2); QUERY PLAN @@ -58,5 +57,347 @@ SELECT * FROM t; 3 | 4 (2 rows) +-- make sure that we test index scan +set columnar.enable_custom_scan to 'off'; +set enable_seqscan to off; +CREATE table columnar_table (a INT, b int) USING columnar; +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(0, 16000) i; +-- unique -- +BEGIN; + INSERT INTO columnar_table VALUES (100000000); + SAVEPOINT s1; + -- errors out due to unflushed data in upper transaction + CREATE UNIQUE INDEX ON columnar_table (a); +ERROR: cannot read from table when there is unflushed data in upper transactions +ROLLBACK; +CREATE UNIQUE INDEX ON columnar_table (a); +BEGIN; + INSERT INTO columnar_table VALUES (16050); + SAVEPOINT s1; + -- index scan errors out due to unflushed data in upper transaction + SELECT a FROM columnar_table WHERE a = 16050; +ERROR: cannot read from index when there is unflushed data in upper transactions +ROLLBACK; +EXPLAIN (COSTS OFF) SELECT * FROM columnar_table WHERE a=6456; + QUERY PLAN +--------------------------------------------------------------------- + Index Scan using columnar_table_a_idx on columnar_table + Index Cond: (a = 6456) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT a FROM columnar_table WHERE a=6456; + QUERY PLAN +--------------------------------------------------------------------- + Index Only Scan using columnar_table_a_idx on columnar_table + Index Cond: (a = 6456) +(2 rows) + +SELECT (SELECT a FROM columnar_table WHERE a=6456 limit 1)=6456; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT (SELECT b FROM columnar_table WHERE a=6456 limit 1)=6456*2; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +-- even if a=16050 doesn't exist, we try to insert it twice so this should error out +INSERT INTO columnar_table VALUES (16050), (16050); +ERROR: duplicate key value violates unique constraint "columnar_table_a_idx" +DETAIL: Key (a)=(16050) already exists. +-- should work +INSERT INTO columnar_table VALUES (16050); +-- check edge cases around stripe boundaries, error out +INSERT INTO columnar_table VALUES (16050); +ERROR: duplicate key value violates unique constraint "columnar_table_a_idx" +DETAIL: Key (a)=(16050) already exists. 
+INSERT INTO columnar_table VALUES (15999); +ERROR: duplicate key value violates unique constraint "columnar_table_a_idx" +DETAIL: Key (a)=(15999) already exists. +DROP INDEX columnar_table_a_idx; +CREATE TABLE partial_unique_idx_test (a INT, b INT) USING columnar; +CREATE UNIQUE INDEX ON partial_unique_idx_test (a) +WHERE b > 500; +-- should work since b =< 500 and our partial index doesn't check this interval +INSERT INTO partial_unique_idx_test VALUES (1, 2), (1, 2); +-- should work since our partial index wouldn't cover the tuples that we inserted above +INSERT INTO partial_unique_idx_test VALUES (1, 800); +INSERT INTO partial_unique_idx_test VALUES (4, 600); +-- should error out due to (4, 600) +INSERT INTO partial_unique_idx_test VALUES (4, 700); +ERROR: duplicate key value violates unique constraint "partial_unique_idx_test_a_idx" +DETAIL: Key (a)=(4) already exists. +-- btree -- +CREATE INDEX ON columnar_table (a); +SELECT (SELECT SUM(b) FROM columnar_table WHERE a>700 and a<965)=439560; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +CREATE INDEX ON columnar_table (b) +WHERE (b > 30000 AND b < 33000); +-- partial index should be way smaller than the non-partial index +SELECT pg_total_relation_size('columnar_table_b_idx') * 5 < + pg_total_relation_size('columnar_table_a_idx'); + ?column? +--------------------------------------------------------------------- + t +(1 row) + +-- can't use index scan due to partial index boundaries +EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30000; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on columnar_table + Filter: (b = 30000) +(2 rows) + +-- can use index scan +EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30001; + QUERY PLAN +--------------------------------------------------------------------- + Index Only Scan using columnar_table_b_idx on columnar_table + Index Cond: (b = 30001) +(2 rows) + +-- some more rows +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 17000) i; +DROP INDEX columnar_table_a_idx; +TRUNCATE columnar_table; +-- pkey -- +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 16499) i; +ALTER TABLE columnar_table ADD PRIMARY KEY (a); +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16500, 17000) i; +BEGIN; + INSERT INTO columnar_table (a) SELECT 1; +ROLLBACK; +-- should work +INSERT INTO columnar_table (a) SELECT 1; +-- error out +INSERT INTO columnar_table VALUES (16100), (16101); +ERROR: duplicate key value violates unique constraint "columnar_table_pkey" +DETAIL: Key (a)=(16100) already exists. +INSERT INTO columnar_table VALUES (16999); +ERROR: duplicate key value violates unique constraint "columnar_table_pkey" +DETAIL: Key (a)=(16999) already exists. +BEGIN; + REINDEX INDEX columnar_table_pkey; + -- should error even after reindex + INSERT INTO columnar_table VALUES (16999); +ERROR: duplicate key value violates unique constraint "columnar_table_pkey" +DETAIL: Key (a)=(16999) already exists. 
+ROLLBACK; +VACUUM FULL columnar_table; +-- show that we don't support clustering columnar tables using indexes +CLUSTER columnar_table USING columnar_table_pkey; +ERROR: clustering columnar tables using indexes is not supported +ALTER TABLE columnar_table CLUSTER ON columnar_table_pkey; +CLUSTER columnar_table; +ERROR: clustering columnar tables using indexes is not supported +-- should error even after vacuum +INSERT INTO columnar_table VALUES (16999); +ERROR: duplicate key value violates unique constraint "columnar_table_pkey" +DETAIL: Key (a)=(16999) already exists. +TRUNCATE columnar_table; +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i; +SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +TRUNCATE columnar_table; +ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey; +-- hash -- +INSERT INTO columnar_table (a, b) SELECT i*2,i FROM generate_series(1, 8000) i; +CREATE INDEX hash_idx ON columnar_table USING HASH (b); +BEGIN; + CREATE INDEX hash_idx_fill_factor ON columnar_table USING HASH (b) WITH (fillfactor=10); + -- same hash index with lower fillfactor should be way bigger + SELECT pg_total_relation_size ('hash_idx_fill_factor') > + pg_total_relation_size ('hash_idx') * 5; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +ROLLBACK; +BEGIN; + INSERT INTO columnar_table (a, b) SELECT i*3,i FROM generate_series(1, 8000) i; +ROLLBACK; +INSERT INTO columnar_table (a, b) SELECT i*4,i FROM generate_series(1, 8000) i; +SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +BEGIN; + REINDEX TABLE columnar_table; + SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +ROLLBACK; +VACUUM FULL columnar_table; +SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +-- exclusion contraints -- +CREATE TABLE exclusion_test (c1 INT,c2 INT, c3 INT, c4 BOX, +EXCLUDE USING btree (c1 WITH =) INCLUDE(c3,c4) WHERE (c1 < 10)) USING columnar; +-- error out since "c1" is "1" for all rows to be inserted +INSERT INTO exclusion_test SELECT 1, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x; +ERROR: conflicting key value violates exclusion constraint "exclusion_test_c1_c3_c4_excl" +DETAIL: Key (c1)=(1) conflicts with existing key (c1)=(1). 
+BEGIN; + INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x; +ROLLBACK; +-- should work +INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x; +INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x; +BEGIN; + -- should work thanks to "where" clause in exclusion constraint + INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x; +ROLLBACK; +REINDEX TABLE exclusion_test; +-- should still work after reindex +INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x; +-- make sure that we respect INCLUDE syntax -- +CREATE TABLE include_test (a INT, b BIGINT, c BIGINT, d BIGINT) USING columnar; +INSERT INTO include_test SELECT i, i, i, i FROM generate_series (1, 1000) i; +CREATE UNIQUE INDEX unique_a ON include_test (a); +-- cannot use index only scan +EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500; + QUERY PLAN +--------------------------------------------------------------------- + Index Scan using unique_a on include_test + Index Cond: (a = 500) +(2 rows) + +CREATE UNIQUE INDEX unique_a_include_b_c_d ON include_test (a) INCLUDE(b, c, d); +-- same unique index that includes other columns should be way bigger +SELECT pg_total_relation_size ('unique_a') * 1.5 < + pg_total_relation_size ('unique_a_include_b_c_d'); + ?column? +--------------------------------------------------------------------- + t +(1 row) + +DROP INDEX unique_a; +-- should use index only scan since unique_a_include_b_c_d includes column "b" too +EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500; + QUERY PLAN +--------------------------------------------------------------------- + Index Only Scan using unique_a_include_b_c_d on include_test + Index Cond: (a = 500) +(2 rows) + +BEGIN; + SET enable_indexonlyscan = OFF; + -- show that we respect enable_indexonlyscan GUC + EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500; + QUERY PLAN +--------------------------------------------------------------------- + Index Scan using unique_a_include_b_c_d on include_test + Index Cond: (a = 500) +(2 rows) + +ROLLBACK; +-- make sure that we read the correct value for "b" when doing index only scan +SELECT b=980 FROM include_test WHERE a = 980; + ?column? 
+--------------------------------------------------------------------- + t +(1 row) + +-- some tests with distributed & partitioned tables -- +CREATE TABLE dist_part_table( + dist_col INT, + part_col TIMESTAMPTZ, + col1 TEXT +) PARTITION BY RANGE (part_col); +-- create an index before creating a columnar partition +CREATE INDEX dist_part_table_btree ON dist_part_table (col1); +-- columnar partition +CREATE TABLE p0 PARTITION OF dist_part_table +FOR VALUES FROM ('2020-01-01') TO ('2020-02-01') +USING columnar; +SELECT create_distributed_table('dist_part_table', 'dist_col'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- columnar partition +CREATE TABLE p1 PARTITION OF dist_part_table +FOR VALUES FROM ('2020-02-01') TO ('2020-03-01') +USING columnar; +-- row partition +CREATE TABLE p2 PARTITION OF dist_part_table +FOR VALUES FROM ('2020-03-01') TO ('2020-04-01'); +INSERT INTO dist_part_table VALUES (1, '2020-03-15', 'str1', POINT(1, 1)); +ERROR: INSERT has more expressions than target columns +-- insert into columnar partitions +INSERT INTO dist_part_table VALUES (1, '2020-01-15', 'str2', POINT(2, 2)); +ERROR: INSERT has more expressions than target columns +INSERT INTO dist_part_table VALUES (1, '2020-02-15', 'str3', POINT(3, 3)); +ERROR: INSERT has more expressions than target columns +-- create another index after creating a columnar partition +CREATE UNIQUE INDEX dist_part_table_unique ON dist_part_table (dist_col, part_col); +-- verify that indexes are created on columnar partitions +SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p0'; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p1'; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +-- unsupported index types -- +-- gin -- +CREATE TABLE testjsonb (j JSONB) USING columnar; +INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(1,10) i; +CREATE INDEX jidx ON testjsonb USING GIN (j); +ERROR: only btree and hash indexes are supported on columnar tables +INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(15,20) i; +-- gist -- +CREATE TABLE gist_point_tbl(id INT4, p POINT) USING columnar; +INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(1, 10) g; +CREATE INDEX gist_pointidx ON gist_point_tbl USING gist(p); +ERROR: only btree and hash indexes are supported on columnar tables +INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(10, 20) g; +-- sp gist -- +CREATE TABLE box_temp (f1 box) USING columnar; +INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i; +CREATE INDEX box_spgist ON box_temp USING spgist (f1); +ERROR: only btree and hash indexes are supported on columnar tables +INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i; +-- brin -- +CREATE TABLE brin_summarize (value int) USING columnar; +CREATE INDEX brin_summarize_idx ON brin_summarize USING brin (value) WITH (pages_per_range=2); +ERROR: only btree and hash indexes are supported on columnar tables +-- Show that we safely fallback to serial index build. 
+CREATE TABLE parallel_scan_test(a int) USING columnar WITH ( parallel_workers = 2 ); +INSERT INTO parallel_scan_test SELECT i FROM generate_series(1,10) i; +CREATE INDEX ON parallel_scan_test (a); +NOTICE: falling back to serial index build since parallel scan on columnar tables is not supported +VACUUM FULL parallel_scan_test; +NOTICE: falling back to serial index build since parallel scan on columnar tables is not supported +REINDEX TABLE parallel_scan_test; +NOTICE: falling back to serial index build since parallel scan on columnar tables is not supported SET client_min_messages TO WARNING; DROP SCHEMA columnar_indexes CASCADE; diff --git a/src/test/regress/expected/columnar_insert.out b/src/test/regress/expected/columnar_insert.out index 5f71f4949..ff950e1e0 100644 --- a/src/test/regress/expected/columnar_insert.out +++ b/src/test/regress/expected/columnar_insert.out @@ -45,6 +45,14 @@ select count(*) from test_insert_command; 3 (1 row) +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_insert_command'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 4 | 450001 | 32686 +(1 row) + SELECT * FROM columnar_test_helpers.chunk_group_consistency; consistent --------------------------------------------------------------------- @@ -141,6 +149,14 @@ FROM test_toast_columnar; 5004 | 5004 | 5004 | 5004 (1 row) +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_toast_columnar'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 2 | 150001 | 16428 +(1 row) + SELECT * FROM columnar_test_helpers.chunk_group_consistency; consistent --------------------------------------------------------------------- @@ -173,6 +189,14 @@ INSERT INTO zero_col_heap SELECT * FROM zero_col_heap; INSERT INTO zero_col_heap SELECT * FROM zero_col_heap; INSERT INTO zero_col_heap SELECT * FROM zero_col_heap; INSERT INTO zero_col SELECT * FROM zero_col_heap; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('zero_col'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 6 | 750001 | 16336 +(1 row) + SELECT relname, stripe_num, chunk_group_count, row_count FROM columnar.stripe a, pg_class b WHERE columnar_test_helpers.columnar_relation_storageid(b.oid)=a.storage_id AND relname = 'zero_col' ORDER BY 1,2,3,4; diff --git a/src/test/regress/expected/columnar_rollback.out b/src/test/regress/expected/columnar_rollback.out index 91c73bf63..6853891fe 100644 --- a/src/test/regress/expected/columnar_rollback.out +++ b/src/test/regress/expected/columnar_rollback.out @@ -14,6 +14,14 @@ SELECT count(*) FROM t; 0 (1 row) +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 1 | 150001 | 16336 +(1 row) + -- 
check stripe metadata also have been rolled-back SELECT count(*) FROM t_stripes; count @@ -46,6 +54,14 @@ SELECT count(*) FROM t; -- force flush SAVEPOINT s1; INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 3 | 600001 | 24606 +(1 row) + SELECT count(*) FROM t; count --------------------------------------------------------------------- @@ -68,6 +84,14 @@ SELECT count(*) FROM t; INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i; COMMIT; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 5 | 750001 | 40942 +(1 row) + SELECT count(*) FROM t; count --------------------------------------------------------------------- diff --git a/src/test/regress/expected/columnar_test_helpers.out b/src/test/regress/expected/columnar_test_helpers.out index b28086243..2f7cad23b 100644 --- a/src/test/regress/expected/columnar_test_helpers.out +++ b/src/test/regress/expected/columnar_test_helpers.out @@ -3,6 +3,16 @@ SET search_path TO columnar_test_helpers; CREATE FUNCTION columnar_relation_storageid(relid oid) RETURNS bigint LANGUAGE C STABLE STRICT AS 'citus', $$columnar_relation_storageid$$; +CREATE OR REPLACE FUNCTION columnar_storage_info( + rel regclass, + version_major OUT int4, + version_minor OUT int4, + storage_id OUT int8, + reserved_stripe_id OUT int8, + reserved_row_number OUT int8, + reserved_offset OUT int8) + STRICT + LANGUAGE c AS 'citus', $$columnar_storage_info$$; CREATE FUNCTION compression_type_supported(type text) RETURNS boolean AS $$ BEGIN diff --git a/src/test/regress/expected/columnar_truncate.out b/src/test/regress/expected/columnar_truncate.out index 55a496644..ad11ead66 100644 --- a/src/test/regress/expected/columnar_truncate.out +++ b/src/test/regress/expected/columnar_truncate.out @@ -43,7 +43,23 @@ SELECT * FROM columnar_test_helpers.chunk_group_consistency; t (1 row) +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('columnar_truncate_test'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 2 | 150001 | 16438 +(1 row) + TRUNCATE TABLE columnar_truncate_test; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('columnar_truncate_test'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 1 | 1 | 16336 +(1 row) + SELECT * FROM columnar_test_helpers.chunk_group_consistency; consistent --------------------------------------------------------------------- @@ -77,7 +93,7 @@ SELECT count(*) FROM columnar_truncate_test_compressed; SELECT pg_relation_size('columnar_truncate_test_compressed'); pg_relation_size 
--------------------------------------------------------------------- - 0 + 16384 (1 row) INSERT INTO columnar_truncate_test select a, a from generate_series(1, 10) a; diff --git a/src/test/regress/expected/columnar_vacuum.out b/src/test/regress/expected/columnar_vacuum.out index 949220632..75dadf152 100644 --- a/src/test/regress/expected/columnar_vacuum.out +++ b/src/test/regress/expected/columnar_vacuum.out @@ -25,6 +25,14 @@ SELECT count(*) FROM t_stripes; 3 (1 row) +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 4 | 450001 | 32756 +(1 row) + -- vacuum full should merge stripes together VACUUM FULL t; SELECT * FROM columnar_test_helpers.chunk_group_consistency; @@ -45,6 +53,14 @@ SELECT count(*) FROM t_stripes; 1 (1 row) +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 2 | 150001 | 16584 +(1 row) + -- test the case when all data cannot fit into a single stripe SELECT alter_columnar_table_set('t', stripe_row_limit => 1000); alter_columnar_table_set @@ -66,6 +82,14 @@ SELECT count(*) FROM t_stripes; (1 row) VACUUM FULL t; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 4 | 3001 | 53382 +(1 row) + SELECT * FROM columnar_test_helpers.chunk_group_consistency; consistent --------------------------------------------------------------------- @@ -215,6 +239,14 @@ compression rate: 1.25x total row count: 5530, stripe count: 5, average rows per stripe: 1106 chunk count: 7, containing data for dropped columns: 0, none compressed: 5, pglz compressed: 2 +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 16 | 21001 | 50686 +(1 row) + SELECT * FROM columnar_test_helpers.chunk_group_consistency; consistent --------------------------------------------------------------------- diff --git a/src/test/regress/expected/columnar_write_concurrency.out b/src/test/regress/expected/columnar_write_concurrency.out index 96a83ab61..2be90b141 100644 --- a/src/test/regress/expected/columnar_write_concurrency.out +++ b/src/test/regress/expected/columnar_write_concurrency.out @@ -140,3 +140,45 @@ a b 11 12 13 + +starting permutation: s1-truncate s1-begin s1-insert-10000-rows s2-begin s2-insert s2-commit s1-commit s1-verify-metadata +step s1-truncate: + TRUNCATE test_insert_concurrency; + +step s1-begin: + BEGIN; + +step s1-insert-10000-rows: + INSERT INTO test_insert_concurrency SELECT i, 2 * i FROM generate_series(1, 10000) i; + +step s2-begin: + BEGIN; + +step s2-insert: + INSERT INTO test_insert_concurrency SELECT i, 2 * i 
FROM generate_series(4, 6) i; + +step s2-commit: + COMMIT; + +step s1-commit: + COMMIT; + +step s1-verify-metadata: + WITH test_insert_concurrency_stripes AS ( + SELECT first_row_number, stripe_num, row_count + FROM columnar.stripe a, pg_class b + WHERE columnar_relation_storageid(b.oid)=a.storage_id AND + relname = 'test_insert_concurrency' + ) + SELECT + -- verify that table has two stripes .. + count(*) = 2 AND + -- .. and those stripes look like: + sum(case when stripe_num = 1 AND first_row_number = 150001 AND row_count = 3 then 1 end) = 1 AND + sum(case when stripe_num = 2 AND first_row_number = 1 AND row_count = 10000 then 1 end) = 1 + AS stripe_metadata_for_test_insert_concurrency_ok + FROM test_insert_concurrency_stripes; + +stripe_metadata_for_test_insert_concurrency_ok + +t diff --git a/src/test/regress/expected/multi_extension.out b/src/test/regress/expected/multi_extension.out index 48cf0298b..5a76a93b2 100644 --- a/src/test/regress/expected/multi_extension.out +++ b/src/test/regress/expected/multi_extension.out @@ -595,9 +595,11 @@ SELECT * FROM print_extension_changes(); -- Snapshot of state at 10.2-1 ALTER EXTENSION citus UPDATE TO '10.2-1'; SELECT * FROM print_extension_changes(); - previous_object | current_object + previous_object | current_object --------------------------------------------------------------------- -(0 rows) + | function citus_internal.downgrade_columnar_storage(regclass) void + | function citus_internal.upgrade_columnar_storage(regclass) void +(2 rows) DROP TABLE prev_objects, extension_diff; -- show running version diff --git a/src/test/regress/expected/multi_multiuser.out b/src/test/regress/expected/multi_multiuser.out index 937ea9ec9..6501861d4 100644 --- a/src/test/regress/expected/multi_multiuser.out +++ b/src/test/regress/expected/multi_multiuser.out @@ -259,7 +259,7 @@ ABORT; -- all below 5 commands should throw no permission errors -- read columnar metadata table SELECT * FROM columnar.stripe; - storage_id | stripe_num | file_offset | data_length | column_count | chunk_row_count | row_count | chunk_group_count + storage_id | stripe_num | file_offset | data_length | column_count | chunk_row_count | row_count | chunk_group_count | first_row_number --------------------------------------------------------------------- (0 rows) diff --git a/src/test/regress/expected/upgrade_columnar_metapage_after.out b/src/test/regress/expected/upgrade_columnar_metapage_after.out new file mode 100644 index 000000000..2c9fa5cad --- /dev/null +++ b/src/test/regress/expected/upgrade_columnar_metapage_after.out @@ -0,0 +1,122 @@ +\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"` +SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND + substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0 +AS upgrade_test_old_citus_version_ge_10_0; + upgrade_test_old_citus_version_ge_10_0 +--------------------------------------------------------------------- + t +(1 row) + +\gset +\if :upgrade_test_old_citus_version_ge_10_0 +\else +\q +\endif +-- it's not the best practice to define this here, but we don't want to include +-- columnar_test_helpers in upgrade test schedule +CREATE OR REPLACE FUNCTION columnar_storage_info( + rel regclass, + version_major OUT int4, + version_minor OUT int4, + storage_id OUT int8, + reserved_stripe_id OUT int8, + reserved_row_number OUT int8, + reserved_offset OUT int8) +STRICT +LANGUAGE c AS 'citus', 'columnar_storage_info'; +CREATE VIEW columnar_table_stripe_info AS +SELECT 
columnar_table_storageids.relname relname, + columnar.stripe.stripe_num stripe_num, + columnar.stripe.row_count row_count, + columnar.stripe.first_row_number first_row_number +FROM columnar.stripe, +( + SELECT c.oid relid, c.relname relname, (columnar_storage_info(c.oid)).storage_id relstorageid + FROM pg_class c, pg_am a + WHERE c.relam = a.oid AND amname = 'columnar' +) columnar_table_storageids +WHERE relstorageid = columnar.stripe.storage_id; +SET search_path TO upgrade_columnar_metapage, public; +-- show that first_row_number values are equal to MAX(row_count) * stripe_num + COLUMNAR_FIRST_ROW_NUMBER +SELECT * FROM columnar_table_stripe_info ORDER BY relname, stripe_num; + relname | stripe_num | row_count | first_row_number +--------------------------------------------------------------------- + columnar_table_1 | 1 | 150000 | 1 + columnar_table_1 | 2 | 10000 | 150001 + columnar_table_2 | 1 | 1000 | 1 + columnar_table_2 | 2 | 901 | 150001 + columnar_table_3 | 1 | 2 | 1 +(5 rows) + +-- should work since we upgrade metapages when upgrading schema version +INSERT INTO columnar_table_1 VALUES (3); +-- state of stripe metadata for columnar_table_1 after post-upgrade insert +SELECT * FROM columnar_table_stripe_info WHERE relname = 'columnar_table_1' ORDER BY stripe_num; + relname | stripe_num | row_count | first_row_number +--------------------------------------------------------------------- + columnar_table_1 | 1 | 150000 | 1 + columnar_table_1 | 2 | 10000 | 150001 + columnar_table_1 | 3 | 1 | 160001 +(3 rows) + +-- show that all columnar relation's metapage's are upgraded to "2.0" +SELECT count(*)=0 +FROM (SELECT (columnar_storage_info(c.oid)).* t + FROM pg_class c, pg_am a + WHERE c.relam = a.oid AND amname = 'columnar') t +WHERE t.version_major != 2 and t.version_minor != 0; + ?column? 
+--------------------------------------------------------------------- + t +(1 row) + +-- print metapage for two of the tables +SELECT columnar_storage_info('columnar_table_1'); + columnar_storage_info +--------------------------------------------------------------------- + (2,0,10000000000,4,310001,481936) +(1 row) + +SELECT columnar_storage_info('columnar_table_2'); + columnar_storage_info +--------------------------------------------------------------------- + (2,0,10000000001,3,150902,26694) +(1 row) + +-- show that no_data_columnar_table also has metapage after upgrade +SELECT columnar_storage_info('no_data_columnar_table'); + columnar_storage_info +--------------------------------------------------------------------- + (2,0,10000000003,1,1,16336) +(1 row) + +-- table is already upgraded, make sure that upgrade_columnar_metapage is no-op +SELECT citus_internal.upgrade_columnar_storage(c.oid) +FROM pg_class c, pg_am a +WHERE c.relam = a.oid AND amname = 'columnar' and relname = 'columnar_table_2'; + upgrade_columnar_storage +--------------------------------------------------------------------- + +(1 row) + +SELECT columnar_storage_info('columnar_table_2'); + columnar_storage_info +--------------------------------------------------------------------- + (2,0,10000000001,3,150902,26694) +(1 row) + +VACUUM FULL columnar_table_2; +-- print metapage and stripe metadata after post-upgrade vacuum full +SELECT columnar_storage_info('columnar_table_2'); + columnar_storage_info +--------------------------------------------------------------------- + (2,0,10000000004,3,2001,26694) +(1 row) + +SELECT * FROM columnar_table_stripe_info WHERE relname = 'columnar_table_2' ORDER BY stripe_num; + relname | stripe_num | row_count | first_row_number +--------------------------------------------------------------------- + columnar_table_2 | 1 | 1000 | 1 + columnar_table_2 | 2 | 901 | 1001 +(2 rows) + diff --git a/src/test/regress/expected/upgrade_columnar_metapage_after_0.out b/src/test/regress/expected/upgrade_columnar_metapage_after_0.out new file mode 100644 index 000000000..3d2de4490 --- /dev/null +++ b/src/test/regress/expected/upgrade_columnar_metapage_after_0.out @@ -0,0 +1,13 @@ +\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"` +SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND + substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0 +AS upgrade_test_old_citus_version_ge_10_0; + upgrade_test_old_citus_version_ge_10_0 +--------------------------------------------------------------------- + f +(1 row) + +\gset +\if :upgrade_test_old_citus_version_ge_10_0 +\else +\q diff --git a/src/test/regress/expected/upgrade_columnar_metapage_before.out b/src/test/regress/expected/upgrade_columnar_metapage_before.out new file mode 100644 index 000000000..fc484b37d --- /dev/null +++ b/src/test/regress/expected/upgrade_columnar_metapage_before.out @@ -0,0 +1,31 @@ +\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"` +SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND + substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0 +AS upgrade_test_old_citus_version_ge_10_0; + upgrade_test_old_citus_version_ge_10_0 +--------------------------------------------------------------------- + t +(1 row) + +\gset +\if :upgrade_test_old_citus_version_ge_10_0 +\else +\q +\endif +CREATE SCHEMA upgrade_columnar_metapage; +SET search_path TO upgrade_columnar_metapage, 
public; +CREATE TABLE columnar_table_1(a INT, b INT) USING columnar; +INSERT INTO columnar_table_1 SELECT i FROM generate_series(160001, 320000) i; +CREATE TABLE columnar_table_2(b INT) USING columnar; +SELECT alter_columnar_table_set('columnar_table_2', + chunk_group_row_limit => 100, + stripe_row_limit => 1000); + alter_columnar_table_set +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO columnar_table_2 SELECT i FROM generate_series(1600, 3500) i; +CREATE TABLE columnar_table_3(b INT) USING columnar; +INSERT INTO columnar_table_3 VALUES (1), (2); +CREATE TABLE no_data_columnar_table(a INT, b INT, c TEXT) USING columnar; diff --git a/src/test/regress/expected/upgrade_columnar_metapage_before_0.out b/src/test/regress/expected/upgrade_columnar_metapage_before_0.out new file mode 100644 index 000000000..3d2de4490 --- /dev/null +++ b/src/test/regress/expected/upgrade_columnar_metapage_before_0.out @@ -0,0 +1,13 @@ +\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"` +SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND + substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0 +AS upgrade_test_old_citus_version_ge_10_0; + upgrade_test_old_citus_version_ge_10_0 +--------------------------------------------------------------------- + f +(1 row) + +\gset +\if :upgrade_test_old_citus_version_ge_10_0 +\else +\q diff --git a/src/test/regress/expected/upgrade_list_citus_objects.out b/src/test/regress/expected/upgrade_list_citus_objects.out index 2b7d7468d..6be33257a 100644 --- a/src/test/regress/expected/upgrade_list_citus_objects.out +++ b/src/test/regress/expected/upgrade_list_citus_objects.out @@ -57,6 +57,7 @@ ORDER BY 1; function citus_extradata_container(internal) function citus_finish_pg_upgrade() function citus_get_active_worker_nodes() + function citus_internal.downgrade_columnar_storage(regclass) function citus_internal.find_groupid_for_node(text,integer) function citus_internal.pg_dist_node_trigger_func() function citus_internal.pg_dist_rebalance_strategy_trigger_func() @@ -64,6 +65,7 @@ ORDER BY 1; function citus_internal.refresh_isolation_tester_prepared_statement() function citus_internal.replace_isolation_tester_func() function citus_internal.restore_isolation_tester_func() + function citus_internal.upgrade_columnar_storage(regclass) function citus_isolation_test_session_is_blocked(integer,integer[]) function citus_json_concatenate(json,json) function citus_json_concatenate_final(json) @@ -246,5 +248,5 @@ ORDER BY 1; view citus_worker_stat_activity view pg_dist_shard_placement view time_partitions -(230 rows) +(232 rows) diff --git a/src/test/regress/expected/upgrade_list_citus_objects_0.out b/src/test/regress/expected/upgrade_list_citus_objects_0.out index e3c5322bd..045b538f2 100644 --- a/src/test/regress/expected/upgrade_list_citus_objects_0.out +++ b/src/test/regress/expected/upgrade_list_citus_objects_0.out @@ -53,6 +53,7 @@ ORDER BY 1; function citus_extradata_container(internal) function citus_finish_pg_upgrade() function citus_get_active_worker_nodes() + function citus_internal.downgrade_columnar_storage(regclass) function citus_internal.find_groupid_for_node(text,integer) function citus_internal.pg_dist_node_trigger_func() function citus_internal.pg_dist_rebalance_strategy_enterprise_check() @@ -61,6 +62,7 @@ ORDER BY 1; function citus_internal.refresh_isolation_tester_prepared_statement() function citus_internal.replace_isolation_tester_func() function 
citus_internal.restore_isolation_tester_func() + function citus_internal.upgrade_columnar_storage(regclass) function citus_isolation_test_session_is_blocked(integer,integer[]) function citus_json_concatenate(json,json) function citus_json_concatenate_final(json) @@ -238,5 +240,5 @@ ORDER BY 1; view citus_worker_stat_activity view pg_dist_shard_placement view time_partitions -(222 rows) +(224 rows) diff --git a/src/test/regress/input/columnar_load.source b/src/test/regress/input/columnar_load.source index b4b72926c..e00107d83 100644 --- a/src/test/regress/input/columnar_load.source +++ b/src/test/regress/input/columnar_load.source @@ -15,14 +15,20 @@ COPY contestant FROM '@abs_srcdir@/data/contestants.1.csv' WITH CSV; -- COPY into uncompressed table from program COPY contestant FROM PROGRAM 'cat @abs_srcdir@/data/contestants.2.csv' WITH CSV; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('contestant'); + -- COPY into compressed table -set columnar.compression = 'pglz'; COPY contestant_compressed FROM '@abs_srcdir@/data/contestants.1.csv' WITH CSV; -- COPY into uncompressed table from program COPY contestant_compressed FROM PROGRAM 'cat @abs_srcdir@/data/contestants.2.csv' WITH CSV; -set columnar.compression to default; + +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('contestant_compressed'); -- Test column list CREATE TABLE famous_constants (id int, name text, value real) diff --git a/src/test/regress/output/columnar_load.source b/src/test/regress/output/columnar_load.source index c58f0bfb0..7f5b4ddcf 100644 --- a/src/test/regress/output/columnar_load.source +++ b/src/test/regress/output/columnar_load.source @@ -14,13 +14,27 @@ DETAIL: command not found COPY contestant FROM '@abs_srcdir@/data/contestants.1.csv' WITH CSV; -- COPY into uncompressed table from program COPY contestant FROM PROGRAM 'cat @abs_srcdir@/data/contestants.2.csv' WITH CSV; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('contestant'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 3 | 300001 | 24742 +(1 row) + -- COPY into compressed table -set columnar.compression = 'pglz'; COPY contestant_compressed FROM '@abs_srcdir@/data/contestants.1.csv' WITH CSV; -- COPY into uncompressed table from program COPY contestant_compressed FROM PROGRAM 'cat @abs_srcdir@/data/contestants.2.csv' WITH CSV; -set columnar.compression to default; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('contestant_compressed'); + version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset +--------------------------------------------------------------------- + 2 | 0 | 3 | 300001 | 24704 +(1 row) + -- Test column list CREATE TABLE famous_constants (id int, name text, value real) USING columnar; diff --git a/src/test/regress/spec/columnar_write_concurrency.spec b/src/test/regress/spec/columnar_write_concurrency.spec index 5fe32b607..edfead34a 100644 --- a/src/test/regress/spec/columnar_write_concurrency.spec +++ b/src/test/regress/spec/columnar_write_concurrency.spec @@ -1,6 +1,10 @@ setup { CREATE 
TABLE test_insert_concurrency (a int, b int) USING columnar; + + CREATE OR REPLACE FUNCTION columnar_relation_storageid(relid oid) RETURNS bigint + LANGUAGE C STABLE STRICT + AS 'citus', $$columnar_relation_storageid$$; } teardown { @@ -20,6 +24,11 @@ step "s1-insert" { INSERT INTO test_insert_concurrency SELECT i, 2 * i FROM generate_series(1, 3) i; } +step "s1-insert-10000-rows" +{ + INSERT INTO test_insert_concurrency SELECT i, 2 * i FROM generate_series(1, 10000) i; +} + step "s1-copy" { COPY test_insert_concurrency(a) FROM PROGRAM 'seq 11 13'; @@ -30,6 +39,29 @@ step "s1-select" { SELECT * FROM test_insert_concurrency ORDER BY a; } +step "s1-truncate" +{ + TRUNCATE test_insert_concurrency; +} + +step "s1-verify-metadata" +{ + WITH test_insert_concurrency_stripes AS ( + SELECT first_row_number, stripe_num, row_count + FROM columnar.stripe a, pg_class b + WHERE columnar_relation_storageid(b.oid)=a.storage_id AND + relname = 'test_insert_concurrency' + ) + SELECT + -- verify that table has two stripes .. + count(*) = 2 AND + -- .. and those stripes look like: + sum(case when stripe_num = 1 AND first_row_number = 150001 AND row_count = 3 then 1 end) = 1 AND + sum(case when stripe_num = 2 AND first_row_number = 1 AND row_count = 10000 then 1 end) = 1 + AS stripe_metadata_for_test_insert_concurrency_ok + FROM test_insert_concurrency_stripes; +} + step "s1-commit" { COMMIT; @@ -65,3 +97,9 @@ permutation "s1-begin" "s2-begin" "s1-copy" "s2-insert" "s1-select" "s2-select" // insert vs copy permutation "s1-begin" "s2-begin" "s2-insert" "s1-copy" "s1-select" "s2-select" "s1-commit" "s2-commit" "s1-select" + +# insert vs insert +# Start inserting rows in session 1 and reserve first_row_number 1 for it, but commit session 2 before session 1. +# Then verify that while the stripe written by session 2 has the greater first_row_number, the stripe written by session 1 has +# the greater stripe_num. This is because we reserve stripe_num and first_row_number at different times.
+permutation "s1-truncate" "s1-begin" "s1-insert-10000-rows" "s2-begin" "s2-insert" "s2-commit" "s1-commit" "s1-verify-metadata" diff --git a/src/test/regress/sql/columnar_alter.sql b/src/test/regress/sql/columnar_alter.sql index b9b4498cb..ecd647fe1 100644 --- a/src/test/regress/sql/columnar_alter.sql +++ b/src/test/regress/sql/columnar_alter.sql @@ -17,6 +17,10 @@ INSERT INTO test_alter_table SELECT * FROM sample_data; -- drop a column ALTER TABLE test_alter_table DROP COLUMN a; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_alter_table'); + -- test analyze ANALYZE test_alter_table; @@ -36,6 +40,10 @@ SELECT * FROM test_alter_table; INSERT INTO test_alter_table (SELECT 3, 5, 8); SELECT * FROM test_alter_table; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_alter_table'); + -- add a fixed-length column with default value ALTER TABLE test_alter_table ADD COLUMN e int default 3; @@ -43,6 +51,10 @@ SELECT * from test_alter_table; INSERT INTO test_alter_table (SELECT 1, 2, 4, 8); SELECT * from test_alter_table; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_alter_table'); + -- add a variable-length column with default value ALTER TABLE test_alter_table ADD COLUMN f text DEFAULT 'TEXT ME'; @@ -203,21 +215,25 @@ ALTER TABLE products DROP CONSTRAINT dummy_constraint; INSERT INTO products VALUES (3, 'pen', 2); SELECT * FROM products ORDER BY 1; --- Add a UNIQUE constraint (should fail) -CREATE TABLE products_fail ( +-- Add a UNIQUE constraint +CREATE TABLE products_unique ( product_no integer UNIQUE, name text, price numeric ) USING columnar; ALTER TABLE products ADD COLUMN store_id text UNIQUE; --- Add a PRIMARY KEY constraint (should fail) -CREATE TABLE products_fail ( +-- Add a PRIMARY KEY constraint +CREATE TABLE products_primary ( product_no integer PRIMARY KEY, name text, price numeric ) USING columnar; -ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY; + +BEGIN; + ALTER TABLE products DROP COLUMN store_id; + ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY; +ROLLBACK; -- Add an EXCLUSION constraint (should fail) CREATE TABLE circles ( diff --git a/src/test/regress/sql/columnar_create.sql b/src/test/regress/sql/columnar_create.sql index 41ffc1053..f83b1c2a8 100644 --- a/src/test/regress/sql/columnar_create.sql +++ b/src/test/regress/sql/columnar_create.sql @@ -7,12 +7,11 @@ CREATE TABLE contestant (handle TEXT, birthdate DATE, rating INT, percentile FLOAT, country CHAR(3), achievements TEXT[]) USING columnar; +SELECT alter_columnar_table_set('contestant', compression => 'none'); --- should fail CREATE INDEX contestant_idx on contestant(handle); --- Create compressed table with automatically determined file path --- COMPRESSED +-- Create zstd compressed table CREATE TABLE contestant_compressed (handle TEXT, birthdate DATE, rating INT, percentile FLOAT, country CHAR(3), achievements TEXT[]) USING columnar; diff --git a/src/test/regress/sql/columnar_empty.sql b/src/test/regress/sql/columnar_empty.sql index 45098d3c8..a3699fa58 100644 --- a/src/test/regress/sql/columnar_empty.sql +++ b/src/test/regress/sql/columnar_empty.sql @@ -19,6 +19,14 @@ select count(*) from t_uncompressed; select * from t_compressed; select count(*) from t_compressed; +-- check storage 
+select
+  version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
+  from columnar_test_helpers.columnar_storage_info('t_compressed');
+select
+  version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
+  from columnar_test_helpers.columnar_storage_info('t_uncompressed');
+
 -- explain
 explain (costs off, summary off, timing off) select * from t_uncompressed;
 explain (costs off, summary off, timing off) select * from t_compressed;
@@ -31,6 +39,14 @@ vacuum verbose t_uncompressed;
 vacuum full t_compressed;
 vacuum full t_uncompressed;

+-- check storage
+select
+  version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
+  from columnar_test_helpers.columnar_storage_info('t_compressed');
+select
+  version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
+  from columnar_test_helpers.columnar_storage_info('t_uncompressed');
+
 -- analyze
 analyze t_uncompressed;
 analyze t_compressed;
@@ -43,6 +59,14 @@ truncate t_compressed;
 alter table t_uncompressed alter column a type text;
 alter table t_compressed alter column a type text;

+-- check storage
+select
+  version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
+  from columnar_test_helpers.columnar_storage_info('t_compressed');
+select
+  version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
+  from columnar_test_helpers.columnar_storage_info('t_uncompressed');
+
 -- verify cost of scanning an empty table is zero, not NaN
 explain table t_uncompressed;
 explain table t_compressed;
diff --git a/src/test/regress/sql/columnar_first_row_number.sql b/src/test/regress/sql/columnar_first_row_number.sql
new file mode 100644
index 000000000..b1d0e7bd4
--- /dev/null
+++ b/src/test/regress/sql/columnar_first_row_number.sql
@@ -0,0 +1,43 @@
+CREATE SCHEMA columnar_first_row_number;
+SET search_path TO columnar_first_row_number;
+
+CREATE TABLE col_table_1 (a int) USING columnar;
+
+INSERT INTO col_table_1 SELECT i FROM generate_series(1, 10) i;
+
+BEGIN;
+  -- we don't reuse the same first_row_number even if the xact is rolled back
+  INSERT INTO col_table_1 SELECT i FROM generate_series(1, 11) i;
+ROLLBACK;
+
+INSERT INTO col_table_1 SELECT i FROM generate_series(1, 12) i;
+
+SELECT alter_columnar_table_set('col_table_1', stripe_row_limit => 1000);
+
+INSERT INTO col_table_1 SELECT i FROM generate_series(1, 2350) i;
+
+SELECT row_count, first_row_number FROM columnar.stripe a
+WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
+ORDER BY stripe_num;
+
+VACUUM FULL col_table_1;
+
+-- show that we properly update first_row_number after VACUUM FULL
+SELECT row_count, first_row_number FROM columnar.stripe a
+WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
+ORDER BY stripe_num;
+
+TRUNCATE col_table_1;
+
+BEGIN;
+  INSERT INTO col_table_1 SELECT i FROM generate_series(1, 16) i;
+  INSERT INTO col_table_1 SELECT i FROM generate_series(1, 16) i;
+COMMIT;
+
+-- show that we start with first_row_number=1 after TRUNCATE
+SELECT row_count, first_row_number FROM columnar.stripe a
+WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
+ORDER BY stripe_num;
+
+SET client_min_messages TO ERROR;
+DROP SCHEMA columnar_first_row_number CASCADE;
diff --git a/src/test/regress/sql/columnar_indexes.sql b/src/test/regress/sql/columnar_indexes.sql
index 831699dc4..43ad75221 100644
---
a/src/test/regress/sql/columnar_indexes.sql +++ b/src/test/regress/sql/columnar_indexes.sql @@ -17,13 +17,283 @@ explain insert into t values (1, 2); insert into t values (1, 2); SELECT * FROM t; --- create index without the concurrent option. We should --- error out during index creation. create index t_idx on t(a, b); \d t explain insert into t values (1, 2); insert into t values (3, 4); SELECT * FROM t; +-- make sure that we test index scan +set columnar.enable_custom_scan to 'off'; +set enable_seqscan to off; + +CREATE table columnar_table (a INT, b int) USING columnar; +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(0, 16000) i; + +-- unique -- +BEGIN; + INSERT INTO columnar_table VALUES (100000000); + SAVEPOINT s1; + -- errors out due to unflushed data in upper transaction + CREATE UNIQUE INDEX ON columnar_table (a); +ROLLBACK; + +CREATE UNIQUE INDEX ON columnar_table (a); + +BEGIN; + INSERT INTO columnar_table VALUES (16050); + SAVEPOINT s1; + -- index scan errors out due to unflushed data in upper transaction + SELECT a FROM columnar_table WHERE a = 16050; +ROLLBACK; + +EXPLAIN (COSTS OFF) SELECT * FROM columnar_table WHERE a=6456; +EXPLAIN (COSTS OFF) SELECT a FROM columnar_table WHERE a=6456; +SELECT (SELECT a FROM columnar_table WHERE a=6456 limit 1)=6456; +SELECT (SELECT b FROM columnar_table WHERE a=6456 limit 1)=6456*2; + +-- even if a=16050 doesn't exist, we try to insert it twice so this should error out +INSERT INTO columnar_table VALUES (16050), (16050); + +-- should work +INSERT INTO columnar_table VALUES (16050); + +-- check edge cases around stripe boundaries, error out +INSERT INTO columnar_table VALUES (16050); +INSERT INTO columnar_table VALUES (15999); + +DROP INDEX columnar_table_a_idx; + +CREATE TABLE partial_unique_idx_test (a INT, b INT) USING columnar; +CREATE UNIQUE INDEX ON partial_unique_idx_test (a) +WHERE b > 500; + +-- should work since b =< 500 and our partial index doesn't check this interval +INSERT INTO partial_unique_idx_test VALUES (1, 2), (1, 2); + +-- should work since our partial index wouldn't cover the tuples that we inserted above +INSERT INTO partial_unique_idx_test VALUES (1, 800); + +INSERT INTO partial_unique_idx_test VALUES (4, 600); + +-- should error out due to (4, 600) +INSERT INTO partial_unique_idx_test VALUES (4, 700); + +-- btree -- +CREATE INDEX ON columnar_table (a); +SELECT (SELECT SUM(b) FROM columnar_table WHERE a>700 and a<965)=439560; + +CREATE INDEX ON columnar_table (b) +WHERE (b > 30000 AND b < 33000); + +-- partial index should be way smaller than the non-partial index +SELECT pg_total_relation_size('columnar_table_b_idx') * 5 < + pg_total_relation_size('columnar_table_a_idx'); + +-- can't use index scan due to partial index boundaries +EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30000; +-- can use index scan +EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30001; + +-- some more rows +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 17000) i; + +DROP INDEX columnar_table_a_idx; +TRUNCATE columnar_table; + +-- pkey -- +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 16499) i; +ALTER TABLE columnar_table ADD PRIMARY KEY (a); +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16500, 17000) i; + +BEGIN; + INSERT INTO columnar_table (a) SELECT 1; +ROLLBACK; + +-- should work +INSERT INTO columnar_table (a) SELECT 1; + +-- error out +INSERT INTO columnar_table VALUES (16100), (16101); +INSERT INTO 
columnar_table VALUES (16999);
+
+BEGIN;
+  REINDEX INDEX columnar_table_pkey;
+  -- should error even after reindex
+  INSERT INTO columnar_table VALUES (16999);
+ROLLBACK;
+
+VACUUM FULL columnar_table;
+
+-- show that we don't support clustering columnar tables using indexes
+CLUSTER columnar_table USING columnar_table_pkey;
+
+ALTER TABLE columnar_table CLUSTER ON columnar_table_pkey;
+CLUSTER columnar_table;
+
+-- should error even after vacuum
+INSERT INTO columnar_table VALUES (16999);
+
+TRUNCATE columnar_table;
+INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i;
+SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
+
+TRUNCATE columnar_table;
+ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
+
+-- hash --
+INSERT INTO columnar_table (a, b) SELECT i*2,i FROM generate_series(1, 8000) i;
+CREATE INDEX hash_idx ON columnar_table USING HASH (b);
+
+BEGIN;
+  CREATE INDEX hash_idx_fill_factor ON columnar_table USING HASH (b) WITH (fillfactor=10);
+  -- same hash index with lower fillfactor should be way bigger
+  SELECT pg_total_relation_size ('hash_idx_fill_factor') >
+         pg_total_relation_size ('hash_idx') * 5;
+ROLLBACK;
+
+BEGIN;
+  INSERT INTO columnar_table (a, b) SELECT i*3,i FROM generate_series(1, 8000) i;
+ROLLBACK;
+
+INSERT INTO columnar_table (a, b) SELECT i*4,i FROM generate_series(1, 8000) i;
+
+SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+
+BEGIN;
+  REINDEX TABLE columnar_table;
+  SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+ROLLBACK;
+
+VACUUM FULL columnar_table;
+SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+
+-- exclusion constraints --
+CREATE TABLE exclusion_test (c1 INT, c2 INT, c3 INT, c4 BOX,
+EXCLUDE USING btree (c1 WITH =) INCLUDE(c3,c4) WHERE (c1 < 10)) USING columnar;
+
+-- error out since "c1" is "1" for all rows to be inserted
+INSERT INTO exclusion_test SELECT 1, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
+
+BEGIN;
+  INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
+ROLLBACK;
+
+-- should work
+INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
+
+INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
+
+BEGIN;
+  -- should work thanks to "where" clause in exclusion constraint
+  INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
+ROLLBACK;
+
+REINDEX TABLE exclusion_test;
+-- should still work after reindex
+INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
+
+-- make sure that we respect INCLUDE syntax --
+
+CREATE TABLE include_test (a INT, b BIGINT, c BIGINT, d BIGINT) USING columnar;
+
+INSERT INTO include_test SELECT i, i, i, i FROM generate_series (1, 1000) i;
+
+CREATE UNIQUE INDEX unique_a ON include_test (a);
+
+-- cannot use index only scan
+EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
+
+CREATE UNIQUE INDEX unique_a_include_b_c_d ON include_test (a) INCLUDE(b, c, d);
+
+-- same unique index that includes other columns should be way bigger
+SELECT pg_total_relation_size ('unique_a') * 1.5 <
+       pg_total_relation_size ('unique_a_include_b_c_d');
+
+DROP INDEX unique_a;
+
+-- should use index only scan since unique_a_include_b_c_d includes column "b" too
+EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
+
+BEGIN;
+  SET enable_indexonlyscan = OFF;
+  -- show that we respect enable_indexonlyscan GUC
+  EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
+ROLLBACK;
+
+-- make sure that we read the correct value for "b" when doing index only scan
+SELECT b=980 FROM include_test WHERE a = 980;
+
+-- some tests with distributed & partitioned tables --
+
+CREATE TABLE dist_part_table(
+  dist_col INT,
+  part_col TIMESTAMPTZ,
+  col1 TEXT
+) PARTITION BY RANGE (part_col);
+
+-- create an index before creating a columnar partition
+CREATE INDEX dist_part_table_btree ON dist_part_table (col1);
+
+-- columnar partition
+CREATE TABLE p0 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-01-01') TO ('2020-02-01')
+USING columnar;
+
+SELECT create_distributed_table('dist_part_table', 'dist_col');
+
+-- columnar partition
+CREATE TABLE p1 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-02-01') TO ('2020-03-01')
+USING columnar;
+
+-- row partition
+CREATE TABLE p2 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-03-01') TO ('2020-04-01');
+
+INSERT INTO dist_part_table VALUES (1, '2020-03-15', 'str1');
+
+-- insert into columnar partitions
+INSERT INTO dist_part_table VALUES (1, '2020-01-15', 'str2');
+INSERT INTO dist_part_table VALUES (1, '2020-02-15', 'str3');
+
+-- create another index after creating a columnar partition
+CREATE UNIQUE INDEX dist_part_table_unique ON dist_part_table (dist_col, part_col);
+
+-- verify that indexes are created on columnar partitions
+SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p0';
+SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p1';
+
+-- unsupported index types --
+
+-- gin --
+CREATE TABLE testjsonb (j JSONB) USING columnar;
+INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(1,10) i;
+CREATE INDEX jidx ON testjsonb USING GIN (j);
+INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(15,20) i;
+
+-- gist --
+CREATE TABLE gist_point_tbl(id INT4, p POINT) USING columnar;
+INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(1, 10) g;
+CREATE INDEX gist_pointidx ON gist_point_tbl USING gist(p);
+INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(10, 20) g;
+
+-- sp gist --
+CREATE TABLE box_temp (f1 box) USING columnar;
+INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
+CREATE INDEX box_spgist ON box_temp USING spgist (f1);
+INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
+
+-- brin --
+CREATE TABLE brin_summarize (value int) USING columnar;
+CREATE INDEX brin_summarize_idx ON brin_summarize USING brin (value) WITH (pages_per_range=2);
+
+-- Show that we safely fall back to serial index build.
+CREATE TABLE parallel_scan_test(a int) USING columnar WITH ( parallel_workers = 2 ); +INSERT INTO parallel_scan_test SELECT i FROM generate_series(1,10) i; +CREATE INDEX ON parallel_scan_test (a); +VACUUM FULL parallel_scan_test; +REINDEX TABLE parallel_scan_test; + SET client_min_messages TO WARNING; DROP SCHEMA columnar_indexes CASCADE; diff --git a/src/test/regress/sql/columnar_insert.sql b/src/test/regress/sql/columnar_insert.sql index 7e3e9011f..ad4660fe5 100644 --- a/src/test/regress/sql/columnar_insert.sql +++ b/src/test/regress/sql/columnar_insert.sql @@ -22,6 +22,10 @@ select count(*) from test_insert_command_data; insert into test_insert_command select * from test_insert_command_data; select count(*) from test_insert_command; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_insert_command'); + SELECT * FROM columnar_test_helpers.chunk_group_consistency; drop table test_insert_command_data; @@ -99,6 +103,10 @@ SELECT pg_column_size(external), pg_column_size(extended) FROM test_toast_columnar; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('test_toast_columnar'); + SELECT * FROM columnar_test_helpers.chunk_group_consistency; DROP TABLE test_toast_row; @@ -128,6 +136,10 @@ INSERT INTO zero_col_heap SELECT * FROM zero_col_heap; INSERT INTO zero_col SELECT * FROM zero_col_heap; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('zero_col'); + SELECT relname, stripe_num, chunk_group_count, row_count FROM columnar.stripe a, pg_class b WHERE columnar_test_helpers.columnar_relation_storageid(b.oid)=a.storage_id AND relname = 'zero_col' ORDER BY 1,2,3,4; diff --git a/src/test/regress/sql/columnar_rollback.sql b/src/test/regress/sql/columnar_rollback.sql index 3022ab906..d20c68a3a 100644 --- a/src/test/regress/sql/columnar_rollback.sql +++ b/src/test/regress/sql/columnar_rollback.sql @@ -13,6 +13,10 @@ INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i; ROLLBACK; SELECT count(*) FROM t; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + -- check stripe metadata also have been rolled-back SELECT count(*) FROM t_stripes; @@ -28,6 +32,11 @@ INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i; SELECT count(*) FROM t; -- force flush SAVEPOINT s1; INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i; + +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + SELECT count(*) FROM t; ROLLBACK TO SAVEPOINT s1; SELECT count(*) FROM t; @@ -36,6 +45,10 @@ SELECT count(*) FROM t; INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i; COMMIT; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + SELECT count(*) FROM t; SELECT count(*) FROM t_stripes; diff --git a/src/test/regress/sql/columnar_test_helpers.sql b/src/test/regress/sql/columnar_test_helpers.sql index d240a98d8..08438a75f 100644 --- a/src/test/regress/sql/columnar_test_helpers.sql +++ b/src/test/regress/sql/columnar_test_helpers.sql @@ -5,6 +5,17 @@ CREATE FUNCTION columnar_relation_storageid(relid oid) RETURNS bigint 
LANGUAGE C STABLE STRICT AS 'citus', $$columnar_relation_storageid$$; +CREATE OR REPLACE FUNCTION columnar_storage_info( + rel regclass, + version_major OUT int4, + version_minor OUT int4, + storage_id OUT int8, + reserved_stripe_id OUT int8, + reserved_row_number OUT int8, + reserved_offset OUT int8) + STRICT + LANGUAGE c AS 'citus', $$columnar_storage_info$$; + CREATE FUNCTION compression_type_supported(type text) RETURNS boolean AS $$ BEGIN diff --git a/src/test/regress/sql/columnar_truncate.sql b/src/test/regress/sql/columnar_truncate.sql index 701b41ceb..bdbb00c23 100644 --- a/src/test/regress/sql/columnar_truncate.sql +++ b/src/test/regress/sql/columnar_truncate.sql @@ -27,8 +27,16 @@ SELECT * FROM columnar_truncate_test; SELECT * FROM columnar_test_helpers.chunk_group_consistency; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('columnar_truncate_test'); + TRUNCATE TABLE columnar_truncate_test; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('columnar_truncate_test'); + SELECT * FROM columnar_test_helpers.chunk_group_consistency; SELECT * FROM columnar_truncate_test; diff --git a/src/test/regress/sql/columnar_vacuum.sql b/src/test/regress/sql/columnar_vacuum.sql index 29babd84e..a2fe24feb 100644 --- a/src/test/regress/sql/columnar_vacuum.sql +++ b/src/test/regress/sql/columnar_vacuum.sql @@ -17,6 +17,10 @@ INSERT INTO t SELECT i, i * i FROM generate_series(21, 30) i; SELECT sum(a), sum(b) FROM t; SELECT count(*) FROM t_stripes; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + -- vacuum full should merge stripes together VACUUM FULL t; @@ -25,6 +29,10 @@ SELECT * FROM columnar_test_helpers.chunk_group_consistency; SELECT sum(a), sum(b) FROM t; SELECT count(*) FROM t_stripes; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + -- test the case when all data cannot fit into a single stripe SELECT alter_columnar_table_set('t', stripe_row_limit => 1000); INSERT INTO t SELECT i, 2 * i FROM generate_series(1,2500) i; @@ -34,6 +42,10 @@ SELECT count(*) FROM t_stripes; VACUUM FULL t; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); + SELECT * FROM columnar_test_helpers.chunk_group_consistency; SELECT sum(a), sum(b) FROM t; @@ -95,6 +107,9 @@ INSERT INTO t SELECT i / 5 FROM generate_series(1, 1500) i; COMMIT; VACUUM VERBOSE t; +select + version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset + from columnar_test_helpers.columnar_storage_info('t'); SELECT * FROM columnar_test_helpers.chunk_group_consistency; diff --git a/src/test/regress/sql/upgrade_columnar_metapage_after.sql b/src/test/regress/sql/upgrade_columnar_metapage_after.sql new file mode 100644 index 000000000..6a57fb07a --- /dev/null +++ b/src/test/regress/sql/upgrade_columnar_metapage_after.sql @@ -0,0 +1,73 @@ +\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"` +SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND + substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0 +AS upgrade_test_old_citus_version_ge_10_0; 
+\gset
+\if :upgrade_test_old_citus_version_ge_10_0
+\else
+\q
+\endif
+
+-- it's not the best practice to define this here, but we don't want to include
+-- columnar_test_helpers in upgrade test schedule
+CREATE OR REPLACE FUNCTION columnar_storage_info(
+  rel regclass,
+  version_major OUT int4,
+  version_minor OUT int4,
+  storage_id OUT int8,
+  reserved_stripe_id OUT int8,
+  reserved_row_number OUT int8,
+  reserved_offset OUT int8)
+STRICT
+LANGUAGE c AS 'citus', 'columnar_storage_info';
+
+CREATE VIEW columnar_table_stripe_info AS
+SELECT columnar_table_storageids.relname relname,
+       columnar.stripe.stripe_num stripe_num,
+       columnar.stripe.row_count row_count,
+       columnar.stripe.first_row_number first_row_number
+FROM columnar.stripe,
+(
+  SELECT c.oid relid, c.relname relname, (columnar_storage_info(c.oid)).storage_id relstorageid
+  FROM pg_class c, pg_am a
+  WHERE c.relam = a.oid AND amname = 'columnar'
+) columnar_table_storageids
+WHERE relstorageid = columnar.stripe.storage_id;
+
+SET search_path TO upgrade_columnar_metapage, public;
+
+-- show that first_row_number values are equal to MAX(row_count) * stripe_num + COLUMNAR_FIRST_ROW_NUMBER
+SELECT * FROM columnar_table_stripe_info ORDER BY relname, stripe_num;
+
+-- should work since we upgrade metapages when upgrading schema version
+INSERT INTO columnar_table_1 VALUES (3);
+
+-- state of stripe metadata for columnar_table_1 after post-upgrade insert
+SELECT * FROM columnar_table_stripe_info WHERE relname = 'columnar_table_1' ORDER BY stripe_num;
+
+-- show that all columnar relations' metapages are upgraded to "2.0"
+SELECT count(*)=0
+FROM (SELECT (columnar_storage_info(c.oid)).*
+      FROM pg_class c, pg_am a
+      WHERE c.relam = a.oid AND amname = 'columnar') t
+WHERE t.version_major != 2 OR t.version_minor != 0;
+
+-- print metapage for two of the tables
+SELECT columnar_storage_info('columnar_table_1');
+SELECT columnar_storage_info('columnar_table_2');
+
+-- show that no_data_columnar_table also has metapage after upgrade
+SELECT columnar_storage_info('no_data_columnar_table');
+
+-- table is already upgraded, make sure that upgrade_columnar_storage is a no-op
+SELECT citus_internal.upgrade_columnar_storage(c.oid)
+FROM pg_class c, pg_am a
+WHERE c.relam = a.oid AND amname = 'columnar' and relname = 'columnar_table_2';
+
+SELECT columnar_storage_info('columnar_table_2');
+
+VACUUM FULL columnar_table_2;
+
+-- print metapage and stripe metadata after post-upgrade vacuum full
+SELECT columnar_storage_info('columnar_table_2');
+SELECT * FROM columnar_table_stripe_info WHERE relname = 'columnar_table_2' ORDER BY stripe_num;
diff --git a/src/test/regress/sql/upgrade_columnar_metapage_before.sql b/src/test/regress/sql/upgrade_columnar_metapage_before.sql
new file mode 100644
index 000000000..4dd23727c
--- /dev/null
+++ b/src/test/regress/sql/upgrade_columnar_metapage_before.sql
@@ -0,0 +1,26 @@
+\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"`
+SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND
+       substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0
+AS upgrade_test_old_citus_version_ge_10_0;
+\gset
+\if :upgrade_test_old_citus_version_ge_10_0
+\else
+\q
+\endif
+
+CREATE SCHEMA upgrade_columnar_metapage;
+SET search_path TO upgrade_columnar_metapage, public;
+
+CREATE TABLE columnar_table_1(a INT, b INT) USING columnar;
+INSERT INTO columnar_table_1 SELECT i FROM generate_series(160001, 320000) i;
+
+CREATE TABLE columnar_table_2(b INT) USING
columnar; +SELECT alter_columnar_table_set('columnar_table_2', + chunk_group_row_limit => 100, + stripe_row_limit => 1000); +INSERT INTO columnar_table_2 SELECT i FROM generate_series(1600, 3500) i; + +CREATE TABLE columnar_table_3(b INT) USING columnar; +INSERT INTO columnar_table_3 VALUES (1), (2); + +CREATE TABLE no_data_columnar_table(a INT, b INT, c TEXT) USING columnar;
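As a rough usage sketch (not part of the patch itself): assuming the columnar_test_helpers.columnar_storage_info() and columnar_test_helpers.columnar_relation_storageid() helpers defined in this patch are installed, the new metapage counters and the per-stripe first_row_number metadata exercised by these tests could be inspected for any columnar table along the following lines. The table name demo_columnar is only illustrative.

-- hypothetical demo table; any columnar table works here
CREATE TABLE demo_columnar (a int) USING columnar;
INSERT INTO demo_columnar SELECT i FROM generate_series(1, 10) i;

-- metapage state: storage version plus the stripe id / row number / offset reserved so far
SELECT version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
FROM columnar_test_helpers.columnar_storage_info('demo_columnar');

-- per-stripe metadata, including the first_row_number column verified by these tests
SELECT stripe_num, first_row_number, row_count
FROM columnar.stripe
WHERE storage_id = columnar_test_helpers.columnar_relation_storageid('demo_columnar'::regclass)
ORDER BY stripe_num;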