From 94f30a04284d7170c21071809a3ad717dc910d89 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Tue, 1 Jun 2021 02:19:55 +0300 Subject: [PATCH 1/7] Refactor index check in ColumnarProcessUtility --- src/backend/columnar/columnar_tableam.c | 24 ++++++++++++------------ src/backend/distributed/commands/index.c | 3 +-- src/include/distributed/commands.h | 1 + 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/backend/columnar/columnar_tableam.c b/src/backend/columnar/columnar_tableam.c index 9977a66a8..e6cf2c0e3 100644 --- a/src/backend/columnar/columnar_tableam.c +++ b/src/backend/columnar/columnar_tableam.c @@ -1348,24 +1348,24 @@ ColumnarProcessUtility(PlannedStmt *pstmt, { IndexStmt *indexStmt = (IndexStmt *) parsetree; - /* - * We should reject CREATE INDEX CONCURRENTLY before DefineIndex() is - * called. Erroring in callbacks called from DefineIndex() will create - * the index and mark it as INVALID, which will cause segfault during - * inserts. - */ - if (indexStmt->concurrent) + Relation rel = relation_openrv(indexStmt->relation, + GetCreateIndexRelationLockMode(indexStmt)); + if (rel->rd_tableam == GetColumnarTableAmRoutine()) { - Relation rel = relation_openrv(indexStmt->relation, - ShareUpdateExclusiveLock); - if (rel->rd_tableam == GetColumnarTableAmRoutine()) + /* + * We should reject CREATE INDEX CONCURRENTLY before DefineIndex() is + * called. Erroring in callbacks called from DefineIndex() will create + * the index and mark it as INVALID, which will cause segfault during + * inserts. + */ + if (indexStmt->concurrent) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("indexes not supported for columnar tables"))); } - - RelationClose(rel); } + + RelationClose(rel); } PrevProcessUtilityHook(pstmt, queryString, context, diff --git a/src/backend/distributed/commands/index.c b/src/backend/distributed/commands/index.c index 1f66e512b..ee553de1e 100644 --- a/src/backend/distributed/commands/index.c +++ b/src/backend/distributed/commands/index.c @@ -62,7 +62,6 @@ static List * GenerateIndexParameters(IndexStmt *createIndexStatement); static DDLJob * GenerateCreateIndexDDLJob(IndexStmt *createIndexStatement, const char *createIndexCommand); static Oid CreateIndexStmtGetRelationId(IndexStmt *createIndexStatement); -static LOCKMODE GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement); static List * CreateIndexTaskList(IndexStmt *indexStmt); static List * CreateReindexTaskList(Oid relationId, ReindexStmt *reindexStmt); static void RangeVarCallbackForDropIndex(const RangeVar *rel, Oid relOid, Oid oldRelOid, @@ -502,7 +501,7 @@ CreateIndexStmtGetRelationId(IndexStmt *createIndexStatement) * GetCreateIndexRelationLockMode returns required lock mode to open the * relation that given CREATE INDEX command operates on. 
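+ * (A sketch of the expected mapping, inferred from the lock modes used by
+ * the columnar utility hook above rather than restated from this function's
+ * body: CREATE INDEX CONCURRENTLY should map to ShareUpdateExclusiveLock --
+ * the lock the old columnar check acquired explicitly -- while a plain
+ * CREATE INDEX is expected to map to ShareLock.)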
*/ -static LOCKMODE +LOCKMODE GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement) { if (createIndexStatement->concurrent) diff --git a/src/include/distributed/commands.h b/src/include/distributed/commands.h index 4810e5915..a5070f2ad 100644 --- a/src/include/distributed/commands.h +++ b/src/include/distributed/commands.h @@ -266,6 +266,7 @@ extern char * ChooseIndexName(const char *tabname, Oid namespaceId, bool primary, bool isconstraint); extern char * ChooseIndexNameAddition(List *colnames); extern List * ChooseIndexColumnNames(List *indexElems); +extern LOCKMODE GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement); extern List * PreprocessReindexStmt(Node *ReindexStatement, const char *ReindexCommand, ProcessUtilityContext processUtilityContext); From 10a762aa88be2188bc3fde61635af2b5e29a0f02 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Thu, 15 Apr 2021 01:40:05 +0300 Subject: [PATCH 2/7] Implement columnar index support functions --- src/backend/columnar/columnar_metadata.c | 62 +++- src/backend/columnar/columnar_reader.c | 107 ++++++ src/backend/columnar/columnar_tableam.c | 262 +++++++++++++- src/include/columnar/columnar.h | 5 + src/test/regress/expected/columnar_alter.out | 21 +- src/test/regress/expected/columnar_create.out | 2 - .../regress/expected/columnar_indexes.out | 334 +++++++++++++++++- src/test/regress/sql/columnar_alter.sql | 14 +- src/test/regress/sql/columnar_create.sql | 1 - src/test/regress/sql/columnar_indexes.sql | 260 +++++++++++++- 10 files changed, 1029 insertions(+), 39 deletions(-) diff --git a/src/backend/columnar/columnar_metadata.c b/src/backend/columnar/columnar_metadata.c index e44bccc92..af62227a6 100644 --- a/src/backend/columnar/columnar_metadata.c +++ b/src/backend/columnar/columnar_metadata.c @@ -12,7 +12,7 @@ * min/max values (used for Chunk Group Filtering) * * useful for fast VACUUM operations (e.g. reporting with VACUUM VERBOSE) * * useful for stats/costing - * * TODO: maps logical row numbers to stripe IDs + * * maps logical row numbers to stripe IDs * * TODO: visibility information * *------------------------------------------------------------------------- @@ -77,6 +77,7 @@ static uint32 * ReadChunkGroupRowCounts(uint64 storageId, uint64 stripe, uint32 static Oid ColumnarStorageIdSequenceRelationId(void); static Oid ColumnarStripeRelationId(void); static Oid ColumnarStripePKeyIndexRelationId(void); +static Oid ColumnarStripeFirstRowNumberIndexRelationId(void); static Oid ColumnarOptionsRelationId(void); static Oid ColumnarOptionsIndexRegclass(void); static Oid ColumnarChunkRelationId(void); @@ -620,6 +621,53 @@ ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, TupleDesc tupleDescri } +/* + * FindStripeByRowNumber returns StripeMetadata for the stripe that has the + * row with rowNumber by doing backward index scan on + * stripe_first_row_number_idx. If no such row exists, then returns NULL. 
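+ *
+ * An illustration with a hypothetical stripe layout (not taken from the
+ * tests): given stripes whose (firstRowNumber, rowCount) pairs are
+ * (1, 150000) and (150001, 150000), FindStripeByRowNumber(rel, 150005,
+ * snapshot) scans backwards to the greatest firstRowNumber <= 150005,
+ * i.e. 150001, and returns that stripe only after checking that
+ * 150005 < firstRowNumber + rowCount holds.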
+ */ +StripeMetadata * +FindStripeByRowNumber(Relation relation, uint64 rowNumber, Snapshot snapshot) +{ + StripeMetadata *foundStripeMetadata = NULL; + + uint64 storageId = ColumnarStorageGetStorageId(relation, false); + ScanKeyData scanKey[2]; + ScanKeyInit(&scanKey[0], Anum_columnar_stripe_storageid, + BTEqualStrategyNumber, F_OIDEQ, Int32GetDatum(storageId)); + ScanKeyInit(&scanKey[1], Anum_columnar_stripe_first_row_number, + BTLessEqualStrategyNumber, F_INT8LE, UInt64GetDatum(rowNumber)); + + Relation columnarStripes = table_open(ColumnarStripeRelationId(), AccessShareLock); + Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(), + AccessShareLock); + SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index, + snapshot, 2, + scanKey); + + HeapTuple heapTuple = systable_getnext_ordered(scanDescriptor, BackwardScanDirection); + if (HeapTupleIsValid(heapTuple)) + { + TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes); + Datum datumArray[Natts_columnar_stripe]; + bool isNullArray[Natts_columnar_stripe]; + heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray); + + StripeMetadata *stripeMetadata = BuildStripeMetadata(datumArray); + if (rowNumber < stripeMetadata->firstRowNumber + stripeMetadata->rowCount) + { + foundStripeMetadata = stripeMetadata; + } + } + + systable_endscan_ordered(scanDescriptor); + index_close(index, AccessShareLock); + table_close(columnarStripes, AccessShareLock); + + return foundStripeMetadata; +} + + /* * ReadChunkGroupRowCounts returns an array of row counts of chunk groups for the * given stripe. @@ -1153,6 +1201,18 @@ ColumnarStripePKeyIndexRelationId(void) } +/* + * ColumnarStripeFirstRowNumberIndexRelationId returns relation id of + * columnar.stripe_first_row_number_idx. + * TODO: should we cache this similar to citus? + */ +static Oid +ColumnarStripeFirstRowNumberIndexRelationId(void) +{ + return get_relname_relid("stripe_first_row_number_idx", ColumnarNamespaceId()); +} + + /* * ColumnarOptionsRelationId returns relation id of columnar.options. */ diff --git a/src/backend/columnar/columnar_reader.c b/src/backend/columnar/columnar_reader.c index 64029a5c6..0f1a1767f 100644 --- a/src/backend/columnar/columnar_reader.c +++ b/src/backend/columnar/columnar_reader.c @@ -35,6 +35,7 @@ #include "columnar/columnar.h" #include "columnar/columnar_storage.h" +#include "columnar/columnar_tableam.h" #include "columnar/columnar_version_compat.h" typedef struct ChunkGroupReadState @@ -85,6 +86,14 @@ struct ColumnarReadState /* static function declarations */ static MemoryContext CreateStripeReadMemoryContext(void); +static void ReadStripeRowByRowNumber(StripeReadState *stripeReadState, + StripeMetadata *stripeMetadata, + uint64 rowNumber, Datum *columnValues, + bool *columnNulls); +static void ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState, + StripeMetadata *stripeMetadata, + uint64 stripeRowOffset, Datum *columnValues, + bool *columnNulls); static bool StripeReadInProgress(ColumnarReadState *readState); static bool HasUnreadStripe(ColumnarReadState *readState); static StripeReadState * BeginStripeRead(StripeMetadata *stripeMetadata, Relation rel, @@ -243,6 +252,104 @@ ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *col } +/* + * ColumnarReadRowByRowNumber reads row with rowNumber from given relation + * into columnValues and columnNulls, and returns true. If no such row + * exists, then returns false. 
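+ *
+ * A minimal sketch of a call site (variable names here are hypothetical;
+ * the caller sizes the arrays to the relation's tuple descriptor):
+ *
+ *   Datum columnValues[natts];
+ *   bool columnNulls[natts];
+ *   bool found = ColumnarReadRowByRowNumber(rel, rowNumber,
+ *                                           neededColumnList, columnValues,
+ *                                           columnNulls, snapshot);
+ *
+ * Internally, the stripe is located with FindStripeByRowNumber and the row
+ * is then picked out of the right chunk group by its offset within the
+ * stripe (stripeRowOffset / chunkGroupRowCount selects the chunk group).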
+ */
+bool
+ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
+						   List *neededColumnList, Datum *columnValues,
+						   bool *columnNulls, Snapshot snapshot)
+{
+	StripeMetadata *stripeMetadata = FindStripeByRowNumber(relation, rowNumber, snapshot);
+	if (stripeMetadata == NULL)
+	{
+		/* no such row exists */
+		return false;
+	}
+
+	TupleDesc relationTupleDesc = RelationGetDescr(relation);
+	List *whereClauseList = NIL;
+	List *whereClauseVars = NIL;
+	MemoryContext stripeReadContext = CreateStripeReadMemoryContext();
+	StripeReadState *stripeReadState = BeginStripeRead(stripeMetadata,
+													   relation,
+													   relationTupleDesc,
+													   neededColumnList,
+													   whereClauseList,
+													   whereClauseVars,
+													   stripeReadContext);
+
+	ReadStripeRowByRowNumber(stripeReadState, stripeMetadata, rowNumber,
+							 columnValues, columnNulls);
+
+	EndStripeRead(stripeReadState);
+	MemoryContextReset(stripeReadContext);
+
+	return true;
+}
+
+
+/*
+ * ReadStripeRowByRowNumber reads row with rowNumber from given
+ * stripeReadState into columnValues and columnNulls.
+ * Errors out if no such row exists in the stripe being read.
+ */
+static void
+ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
+						 StripeMetadata *stripeMetadata,
+						 uint64 rowNumber, Datum *columnValues,
+						 bool *columnNulls)
+{
+	if (rowNumber < stripeMetadata->firstRowNumber)
+	{
+		/* not expected but be on the safe side */
+		ereport(ERROR, (errmsg("row offset cannot be negative")));
+	}
+
+	/* find the exact chunk group to be read */
+	uint64 stripeRowOffset = rowNumber - stripeMetadata->firstRowNumber;
+	stripeReadState->chunkGroupIndex = stripeRowOffset /
+									   stripeMetadata->chunkGroupRowCount;
+	stripeReadState->chunkGroupReadState = BeginChunkGroupRead(
+		stripeReadState->stripeBuffers,
+		stripeReadState->chunkGroupIndex,
+		stripeReadState->tupleDescriptor,
+		stripeReadState->projectedColumnList,
+		stripeReadState->stripeReadContext);
+
+	ReadChunkGroupRowByRowOffset(stripeReadState->chunkGroupReadState,
+								 stripeMetadata, stripeRowOffset,
+								 columnValues, columnNulls);
+
+	EndChunkGroupRead(stripeReadState->chunkGroupReadState);
+	stripeReadState->chunkGroupReadState = NULL;
+}
+
+
+/*
+ * ReadChunkGroupRowByRowOffset reads row with stripeRowOffset from given
+ * chunkGroupReadState into columnValues and columnNulls.
+ * Errors out if no such row exists in the chunk group being read.
+ */
+static void
+ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState,
+							 StripeMetadata *stripeMetadata,
+							 uint64 stripeRowOffset, Datum *columnValues,
+							 bool *columnNulls)
+{
+	/* set the exact row number to be read from given chunk group */
+	chunkGroupReadState->currentRow = stripeRowOffset %
+									  stripeMetadata->chunkGroupRowCount;
+	if (!ReadChunkGroupNextRow(chunkGroupReadState, columnValues, columnNulls))
+	{
+		/* not expected but be on the safe side */
+		ereport(ERROR, (errmsg("could not find the row in stripe")));
+	}
+}
+
+
 /*
  * StripeReadInProgress returns true if we already started reading a stripe.
*/ diff --git a/src/backend/columnar/columnar_tableam.c b/src/backend/columnar/columnar_tableam.c index e6cf2c0e3..944032b84 100644 --- a/src/backend/columnar/columnar_tableam.c +++ b/src/backend/columnar/columnar_tableam.c @@ -56,6 +56,7 @@ #include "columnar/columnar_version_compat.h" #include "distributed/commands.h" #include "distributed/commands/utility_hook.h" +#include "distributed/listutils.h" #include "distributed/metadata_cache.h" /* @@ -111,7 +112,14 @@ static HeapTuple ColumnarSlotCopyHeapTuple(TupleTableSlot *slot); static void ColumnarCheckLogicalReplication(Relation rel); static Datum * detoast_values(TupleDesc tupleDesc, Datum *orig_values, bool *isnull); static ItemPointerData row_number_to_tid(uint64 rowNumber); +static uint64 tid_to_row_number(ItemPointerData tid); static void ErrorIfInvalidRowNumber(uint64 rowNumber); +static double ColumnarReadRowsIntoIndex(TableScanDesc scan, + Relation indexRelation, + IndexInfo *indexInfo, + IndexBuildCallback indexCallback, + void *indexCallbackState, + EState *estate, ExprState *predicate); /* Custom tuple slot ops used for columnar. Initialized in columnar_tableam_init(). */ static TupleTableSlotOps TTSOpsColumnar; @@ -294,6 +302,21 @@ row_number_to_tid(uint64 rowNumber) } +/* + * tid_to_row_number maps given ItemPointerData to rowNumber. + */ +static uint64 +tid_to_row_number(ItemPointerData tid) +{ + uint64 rowNumber = ItemPointerGetBlockNumber(&tid) * VALID_ITEMPOINTER_OFFSETS + + ItemPointerGetOffsetNumber(&tid) - FirstOffsetNumber; + + ErrorIfInvalidRowNumber(rowNumber); + + return rowNumber; +} + + /* * ErrorIfInvalidRowNumber errors out if given rowNumber is invalid. */ @@ -341,24 +364,34 @@ columnar_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) static IndexFetchTableData * columnar_index_fetch_begin(Relation rel) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + Oid relfilenode = rel->rd_node.relNode; + if (PendingWritesInUpperTransactions(relfilenode, GetCurrentSubTransactionId())) + { + /* XXX: maybe we can just flush the data and continue */ + elog(ERROR, "cannot read from index when there is unflushed data in " + "upper transactions"); + } + + FlushWriteStateForRelfilenode(relfilenode, GetCurrentSubTransactionId()); + + IndexFetchTableData *scan = palloc0(sizeof(IndexFetchTableData)); + scan->rel = rel; + return scan; } static void columnar_index_fetch_reset(IndexFetchTableData *scan) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + /* no-op */ } static void columnar_index_fetch_end(IndexFetchTableData *scan) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + columnar_index_fetch_reset(scan); + pfree(scan); } @@ -369,8 +402,37 @@ columnar_index_fetch_tuple(struct IndexFetchTableData *scan, TupleTableSlot *slot, bool *call_again, bool *all_dead) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + /* no HOT chains are possible in columnar, directly set it to false */ + *call_again = false; + + /* + * No dead tuples are possible in columnar, set it to false if it's + * passed to be non-NULL. 
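+	 *
+	 * (The TID we receive from the index encodes a columnar row number;
+	 * see tid_to_row_number above. As a worked example of that mapping,
+	 * a TID with block number B and offset O decodes to row number
+	 * B * VALID_ITEMPOINTER_OFFSETS + O - FirstOffsetNumber, so row
+	 * numbers wrap into the next block every VALID_ITEMPOINTER_OFFSETS
+	 * rows.)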
+ */ + if (all_dead) + { + *all_dead = false; + } + + ExecClearTuple(slot); + + /* we need all columns */ + int natts = scan->rel->rd_att->natts; + Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1); + TupleDesc relationTupleDesc = RelationGetDescr(scan->rel); + List *relationColumnList = NeededColumnsList(relationTupleDesc, attr_needed); + uint64 rowNumber = tid_to_row_number(*tid); + if (!ColumnarReadRowByRowNumber(scan->rel, rowNumber, relationColumnList, + slot->tts_values, slot->tts_isnull, snapshot)) + { + return false; + } + + slot->tts_tableOid = RelationGetRelid(scan->rel); + slot->tts_tid = *tid; + ExecStoreVirtualTuple(slot); + + return true; } @@ -1003,7 +1065,7 @@ columnar_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, static double -columnar_index_build_range_scan(Relation heapRelation, +columnar_index_build_range_scan(Relation columnarRelation, Relation indexRelation, IndexInfo *indexInfo, bool allow_sync, @@ -1015,8 +1077,165 @@ columnar_index_build_range_scan(Relation heapRelation, void *callback_state, TableScanDesc scan) { - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("indexes not supported for columnar tables"))); + /* + * TODO: Should this function call pgstat_progress_update_param in + * somewhere as heapam_index_build_range_scan ? + */ + + if (start_blockno != 0 || numblocks != InvalidBlockNumber) + { + /* + * Columnar utility hook already errors out for BRIN indexes on columnar + * tables, but be on the safe side. + */ + ereport(ERROR, (errmsg("BRIN indexes on columnar tables are not supported"))); + } + + if (indexInfo->ii_Concurrent) + { + /* we already don't allow CONCURRENTLY syntax but be on the safe side */ + ereport(ERROR, (errmsg("concurrent index builds are not supported " + "for columnar tables"))); + } + + if (scan) + { + /* + * Since we don't support parallel reads on columnar tables, we + * should have already errored out for that, but be on the safe side. + */ + ereport(ERROR, (errmsg("parallel reads on columnar are not supported"))); + } + + /* + * In a normal index build, we use SnapshotAny to retrieve all tuples. In + * a concurrent build or during bootstrap, we take a regular MVCC snapshot + * and index whatever's live according to that. + */ + TransactionId OldestXmin = InvalidTransactionId; + + /* + * We already don't allow concurrent index builds so ii_Concurrent + * will always be false, but let's keep the code close to heapAM. + */ + if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) + { + /* ignore lazy VACUUM's */ + OldestXmin = GetOldestXmin(columnarRelation, PROCARRAY_FLAGS_VACUUM); + } + + Snapshot snapshot = { 0 }; + bool snapshotRegisteredByUs = false; + if (!scan) + { + /* + * For serial index build, we begin our own scan. We may also need to + * register a snapshot whose lifetime is under our direct control. + */ + if (!TransactionIdIsValid(OldestXmin)) + { + snapshot = RegisterSnapshot(GetTransactionSnapshot()); + snapshotRegisteredByUs = true; + } + else + { + snapshot = SnapshotAny; + } + + int nkeys = 0; + ScanKeyData *scanKey = NULL; + bool allowAccessStrategy = true; + scan = table_beginscan_strat(columnarRelation, snapshot, nkeys, scanKey, + allowAccessStrategy, allow_sync); + } + else + { + /* + * For parallel index build, we don't register/unregister own snapshot + * since snapshot is taken from parallel scan. 
Note that even if we + * don't support parallel index builds, we still continue building the + * index via the main backend and we should still rely on the snapshot + * provided by parallel scan. + */ + snapshot = scan->rs_snapshot; + } + + /* + * Set up execution state for predicate, if any. + * Note that this is only useful for partial indexes. + */ + EState *estate = CreateExecutorState(); + ExprContext *econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = table_slot_create(columnarRelation, NULL); + ExprState *predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + + double reltuples = ColumnarReadRowsIntoIndex(scan, indexRelation, indexInfo, + callback, callback_state, estate, + predicate); + table_endscan(scan); + + if (snapshotRegisteredByUs) + { + UnregisterSnapshot(snapshot); + } + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(estate); + indexInfo->ii_ExpressionsState = NIL; + indexInfo->ii_PredicateState = NULL; + + return reltuples; +} + + +/* + * ColumnarReadRowsIntoIndex builds indexRelation tuples by reading the + * actual relation based on given "scan" and returns number of tuples + * scanned to build the indexRelation. + */ +static double +ColumnarReadRowsIntoIndex(TableScanDesc scan, Relation indexRelation, + IndexInfo *indexInfo, IndexBuildCallback indexCallback, + void *indexCallbackState, EState *estate, ExprState *predicate) +{ + double reltuples = 0; + + ExprContext *econtext = GetPerTupleExprContext(estate); + TupleTableSlot *slot = econtext->ecxt_scantuple; + while (columnar_getnextslot(scan, ForwardScanDirection, slot)) + { + CHECK_FOR_INTERRUPTS(); + + MemoryContextReset(econtext->ecxt_per_tuple_memory); + + if (predicate != NULL && !ExecQual(predicate, econtext)) + { + /* for partial indexes, discard tuples that don't satisfy the predicate */ + continue; + } + + Datum indexValues[INDEX_MAX_KEYS]; + bool indexNulls[INDEX_MAX_KEYS]; + FormIndexDatum(indexInfo, slot, estate, indexValues, indexNulls); + + ItemPointerData itemPointerData = slot->tts_tid; + + /* currently, columnar tables can't have dead tuples */ + bool tupleIsAlive = true; +#if PG_VERSION_NUM >= PG_VERSION_13 + indexCallback(indexRelation, &itemPointerData, indexValues, indexNulls, + tupleIsAlive, indexCallbackState); +#else + HeapTuple scanTuple = ExecCopySlotHeapTuple(slot); + scanTuple->t_self = itemPointerData; + indexCallback(indexRelation, scanTuple, indexValues, indexNulls, + tupleIsAlive, indexCallbackState); +#endif + + reltuples++; + } + + return reltuples; } @@ -1027,8 +1246,15 @@ columnar_index_validate_scan(Relation heapRelation, Snapshot snapshot, ValidateIndexState *state) { + /* + * This is only called for concurrent index builds, + * see table_index_validate_scan. + * Note that we already error out for concurrent index + * builds in utility hook but be on the safe side. 
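+	 * (Illustration of the guarded path, not of supported usage: a command
+	 * such as "CREATE INDEX CONCURRENTLY hypothetical_idx ON
+	 * some_columnar_table (a);" could only reach this callback if the
+	 * utility-hook check were bypassed; both names are hypothetical.)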
+	 */
 	ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					errmsg("indexes not supported for columnar tables")));
+					errmsg("concurrent index builds are not supported for "
+						   "columnar tables")));
 }
 
 
@@ -1361,7 +1587,17 @@ ColumnarProcessUtility(PlannedStmt *pstmt,
 		if (indexStmt->concurrent)
 		{
 			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-							errmsg("indexes not supported for columnar tables")));
+							errmsg("concurrent index commands are not "
+								   "supported for columnar tables")));
+		}
+
+		/* for now, we don't support index access methods other than btree & hash */
+		if (strncmp(indexStmt->accessMethod, "btree", NAMEDATALEN) != 0 &&
+			strncmp(indexStmt->accessMethod, "hash", NAMEDATALEN) != 0)
+		{
+			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+							errmsg("only btree and hash indexes are supported on "
+								   "columnar tables")));
 		}
 	}
 
diff --git a/src/include/columnar/columnar.h b/src/include/columnar/columnar.h
index 2689d5947..8288a20aa 100644
--- a/src/include/columnar/columnar.h
+++ b/src/include/columnar/columnar.h
@@ -216,6 +216,9 @@ extern ColumnarReadState * ColumnarBeginRead(Relation relation,
 extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues,
 								bool *columnNulls, uint64 *rowNumber);
 extern void ColumnarRescan(ColumnarReadState *readState);
+extern bool ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
+									   List *neededColumnList, Datum *columnValues,
+									   bool *columnNulls, Snapshot snapshot);
 extern void ColumnarEndRead(ColumnarReadState *state);
 extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state);
 
@@ -251,6 +254,8 @@ extern void SaveChunkGroups(RelFileNode relfilenode, uint64 stripe,
 extern StripeSkipList * ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe,
 										   TupleDesc tupleDescriptor,
 										   uint32 chunkCount);
+extern StripeMetadata * FindStripeByRowNumber(Relation relation, uint64 rowNumber,
+											  Snapshot snapshot);
 extern Datum columnar_relation_storageid(PG_FUNCTION_ARGS);
 
 
diff --git a/src/test/regress/expected/columnar_alter.out b/src/test/regress/expected/columnar_alter.out
index 2bb229786..4706fa386 100644
--- a/src/test/regress/expected/columnar_alter.out
+++ b/src/test/regress/expected/columnar_alter.out
@@ -255,7 +255,6 @@ insert into atacc1 values(1);
 alter table atacc1
 	add column b float8 not null default random(),
 	add primary key(a);
-ERROR: indexes not supported for columnar tables
 -- Add a generate column with an expression value
 create table test_gen_ex (x int) using columnar;
 INSERT INTO test_gen_ex VALUES (1), (2), (3);
@@ -390,30 +389,30 @@ SELECT * FROM products ORDER BY 1;
  3 | pen | 2
 (3 rows)
 
--- Add a UNIQUE constraint (should fail)
-CREATE TABLE products_fail (
+-- Add a UNIQUE constraint
+CREATE TABLE products_unique (
     product_no integer UNIQUE,
     name text,
     price numeric
 ) USING columnar;
-ERROR: indexes not supported for columnar tables
 ALTER TABLE products ADD COLUMN store_id text UNIQUE;
-ERROR: indexes not supported for columnar tables
--- Add a PRIMARY KEY constraint (should fail)
-CREATE TABLE products_fail (
+-- Add a PRIMARY KEY constraint
+CREATE TABLE products_primary (
     product_no integer PRIMARY KEY,
     name text,
     price numeric
 ) USING columnar;
-ERROR: indexes not supported for columnar tables
-ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY;
-ERROR: indexes not supported for columnar tables
+BEGIN;
+  ALTER TABLE products DROP COLUMN store_id;
+  ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY;
+ERROR: column "store_id" contains null values
+ROLLBACK;
 -- Add an
EXCLUSION constraint (should fail) CREATE TABLE circles ( c circle, EXCLUDE USING gist (c WITH &&) ) USING columnar; -ERROR: indexes not supported for columnar tables +ERROR: only btree and hash indexes are supported on columnar tables -- Row level security CREATE TABLE public.row_level_security_col (id int, pgUser CHARACTER VARYING) USING columnar; CREATE USER user1; diff --git a/src/test/regress/expected/columnar_create.out b/src/test/regress/expected/columnar_create.out index 772d6ac64..704c0d932 100644 --- a/src/test/regress/expected/columnar_create.out +++ b/src/test/regress/expected/columnar_create.out @@ -11,9 +11,7 @@ SELECT alter_columnar_table_set('contestant', compression => 'none'); (1 row) --- should fail CREATE INDEX contestant_idx on contestant(handle); -ERROR: indexes not supported for columnar tables -- Create zstd compressed table CREATE TABLE contestant_compressed (handle TEXT, birthdate DATE, rating INT, percentile FLOAT, country CHAR(3), achievements TEXT[]) diff --git a/src/test/regress/expected/columnar_indexes.out b/src/test/regress/expected/columnar_indexes.out index bd1c41802..6062cae34 100644 --- a/src/test/regress/expected/columnar_indexes.out +++ b/src/test/regress/expected/columnar_indexes.out @@ -10,7 +10,7 @@ SET search_path tO columnar_indexes, public; -- create table t(a int, b int) using columnar; create index CONCURRENTLY t_idx on t(a, b); -ERROR: indexes not supported for columnar tables +ERROR: concurrent index commands are not supported for columnar tables \d t Table "columnar_indexes.t" Column | Type | Collation | Nullable | Default @@ -32,16 +32,15 @@ SELECT * FROM t; 1 | 2 (1 row) --- create index without the concurrent option. We should --- error out during index creation. create index t_idx on t(a, b); -ERROR: indexes not supported for columnar tables \d t Table "columnar_indexes.t" Column | Type | Collation | Nullable | Default --------------------------------------------------------------------- a | integer | | | b | integer | | | +Indexes: + "t_idx" btree (a, b) explain insert into t values (1, 2); QUERY PLAN @@ -58,5 +57,332 @@ SELECT * FROM t; 3 | 4 (2 rows) +-- make sure that we test index scan +set columnar.enable_custom_scan to 'off'; +set enable_seqscan to off; +CREATE table columnar_table (a INT, b int) USING columnar; +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(0, 16000) i; +-- unique -- +BEGIN; + INSERT INTO columnar_table VALUES (100000000); + SAVEPOINT s1; + -- errors out due to unflushed data in upper transaction + CREATE UNIQUE INDEX ON columnar_table (a); +ERROR: cannot read from table when there is unflushed data in upper transactions +ROLLBACK; +CREATE UNIQUE INDEX ON columnar_table (a); +BEGIN; + INSERT INTO columnar_table VALUES (16050); + SAVEPOINT s1; + -- index scan errors out due to unflushed data in upper transaction + SELECT a FROM columnar_table WHERE a = 16050; +ERROR: cannot read from index when there is unflushed data in upper transactions +ROLLBACK; +EXPLAIN (COSTS OFF) SELECT * FROM columnar_table WHERE a=6456; + QUERY PLAN +--------------------------------------------------------------------- + Index Scan using columnar_table_a_idx on columnar_table + Index Cond: (a = 6456) +(2 rows) + +EXPLAIN (COSTS OFF) SELECT a FROM columnar_table WHERE a=6456; + QUERY PLAN +--------------------------------------------------------------------- + Index Only Scan using columnar_table_a_idx on columnar_table + Index Cond: (a = 6456) +(2 rows) + +SELECT (SELECT a FROM columnar_table WHERE a=6456 limit 
1)=6456; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT (SELECT b FROM columnar_table WHERE a=6456 limit 1)=6456*2; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +-- even if a=16050 doesn't exist, we try to insert it twice so this should error out +INSERT INTO columnar_table VALUES (16050), (16050); +ERROR: duplicate key value violates unique constraint "columnar_table_a_idx" +DETAIL: Key (a)=(16050) already exists. +-- should work +INSERT INTO columnar_table VALUES (16050); +-- check edge cases around stripe boundaries, error out +INSERT INTO columnar_table VALUES (16050); +ERROR: duplicate key value violates unique constraint "columnar_table_a_idx" +DETAIL: Key (a)=(16050) already exists. +INSERT INTO columnar_table VALUES (15999); +ERROR: duplicate key value violates unique constraint "columnar_table_a_idx" +DETAIL: Key (a)=(15999) already exists. +DROP INDEX columnar_table_a_idx; +CREATE TABLE partial_unique_idx_test (a INT, b INT) USING columnar; +CREATE UNIQUE INDEX ON partial_unique_idx_test (a) +WHERE b > 500; +-- should work since b =< 500 and our partial index doesn't check this interval +INSERT INTO partial_unique_idx_test VALUES (1, 2), (1, 2); +-- should work since our partial index wouldn't cover the tuples that we inserted above +INSERT INTO partial_unique_idx_test VALUES (1, 800); +INSERT INTO partial_unique_idx_test VALUES (4, 600); +-- should error out due to (4, 600) +INSERT INTO partial_unique_idx_test VALUES (4, 700); +ERROR: duplicate key value violates unique constraint "partial_unique_idx_test_a_idx" +DETAIL: Key (a)=(4) already exists. +-- btree -- +CREATE INDEX ON columnar_table (a); +SELECT (SELECT SUM(b) FROM columnar_table WHERE a>700 and a<965)=439560; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +CREATE INDEX ON columnar_table (b) +WHERE (b > 30000 AND b < 33000); +-- partial index should be way smaller than the non-partial index +SELECT pg_total_relation_size('columnar_table_b_idx') * 5 < + pg_total_relation_size('columnar_table_a_idx'); + ?column? +--------------------------------------------------------------------- + t +(1 row) + +-- can't use index scan due to partial index boundaries +EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30000; + QUERY PLAN +--------------------------------------------------------------------- + Seq Scan on columnar_table + Filter: (b = 30000) +(2 rows) + +-- can use index scan +EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30001; + QUERY PLAN +--------------------------------------------------------------------- + Index Only Scan using columnar_table_b_idx on columnar_table + Index Cond: (b = 30001) +(2 rows) + +-- some more rows +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 17000) i; +DROP INDEX columnar_table_a_idx; +TRUNCATE columnar_table; +-- pkey -- +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 16499) i; +ALTER TABLE columnar_table ADD PRIMARY KEY (a); +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16500, 17000) i; +BEGIN; + INSERT INTO columnar_table (a) SELECT 1; +ROLLBACK; +-- should work +INSERT INTO columnar_table (a) SELECT 1; +-- error out +INSERT INTO columnar_table VALUES (16100), (16101); +ERROR: duplicate key value violates unique constraint "columnar_table_pkey" +DETAIL: Key (a)=(16100) already exists. 
+INSERT INTO columnar_table VALUES (16999);
+ERROR: duplicate key value violates unique constraint "columnar_table_pkey"
+DETAIL: Key (a)=(16999) already exists.
+BEGIN;
+  REINDEX INDEX columnar_table_pkey;
+  -- should error even after reindex
+  INSERT INTO columnar_table VALUES (16999);
+ERROR: duplicate key value violates unique constraint "columnar_table_pkey"
+DETAIL: Key (a)=(16999) already exists.
+ROLLBACK;
+VACUUM FULL columnar_table;
+-- should error even after vacuum
+INSERT INTO columnar_table VALUES (16999);
+ERROR: duplicate key value violates unique constraint "columnar_table_pkey"
+DETAIL: Key (a)=(16999) already exists.
+TRUNCATE columnar_table;
+INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i;
+SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+TRUNCATE columnar_table;
+ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
+-- hash --
+INSERT INTO columnar_table (a, b) SELECT i*2,i FROM generate_series(1, 8000) i;
+CREATE INDEX hash_idx ON columnar_table USING HASH (b);
+BEGIN;
+  CREATE INDEX hash_idx_fill_factor ON columnar_table USING HASH (b) WITH (fillfactor=10);
+  -- same hash index with lower fillfactor should be way bigger
+  SELECT pg_total_relation_size ('hash_idx_fill_factor') >
+         pg_total_relation_size ('hash_idx') * 5;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+ROLLBACK;
+BEGIN;
+  INSERT INTO columnar_table (a, b) SELECT i*3,i FROM generate_series(1, 8000) i;
+ROLLBACK;
+INSERT INTO columnar_table (a, b) SELECT i*4,i FROM generate_series(1, 8000) i;
+SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+BEGIN;
+  REINDEX TABLE columnar_table;
+  SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+ROLLBACK;
+VACUUM FULL columnar_table;
+SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+-- exclusion constraints --
+CREATE TABLE exclusion_test (c1 INT,c2 INT, c3 INT, c4 BOX,
+EXCLUDE USING btree (c1 WITH =) INCLUDE(c3,c4) WHERE (c1 < 10)) USING columnar;
+-- error out since "c1" is "1" for all rows to be inserted
+INSERT INTO exclusion_test SELECT 1, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
+ERROR: conflicting key value violates exclusion constraint "exclusion_test_c1_c3_c4_excl"
+DETAIL: Key (c1)=(1) conflicts with existing key (c1)=(1).
+BEGIN; + INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x; +ROLLBACK; +-- should work +INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x; +INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x; +BEGIN; + -- should work thanks to "where" clause in exclusion constraint + INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x; +ROLLBACK; +REINDEX TABLE exclusion_test; +-- should still work after reindex +INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x; +-- make sure that we respect INCLUDE syntax -- +CREATE TABLE include_test (a INT, b BIGINT, c BIGINT, d BIGINT) USING columnar; +INSERT INTO include_test SELECT i, i, i, i FROM generate_series (1, 1000) i; +CREATE UNIQUE INDEX unique_a ON include_test (a); +-- cannot use index only scan +EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500; + QUERY PLAN +--------------------------------------------------------------------- + Index Scan using unique_a on include_test + Index Cond: (a = 500) +(2 rows) + +CREATE UNIQUE INDEX unique_a_include_b_c_d ON include_test (a) INCLUDE(b, c, d); +-- same unique index that includes other columns should be way bigger +SELECT pg_total_relation_size ('unique_a') * 1.5 < + pg_total_relation_size ('unique_a_include_b_c_d'); + ?column? +--------------------------------------------------------------------- + t +(1 row) + +DROP INDEX unique_a; +-- should use index only scan since unique_a_include_b_c_d includes column "b" too +EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500; + QUERY PLAN +--------------------------------------------------------------------- + Index Only Scan using unique_a_include_b_c_d on include_test + Index Cond: (a = 500) +(2 rows) + +BEGIN; + SET enable_indexonlyscan = OFF; + -- show that we respect enable_indexonlyscan GUC + EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500; + QUERY PLAN +--------------------------------------------------------------------- + Index Scan using unique_a_include_b_c_d on include_test + Index Cond: (a = 500) +(2 rows) + +ROLLBACK; +-- make sure that we read the correct value for "b" when doing index only scan +SELECT b=980 FROM include_test WHERE a = 980; + ?column? 
+---------------------------------------------------------------------
+ t
+(1 row)
+
+-- some tests with distributed & partitioned tables --
+CREATE TABLE dist_part_table(
+    dist_col INT,
+    part_col TIMESTAMPTZ,
+    col1 TEXT
+) PARTITION BY RANGE (part_col);
+-- create an index before creating a columnar partition
+CREATE INDEX dist_part_table_btree ON dist_part_table (col1);
+-- columnar partition
+CREATE TABLE p0 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-01-01') TO ('2020-02-01')
+USING columnar;
+SELECT create_distributed_table('dist_part_table', 'dist_col');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+-- columnar partition
+CREATE TABLE p1 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-02-01') TO ('2020-03-01')
+USING columnar;
+-- row partition
+CREATE TABLE p2 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-03-01') TO ('2020-04-01');
+INSERT INTO dist_part_table VALUES (1, '2020-03-15', 'str1');
+-- insert into columnar partitions
+INSERT INTO dist_part_table VALUES (1, '2020-01-15', 'str2');
+INSERT INTO dist_part_table VALUES (1, '2020-02-15', 'str3');
+-- create another index after creating a columnar partition
+CREATE UNIQUE INDEX dist_part_table_unique ON dist_part_table (dist_col, part_col);
+-- verify that indexes are created on columnar partitions
+SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p0';
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p1';
+ ?column?
+--------------------------------------------------------------------- + t +(1 row) + +-- unsupported index types -- +-- gin -- +CREATE TABLE testjsonb (j JSONB) USING columnar; +INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(1,10) i; +CREATE INDEX jidx ON testjsonb USING GIN (j); +ERROR: only btree and hash indexes are supported on columnar tables +INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(15,20) i; +-- gist -- +CREATE TABLE gist_point_tbl(id INT4, p POINT) USING columnar; +INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(1, 10) g; +CREATE INDEX gist_pointidx ON gist_point_tbl USING gist(p); +ERROR: only btree and hash indexes are supported on columnar tables +INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(10, 20) g; +-- sp gist -- +CREATE TABLE box_temp (f1 box) USING columnar; +INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i; +CREATE INDEX box_spgist ON box_temp USING spgist (f1); +ERROR: only btree and hash indexes are supported on columnar tables +INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i; +-- brin -- +CREATE TABLE brin_summarize (value int) USING columnar; +CREATE INDEX brin_summarize_idx ON brin_summarize USING brin (value) WITH (pages_per_range=2); +ERROR: only btree and hash indexes are supported on columnar tables SET client_min_messages TO WARNING; DROP SCHEMA columnar_indexes CASCADE; diff --git a/src/test/regress/sql/columnar_alter.sql b/src/test/regress/sql/columnar_alter.sql index ed916967e..ecd647fe1 100644 --- a/src/test/regress/sql/columnar_alter.sql +++ b/src/test/regress/sql/columnar_alter.sql @@ -215,21 +215,25 @@ ALTER TABLE products DROP CONSTRAINT dummy_constraint; INSERT INTO products VALUES (3, 'pen', 2); SELECT * FROM products ORDER BY 1; --- Add a UNIQUE constraint (should fail) -CREATE TABLE products_fail ( +-- Add a UNIQUE constraint +CREATE TABLE products_unique ( product_no integer UNIQUE, name text, price numeric ) USING columnar; ALTER TABLE products ADD COLUMN store_id text UNIQUE; --- Add a PRIMARY KEY constraint (should fail) -CREATE TABLE products_fail ( +-- Add a PRIMARY KEY constraint +CREATE TABLE products_primary ( product_no integer PRIMARY KEY, name text, price numeric ) USING columnar; -ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY; + +BEGIN; + ALTER TABLE products DROP COLUMN store_id; + ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY; +ROLLBACK; -- Add an EXCLUSION constraint (should fail) CREATE TABLE circles ( diff --git a/src/test/regress/sql/columnar_create.sql b/src/test/regress/sql/columnar_create.sql index 0c18f2212..f83b1c2a8 100644 --- a/src/test/regress/sql/columnar_create.sql +++ b/src/test/regress/sql/columnar_create.sql @@ -9,7 +9,6 @@ CREATE TABLE contestant (handle TEXT, birthdate DATE, rating INT, USING columnar; SELECT alter_columnar_table_set('contestant', compression => 'none'); --- should fail CREATE INDEX contestant_idx on contestant(handle); -- Create zstd compressed table diff --git a/src/test/regress/sql/columnar_indexes.sql b/src/test/regress/sql/columnar_indexes.sql index 831699dc4..60991b56a 100644 --- a/src/test/regress/sql/columnar_indexes.sql +++ b/src/test/regress/sql/columnar_indexes.sql @@ -17,13 +17,269 @@ explain insert into t values (1, 2); insert 
into t values (1, 2); SELECT * FROM t; --- create index without the concurrent option. We should --- error out during index creation. create index t_idx on t(a, b); \d t explain insert into t values (1, 2); insert into t values (3, 4); SELECT * FROM t; +-- make sure that we test index scan +set columnar.enable_custom_scan to 'off'; +set enable_seqscan to off; + +CREATE table columnar_table (a INT, b int) USING columnar; +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(0, 16000) i; + +-- unique -- +BEGIN; + INSERT INTO columnar_table VALUES (100000000); + SAVEPOINT s1; + -- errors out due to unflushed data in upper transaction + CREATE UNIQUE INDEX ON columnar_table (a); +ROLLBACK; + +CREATE UNIQUE INDEX ON columnar_table (a); + +BEGIN; + INSERT INTO columnar_table VALUES (16050); + SAVEPOINT s1; + -- index scan errors out due to unflushed data in upper transaction + SELECT a FROM columnar_table WHERE a = 16050; +ROLLBACK; + +EXPLAIN (COSTS OFF) SELECT * FROM columnar_table WHERE a=6456; +EXPLAIN (COSTS OFF) SELECT a FROM columnar_table WHERE a=6456; +SELECT (SELECT a FROM columnar_table WHERE a=6456 limit 1)=6456; +SELECT (SELECT b FROM columnar_table WHERE a=6456 limit 1)=6456*2; + +-- even if a=16050 doesn't exist, we try to insert it twice so this should error out +INSERT INTO columnar_table VALUES (16050), (16050); + +-- should work +INSERT INTO columnar_table VALUES (16050); + +-- check edge cases around stripe boundaries, error out +INSERT INTO columnar_table VALUES (16050); +INSERT INTO columnar_table VALUES (15999); + +DROP INDEX columnar_table_a_idx; + +CREATE TABLE partial_unique_idx_test (a INT, b INT) USING columnar; +CREATE UNIQUE INDEX ON partial_unique_idx_test (a) +WHERE b > 500; + +-- should work since b =< 500 and our partial index doesn't check this interval +INSERT INTO partial_unique_idx_test VALUES (1, 2), (1, 2); + +-- should work since our partial index wouldn't cover the tuples that we inserted above +INSERT INTO partial_unique_idx_test VALUES (1, 800); + +INSERT INTO partial_unique_idx_test VALUES (4, 600); + +-- should error out due to (4, 600) +INSERT INTO partial_unique_idx_test VALUES (4, 700); + +-- btree -- +CREATE INDEX ON columnar_table (a); +SELECT (SELECT SUM(b) FROM columnar_table WHERE a>700 and a<965)=439560; + +CREATE INDEX ON columnar_table (b) +WHERE (b > 30000 AND b < 33000); + +-- partial index should be way smaller than the non-partial index +SELECT pg_total_relation_size('columnar_table_b_idx') * 5 < + pg_total_relation_size('columnar_table_a_idx'); + +-- can't use index scan due to partial index boundaries +EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30000; +-- can use index scan +EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30001; + +-- some more rows +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 17000) i; + +DROP INDEX columnar_table_a_idx; +TRUNCATE columnar_table; + +-- pkey -- +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 16499) i; +ALTER TABLE columnar_table ADD PRIMARY KEY (a); +INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16500, 17000) i; + +BEGIN; + INSERT INTO columnar_table (a) SELECT 1; +ROLLBACK; + +-- should work +INSERT INTO columnar_table (a) SELECT 1; + +-- error out +INSERT INTO columnar_table VALUES (16100), (16101); +INSERT INTO columnar_table VALUES (16999); + +BEGIN; + REINDEX INDEX columnar_table_pkey; + -- should error even after reindex + INSERT INTO columnar_table VALUES (16999); 
+ROLLBACK;
+
+VACUUM FULL columnar_table;
+-- should error even after vacuum
+INSERT INTO columnar_table VALUES (16999);
+
+TRUNCATE columnar_table;
+INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i;
+SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
+
+TRUNCATE columnar_table;
+ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
+
+-- hash --
+INSERT INTO columnar_table (a, b) SELECT i*2,i FROM generate_series(1, 8000) i;
+CREATE INDEX hash_idx ON columnar_table USING HASH (b);
+
+BEGIN;
+  CREATE INDEX hash_idx_fill_factor ON columnar_table USING HASH (b) WITH (fillfactor=10);
+  -- same hash index with lower fillfactor should be way bigger
+  SELECT pg_total_relation_size ('hash_idx_fill_factor') >
+         pg_total_relation_size ('hash_idx') * 5;
+ROLLBACK;
+
+BEGIN;
+  INSERT INTO columnar_table (a, b) SELECT i*3,i FROM generate_series(1, 8000) i;
+ROLLBACK;
+
+INSERT INTO columnar_table (a, b) SELECT i*4,i FROM generate_series(1, 8000) i;
+
+SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+
+BEGIN;
+  REINDEX TABLE columnar_table;
+  SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+ROLLBACK;
+
+VACUUM FULL columnar_table;
+SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
+
+-- exclusion constraints --
+CREATE TABLE exclusion_test (c1 INT,c2 INT, c3 INT, c4 BOX,
+EXCLUDE USING btree (c1 WITH =) INCLUDE(c3,c4) WHERE (c1 < 10)) USING columnar;
+
+-- error out since "c1" is "1" for all rows to be inserted
+INSERT INTO exclusion_test SELECT 1, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
+
+BEGIN;
+  INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
+ROLLBACK;
+
+-- should work
+INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
+
+INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
+
+BEGIN;
+  -- should work thanks to "where" clause in exclusion constraint
+  INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
+ROLLBACK;
+
+REINDEX TABLE exclusion_test;
+-- should still work after reindex
+INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
+
+-- make sure that we respect INCLUDE syntax --
+
+CREATE TABLE include_test (a INT, b BIGINT, c BIGINT, d BIGINT) USING columnar;
+
+INSERT INTO include_test SELECT i, i, i, i FROM generate_series (1, 1000) i;
+
+CREATE UNIQUE INDEX unique_a ON include_test (a);
+
+-- cannot use index only scan
+EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
+
+CREATE UNIQUE INDEX unique_a_include_b_c_d ON include_test (a) INCLUDE(b, c, d);
+
+-- same unique index that includes other columns should be way bigger
+SELECT pg_total_relation_size ('unique_a') * 1.5 <
+       pg_total_relation_size ('unique_a_include_b_c_d');
+
+DROP INDEX unique_a;
+
+-- should use index only scan since unique_a_include_b_c_d includes column "b" too
+EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
+
+BEGIN;
+  SET enable_indexonlyscan = OFF;
+  -- show that we respect enable_indexonlyscan GUC
+  EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
+ROLLBACK;
+
+-- make sure that we read the correct value for "b" when doing index only scan
+SELECT b=980 FROM include_test WHERE a = 980;
+
+-- some tests with distributed & partitioned tables --
+
+CREATE TABLE dist_part_table(
+    dist_col INT,
+    part_col TIMESTAMPTZ,
+    col1 TEXT
+) PARTITION BY RANGE (part_col);
+
+-- create an index before creating a columnar partition
+CREATE INDEX dist_part_table_btree ON dist_part_table (col1);
+
+-- columnar partition
+CREATE TABLE p0 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-01-01') TO ('2020-02-01')
+USING columnar;
+
+SELECT create_distributed_table('dist_part_table', 'dist_col');
+
+-- columnar partition
+CREATE TABLE p1 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-02-01') TO ('2020-03-01')
+USING columnar;
+
+-- row partition
+CREATE TABLE p2 PARTITION OF dist_part_table
+FOR VALUES FROM ('2020-03-01') TO ('2020-04-01');
+
+INSERT INTO dist_part_table VALUES (1, '2020-03-15', 'str1');
+
+-- insert into columnar partitions
+INSERT INTO dist_part_table VALUES (1, '2020-01-15', 'str2');
+INSERT INTO dist_part_table VALUES (1, '2020-02-15', 'str3');
+
+-- create another index after creating a columnar partition
+CREATE UNIQUE INDEX dist_part_table_unique ON dist_part_table (dist_col, part_col);
+
+-- verify that indexes are created on columnar partitions
+SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p0';
+SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p1';
+
+-- unsupported index types --
+
+-- gin --
+CREATE TABLE testjsonb (j JSONB) USING columnar;
+INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(1,10) i;
+CREATE INDEX jidx ON testjsonb USING GIN (j);
+INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(15,20) i;
+
+-- gist --
+CREATE TABLE gist_point_tbl(id INT4, p POINT) USING columnar;
+INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(1, 10) g;
+CREATE INDEX gist_pointidx ON gist_point_tbl USING gist(p);
+INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(10, 20) g;
+
+-- sp gist --
+CREATE TABLE box_temp (f1 box) USING columnar;
+INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
+CREATE INDEX box_spgist ON box_temp USING spgist (f1);
+INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
+
+-- brin --
+CREATE TABLE brin_summarize (value int) USING columnar;
+CREATE INDEX brin_summarize_idx ON brin_summarize USING brin (value) WITH (pages_per_range=2);
+
 SET client_min_messages TO WARNING;
 DROP SCHEMA columnar_indexes CASCADE;

From 1af50e98b3f2da5e9fe742ed433a7ea028877d69 Mon Sep 17 00:00:00 2001
From: Onur Tirtir
Date: Wed, 26 May 2021 16:45:05 +0300
Subject: [PATCH 3/7] Fix a comment in ColumnarMetapageRead

---
 src/backend/columnar/columnar_storage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/columnar/columnar_storage.c b/src/backend/columnar/columnar_storage.c
index a73275249..aba79303f 100644
--- a/src/backend/columnar/columnar_storage.c
+++ b/src/backend/columnar/columnar_storage.c
@@ -615,7 +615,7 @@ ColumnarMetapageRead(Relation rel, bool force)
 	if (nblocks == 0)
 	{
 		/*
-		 * We only expect this to happen during binary ugrades. This is because,
+		 * We only expect this to happen when upgrading citus.so. This is because,
 		 * in current version of columnar, we immediately create the metapage
 		 * for columnar tables, i.e right after creating the table.
 * However in older versions, we were creating metapages lazily, i.e

From 82ea1b5dafba66c1091bb16f86aa527009d8b178 Mon Sep 17 00:00:00 2001
From: Onur Tirtir
Date: Tue, 1 Jun 2021 10:43:51 +0300
Subject: [PATCH 4/7] Don't remove all paths, keep IndexPaths

---
 src/backend/columnar/columnar_customscan.c | 59 +++++++++++++++++-----
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/src/backend/columnar/columnar_customscan.c b/src/backend/columnar/columnar_customscan.c
index 4866c81ad..88760c654 100644
--- a/src/backend/columnar/columnar_customscan.c
+++ b/src/backend/columnar/columnar_customscan.c
@@ -27,6 +27,7 @@
 #include "columnar/columnar_customscan.h"
 #include "columnar/columnar_metadata.h"
 #include "columnar/columnar_tableam.h"
+#include "distributed/listutils.h"

 typedef struct ColumnarScanPath
 {
@@ -50,8 +51,13 @@ typedef struct ColumnarScanState
 } ColumnarScanState;


+typedef bool (*PathPredicate)(Path *path);
+
+
 static void ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
 									   RangeTblEntry *rte);
+static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate);
+static bool IsNotIndexPath(Path *path);
 static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel,
 									 RangeTblEntry *rte);
 static Cost ColumnarScanCost(RangeTblEntry *rte);
@@ -137,18 +143,6 @@ columnar_customscan_init()
 }


-static void
-clear_paths(RelOptInfo *rel)
-{
-	rel->pathlist = NIL;
-	rel->partial_pathlist = NIL;
-	rel->cheapest_startup_path = NULL;
-	rel->cheapest_total_path = NULL;
-	rel->cheapest_unique_path = NULL;
-	rel->cheapest_parameterized_paths = NIL;
-}
-
-
 static void
 ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
 						   RangeTblEntry *rte)
@@ -188,8 +182,13 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,

 		ereport(DEBUG1, (errmsg("pathlist hook for columnar table am")));

-		/* we propose a new path that will be the only path for scanning this relation */
-		clear_paths(rel);
+		/*
+		 * TODO: Since we don't have a proper costing model for
+		 * ColumnarCustomScan, we remove other paths to force postgres
+		 * to use ColumnarCustomScan. Note that we still keep index paths
+		 * since they might still be useful.
+		 */
+		RemovePathsByPredicate(rel, IsNotIndexPath);
 		add_path(rel, customPath);
 	}
 }
@@ -197,6 +196,38 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
 }


+/*
+ * RemovePathsByPredicate removes the paths that removePathPredicate
+ * evaluates to true from pathlist of given rel.
+ */
+static void
+RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate)
+{
+	List *filteredPathList = NIL;
+
+	Path *path = NULL;
+	foreach_ptr(path, rel->pathlist)
+	{
+		if (!removePathPredicate(path))
+		{
+			filteredPathList = lappend(filteredPathList, path);
+		}
+	}
+
+	rel->pathlist = filteredPathList;
+}
+
+
+/*
+ * IsNotIndexPath returns true if the given path is not an IndexPath.
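+ *
+ * For example, the pathlist hook above uses it as
+ *
+ *   RemovePathsByPredicate(rel, IsNotIndexPath);
+ *
+ * which keeps only IndexPaths in rel->pathlist before add_path() appends
+ * the columnar custom scan path. Any other PathPredicate could be plugged
+ * in the same way; this patch only defines IsNotIndexPath.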
+ */ +static bool +IsNotIndexPath(Path *path) +{ + return !IsA(path, IndexPath); +} + + static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { From 9b4dc2f804d67a1f5b41fab9d0dfeaa4fe365739 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Tue, 1 Jun 2021 02:22:26 +0300 Subject: [PATCH 5/7] Prevent using parallel scan for columnar index builds --- src/backend/columnar/columnar_tableam.c | 46 ++++++++++++++++--- .../regress/expected/columnar_indexes.out | 9 ++++ src/test/regress/sql/columnar_indexes.sql | 7 +++ 3 files changed, 56 insertions(+), 6 deletions(-) diff --git a/src/backend/columnar/columnar_tableam.c b/src/backend/columnar/columnar_tableam.c index 944032b84..3ed5d8187 100644 --- a/src/backend/columnar/columnar_tableam.c +++ b/src/backend/columnar/columnar_tableam.c @@ -343,21 +343,33 @@ ErrorIfInvalidRowNumber(uint64 rowNumber) static Size columnar_parallelscan_estimate(Relation rel) { - elog(ERROR, "columnar_parallelscan_estimate not implemented"); + return sizeof(ParallelBlockTableScanDescData); } static Size columnar_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan) { - elog(ERROR, "columnar_parallelscan_initialize not implemented"); + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + + bpscan->base.phs_relid = RelationGetRelid(rel); + bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel); + bpscan->base.phs_syncscan = synchronize_seqscans && + !RelationUsesLocalBuffers(rel) && + bpscan->phs_nblocks > NBuffers / 4; + SpinLockInit(&bpscan->phs_mutex); + bpscan->phs_startblock = InvalidBlockNumber; + pg_atomic_init_u64(&bpscan->phs_nallocated, 0); + + return sizeof(ParallelBlockTableScanDescData); } static void columnar_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) { - elog(ERROR, "columnar_parallelscan_reinitialize not implemented"); + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + pg_atomic_write_u64(&bpscan->phs_nallocated, 0); } @@ -1101,10 +1113,21 @@ columnar_index_build_range_scan(Relation columnarRelation, if (scan) { /* - * Since we don't support parallel reads on columnar tables, we - * should have already errored out for that, but be on the safe side. + * Scan is initialized iff postgres decided to build the index using + * parallel workers. In this case, we simply return for parallel + * workers since we don't support parallel scan on columnar tables. */ - ereport(ERROR, (errmsg("parallel reads on columnar are not supported"))); + if (IsBackgroundWorker) + { + ereport(DEBUG4, (errmsg("ignoring parallel worker when building " + "index since parallel scan on columnar " + "tables is not supported"))); + return 0; + } + + ereport(NOTICE, (errmsg("falling back to serial index build since " + "parallel scan on columnar tables is not " + "supported"))); } /* @@ -1643,6 +1666,17 @@ static const TableAmRoutine columnar_am_methods = { .scan_rescan = columnar_rescan, .scan_getnextslot = columnar_getnextslot, + /* + * Postgres calls following three callbacks during index builds, if it + * decides to use parallel workers when building the index. On the other + * hand, we don't support parallel scans on columnar tables but we also + * want to fallback to serial index build. For this reason, we both skip + * parallel workers in columnar_index_build_range_scan and also provide + * basic implementations for those callbacks based on their corresponding + * implementations in heapAM. 
From 5adab2a3acd5e815ff6fdeda4c2d83560524f89c Mon Sep 17 00:00:00 2001
From: Onur Tirtir
Date: Thu, 3 Jun 2021 17:56:47 +0300
Subject: [PATCH 6/7] Report progress when building index on columnar tables

---
 src/backend/columnar/columnar_metadata.c |  43 +++++++-
 src/backend/columnar/columnar_tableam.c  | 126 +++++++++++++++++++++--
 src/include/columnar/columnar.h          |   2 +
 3 files changed, 161 insertions(+), 10 deletions(-)

diff --git a/src/backend/columnar/columnar_metadata.c b/src/backend/columnar/columnar_metadata.c
index af62227a6..0ebb533c7 100644
--- a/src/backend/columnar/columnar_metadata.c
+++ b/src/backend/columnar/columnar_metadata.c
@@ -668,6 +668,46 @@ FindStripeByRowNumber(Relation relation, uint64 rowNumber, Snapshot snapshot)
 }
 
 
+/*
+ * FindStripeWithHighestRowNumber returns StripeMetadata for the stripe that
+ * has the row with the highest rowNumber by doing a backward index scan on
+ * stripe_first_row_number_idx. If given relation is empty, then returns NULL.
+ */ +StripeMetadata * +FindStripeWithHighestRowNumber(Relation relation, Snapshot snapshot) +{ + StripeMetadata *stripeWithHighestRowNumber = NULL; + + uint64 storageId = ColumnarStorageGetStorageId(relation, false); + ScanKeyData scanKey[1]; + ScanKeyInit(&scanKey[0], Anum_columnar_stripe_storageid, + BTEqualStrategyNumber, F_OIDEQ, Int32GetDatum(storageId)); + + Relation columnarStripes = table_open(ColumnarStripeRelationId(), AccessShareLock); + Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(), + AccessShareLock); + SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index, + snapshot, 1, scanKey); + + HeapTuple heapTuple = systable_getnext_ordered(scanDescriptor, BackwardScanDirection); + if (HeapTupleIsValid(heapTuple)) + { + TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes); + Datum datumArray[Natts_columnar_stripe]; + bool isNullArray[Natts_columnar_stripe]; + heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray); + + stripeWithHighestRowNumber = BuildStripeMetadata(datumArray); + } + + systable_endscan_ordered(scanDescriptor); + index_close(index, AccessShareLock); + table_close(columnarStripes, AccessShareLock); + + return stripeWithHighestRowNumber; +} + + /* * ReadChunkGroupRowCounts returns an array of row counts of chunk groups for the * given stripe. @@ -876,7 +916,8 @@ ReadDataFileStripeList(uint64 storageId, Snapshot snapshot) Oid columnarStripesOid = ColumnarStripeRelationId(); Relation columnarStripes = table_open(columnarStripesOid, AccessShareLock); - Relation index = index_open(ColumnarStripePKeyIndexRelationId(), AccessShareLock); + Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(), + AccessShareLock); TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes); SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index, diff --git a/src/backend/columnar/columnar_tableam.c b/src/backend/columnar/columnar_tableam.c index 3ed5d8187..89cb10ece 100644 --- a/src/backend/columnar/columnar_tableam.c +++ b/src/backend/columnar/columnar_tableam.c @@ -114,9 +114,15 @@ static Datum * detoast_values(TupleDesc tupleDesc, Datum *orig_values, bool *isn static ItemPointerData row_number_to_tid(uint64 rowNumber); static uint64 tid_to_row_number(ItemPointerData tid); static void ErrorIfInvalidRowNumber(uint64 rowNumber); +static void ColumnarReportTotalVirtualBlocks(Relation relation, Snapshot snapshot, + int progressArrIndex); +static BlockNumber ColumnarGetNumberOfVirtualBlocks(Relation relation, Snapshot snapshot); +static ItemPointerData ColumnarGetHighestItemPointer(Relation relation, + Snapshot snapshot); static double ColumnarReadRowsIntoIndex(TableScanDesc scan, Relation indexRelation, IndexInfo *indexInfo, + bool progress, IndexBuildCallback indexCallback, void *indexCallbackState, EState *estate, ExprState *predicate); @@ -1089,11 +1095,6 @@ columnar_index_build_range_scan(Relation columnarRelation, void *callback_state, TableScanDesc scan) { - /* - * TODO: Should this function call pgstat_progress_update_param in - * somewhere as heapam_index_build_range_scan ? - */ - if (start_blockno != 0 || numblocks != InvalidBlockNumber) { /* @@ -1183,6 +1184,12 @@ columnar_index_build_range_scan(Relation columnarRelation, snapshot = scan->rs_snapshot; } + if (progress) + { + ColumnarReportTotalVirtualBlocks(columnarRelation, snapshot, + PROGRESS_SCAN_BLOCKS_TOTAL); + } + /* * Set up execution state for predicate, if any. 
 	 * Note that this is only useful for partial indexes.
@@ -1193,10 +1200,17 @@ columnar_index_build_range_scan(Relation columnarRelation,
 	ExprState *predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
 
 	double reltuples = ColumnarReadRowsIntoIndex(scan, indexRelation, indexInfo,
-												 callback, callback_state, estate,
-												 predicate);
+												 progress, callback, callback_state,
+												 estate, predicate);
 	table_endscan(scan);
 
+	if (progress)
+	{
+		/* report the last "virtual" block as "done" */
+		ColumnarReportTotalVirtualBlocks(columnarRelation, snapshot,
+										 PROGRESS_SCAN_BLOCKS_DONE);
+	}
+
 	if (snapshotRegisteredByUs)
 	{
 		UnregisterSnapshot(snapshot);
@@ -1211,6 +1225,81 @@ columnar_index_build_range_scan(Relation columnarRelation,
 }
 
 
+/*
+ * ColumnarReportTotalVirtualBlocks reports progress for index build based on
+ * number of "virtual" blocks that given relation has.
+ * "progressArrIndex" argument determines which entry in st_progress_param
+ * array should be updated. In this case, we only expect PROGRESS_SCAN_BLOCKS_TOTAL
+ * or PROGRESS_SCAN_BLOCKS_DONE to specify whether we want to report calculated
+ * number of blocks as "done" or as "total" number of "virtual" blocks to scan.
+ */
+static void
+ColumnarReportTotalVirtualBlocks(Relation relation, Snapshot snapshot,
+								 int progressArrIndex)
+{
+	/*
+	 * Indeed, columnar tables might have gaps between row numbers, e.g.
+	 * due to aborted transactions etc. Also, ItemPointer BlockNumber's
+	 * for columnar tables don't actually correspond to actual disk blocks
+	 * as in heapAM. For this reason, we call them "virtual" blocks. At
+	 * the moment, we believe it is better to report our progress based on
+	 * this "virtual" block concept instead of doing nothing.
+	 */
+	Assert(progressArrIndex == PROGRESS_SCAN_BLOCKS_TOTAL ||
+		   progressArrIndex == PROGRESS_SCAN_BLOCKS_DONE);
+	BlockNumber nvirtualBlocks =
+		ColumnarGetNumberOfVirtualBlocks(relation, snapshot);
+	pgstat_progress_update_param(progressArrIndex, nvirtualBlocks);
+}
+
+
+/*
+ * ColumnarGetNumberOfVirtualBlocks returns total number of "virtual" blocks
+ * that given columnar table has, based on ItemPointer BlockNumber's.
+ */
+static BlockNumber
+ColumnarGetNumberOfVirtualBlocks(Relation relation, Snapshot snapshot)
+{
+	ItemPointerData highestItemPointer =
+		ColumnarGetHighestItemPointer(relation, snapshot);
+	if (!ItemPointerIsValid(&highestItemPointer))
+	{
+		/* table is empty according to our snapshot */
+		return 0;
+	}
+
+	/*
+	 * Since BlockNumber is 0-based, increment it by 1 to find the total
+	 * number of "virtual" blocks.
+	 */
+	return ItemPointerGetBlockNumber(&highestItemPointer) + 1;
+}
+
+
+/*
+ * ColumnarGetHighestItemPointer returns ItemPointerData for the tuple with
+ * highest tid for given relation.
+ * If given relation is empty, then returns invalid item pointer.
+ */
+static ItemPointerData
+ColumnarGetHighestItemPointer(Relation relation, Snapshot snapshot)
+{
+	StripeMetadata *stripeWithHighestRowNumber =
+		FindStripeWithHighestRowNumber(relation, snapshot);
+	if (stripeWithHighestRowNumber == NULL)
+	{
+		/* table is empty according to our snapshot */
+		ItemPointerData invalidItemPtr;
+		ItemPointerSetInvalid(&invalidItemPtr);
+		return invalidItemPtr;
+	}
+
+	uint64 highestRowNumber = stripeWithHighestRowNumber->firstRowNumber +
+							  stripeWithHighestRowNumber->rowCount - 1;
+	return row_number_to_tid(highestRowNumber);
+}
+
+
 /*
  * ColumnarReadRowsIntoIndex builds indexRelation tuples by reading the
  * actual relation based on given "scan" and returns number of tuples
@@ -1218,17 +1307,36 @@ columnar_index_build_range_scan(Relation columnarRelation,
  */
 static double
 ColumnarReadRowsIntoIndex(TableScanDesc scan, Relation indexRelation,
-						  IndexInfo *indexInfo, IndexBuildCallback indexCallback,
-						  void *indexCallbackState, EState *estate, ExprState *predicate)
+						  IndexInfo *indexInfo, bool progress,
+						  IndexBuildCallback indexCallback,
+						  void *indexCallbackState, EState *estate,
+						  ExprState *predicate)
 {
 	double reltuples = 0;
 
+	BlockNumber lastReportedBlockNumber = InvalidBlockNumber;
+
 	ExprContext *econtext = GetPerTupleExprContext(estate);
 	TupleTableSlot *slot = econtext->ecxt_scantuple;
 	while (columnar_getnextslot(scan, ForwardScanDirection, slot))
 	{
 		CHECK_FOR_INTERRUPTS();
 
+		BlockNumber currentBlockNumber = ItemPointerGetBlockNumber(&slot->tts_tid);
+		if (progress && lastReportedBlockNumber != currentBlockNumber)
+		{
+			/*
+			 * columnar_getnextslot guarantees that returned tuple will
+			 * always have a greater ItemPointer than the ones we fetched
+			 * before, so we directly use BlockNumber to report our progress.
+			 */
+			Assert(lastReportedBlockNumber == InvalidBlockNumber ||
+				   currentBlockNumber >= lastReportedBlockNumber);
+			pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
+										 currentBlockNumber);
+			lastReportedBlockNumber = currentBlockNumber;
+		}
+
 		MemoryContextReset(econtext->ecxt_per_tuple_memory);
 
 		if (predicate != NULL && !ExecQual(predicate, econtext))
diff --git a/src/include/columnar/columnar.h b/src/include/columnar/columnar.h
index 8288a20aa..e4770acc2 100644
--- a/src/include/columnar/columnar.h
+++ b/src/include/columnar/columnar.h
@@ -256,6 +256,8 @@ extern StripeSkipList * ReadStripeSkipList(RelFileNode relfilenode, uint64 strip
 											uint32 chunkCount);
 extern StripeMetadata * FindStripeByRowNumber(Relation relation, uint64 rowNumber,
 											  Snapshot snapshot);
+extern StripeMetadata * FindStripeWithHighestRowNumber(Relation relation,
+													   Snapshot snapshot);
 
 extern Datum columnar_relation_storageid(PG_FUNCTION_ARGS);
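
With progress reporting in place, a second session can watch an in-flight
columnar index build through the standard progress view. A minimal sketch;
run it while CREATE INDEX executes elsewhere, and note the column subset
shown here is just for brevity.

    SELECT relid::regclass, phase, blocks_total, blocks_done
    FROM pg_stat_progress_create_index;
    -- For columnar tables, blocks_total and blocks_done count the "virtual"
    -- blocks derived from row numbers, not physical disk blocks.
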
columnar tables"))); + errmsg("clustering columnar tables using indexes is " + "not supported"))); } /* diff --git a/src/test/regress/expected/columnar_indexes.out b/src/test/regress/expected/columnar_indexes.out index 32af88c92..9edc0900d 100644 --- a/src/test/regress/expected/columnar_indexes.out +++ b/src/test/regress/expected/columnar_indexes.out @@ -192,6 +192,12 @@ ERROR: duplicate key value violates unique constraint "columnar_table_pkey" DETAIL: Key (a)=(16999) already exists. ROLLBACK; VACUUM FULL columnar_table; +-- show that we don't support clustering columnar tables using indexes +CLUSTER columnar_table USING columnar_table_pkey; +ERROR: clustering columnar tables using indexes is not supported +ALTER TABLE columnar_table CLUSTER ON columnar_table_pkey; +CLUSTER columnar_table; +ERROR: clustering columnar tables using indexes is not supported -- should error even after vacuum INSERT INTO columnar_table VALUES (16999); ERROR: duplicate key value violates unique constraint "columnar_table_pkey" diff --git a/src/test/regress/sql/columnar_indexes.sql b/src/test/regress/sql/columnar_indexes.sql index 2b48e2a37..43ad75221 100644 --- a/src/test/regress/sql/columnar_indexes.sql +++ b/src/test/regress/sql/columnar_indexes.sql @@ -124,6 +124,13 @@ BEGIN; ROLLBACK; VACUUM FULL columnar_table; + +-- show that we don't support clustering columnar tables using indexes +CLUSTER columnar_table USING columnar_table_pkey; + +ALTER TABLE columnar_table CLUSTER ON columnar_table_pkey; +CLUSTER columnar_table; + -- should error even after vacuum INSERT INTO columnar_table VALUES (16999);