Merge pull request #5052 from citusdata/columnar-index

Merge columnar metapage changes and basic index support
Onur Tirtir 2021-06-17 14:55:40 +03:00 committed by GitHub
commit b0ca823b4d
60 changed files with 3522 additions and 559 deletions

View File: src/backend/columnar/columnar_customscan.c

@@ -27,6 +27,7 @@
 #include "columnar/columnar_customscan.h"
 #include "columnar/columnar_metadata.h"
 #include "columnar/columnar_tableam.h"
+#include "distributed/listutils.h"
 
 typedef struct ColumnarScanPath
 {
@@ -50,8 +51,13 @@ typedef struct ColumnarScanState
 } ColumnarScanState;
 
+typedef bool (*PathPredicate)(Path *path);
+
 static void ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
                                        RangeTblEntry *rte);
+static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate);
+static bool IsNotIndexPath(Path *path);
 static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel,
                                      RangeTblEntry *rte);
 static Cost ColumnarScanCost(RangeTblEntry *rte);
@@ -137,18 +143,6 @@ columnar_customscan_init()
 }
 
-static void
-clear_paths(RelOptInfo *rel)
-{
-	rel->pathlist = NIL;
-	rel->partial_pathlist = NIL;
-	rel->cheapest_startup_path = NULL;
-	rel->cheapest_total_path = NULL;
-	rel->cheapest_unique_path = NULL;
-	rel->cheapest_parameterized_paths = NIL;
-}
-
 static void
 ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
                            RangeTblEntry *rte)
@@ -188,8 +182,13 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
 		ereport(DEBUG1, (errmsg("pathlist hook for columnar table am")));
 
-		/* we propose a new path that will be the only path for scanning this relation */
-		clear_paths(rel);
+		/*
+		 * TODO: Since we don't have a proper costing model for
+		 * ColumnarCustomScan, we remove other paths to force postgres
+		 * using ColumnarCustomScan. Note that we still keep index paths
+		 * since they still might be useful.
+		 */
+		RemovePathsByPredicate(rel, IsNotIndexPath);
 		add_path(rel, customPath);
 	}
 }
@@ -197,6 +196,38 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
 }
 
+/*
+ * RemovePathsByPredicate removes the paths that removePathPredicate
+ * evaluates to true from pathlist of given rel.
+ */
+static void
+RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate)
+{
+	List *filteredPathList = NIL;
+
+	Path *path = NULL;
+	foreach_ptr(path, rel->pathlist)
+	{
+		if (!removePathPredicate(path))
+		{
+			filteredPathList = lappend(filteredPathList, path);
+		}
+	}
+
+	rel->pathlist = filteredPathList;
+}
+
+
+/*
+ * IsNotIndexPath returns true if given path is not an IndexPath.
+ */
+static bool
+IsNotIndexPath(Path *path)
+{
+	return !IsA(path, IndexPath);
+}
+
+
 static Path *
 CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 {
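Keeping index paths is what makes the basic index support in this PR reachable from the planner. A minimal sketch of the intended effect, with hypothetical table and index names (not part of this diff):

    CREATE TABLE events (id int8, payload text) USING columnar;
    CREATE INDEX events_id_idx ON events (id);
    -- the index path survives RemovePathsByPredicate, so a selective
    -- lookup may now use events_id_idx instead of ColumnarCustomScan
    SELECT payload FROM events WHERE id = 42;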

View File: src/backend/columnar/columnar_debug.c

@@ -12,6 +12,7 @@
 #include "pg_config.h"
 #include "access/nbtree.h"
+#include "access/table.h"
 #include "catalog/pg_am.h"
 #include "catalog/pg_type.h"
 #include "distributed/pg_version_constants.h"
@@ -25,11 +26,13 @@
 #include "utils/tuplestore.h"
 
 #include "columnar/columnar.h"
+#include "columnar/columnar_storage.h"
 #include "columnar/columnar_version_compat.h"
 
 static void MemoryContextTotals(MemoryContext context, MemoryContextCounters *counters);
 
 PG_FUNCTION_INFO_V1(columnar_store_memory_stats);
+PG_FUNCTION_INFO_V1(columnar_storage_info);
/* /*
@@ -72,6 +75,74 @@
 }
 
+/*
+ * columnar_storage_info - UDF to return internal storage info for a columnar relation.
+ *
+ * DDL:
+ *  CREATE OR REPLACE FUNCTION columnar_storage_info(
+ *    rel regclass,
+ *    version_major OUT int4,
+ *    version_minor OUT int4,
+ *    storage_id OUT int8,
+ *    reserved_stripe_id OUT int8,
+ *    reserved_row_number OUT int8,
+ *    reserved_offset OUT int8)
+ *  STRICT
+ *  LANGUAGE c AS 'MODULE_PATHNAME', 'columnar_storage_info';
+ */
+Datum
+columnar_storage_info(PG_FUNCTION_ARGS)
+{
+#define STORAGE_INFO_NATTS 6
+	Oid relid = PG_GETARG_OID(0);
+	TupleDesc tupdesc;
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+	{
+		elog(ERROR, "return type must be a row type");
+	}
+
+	if (tupdesc->natts != STORAGE_INFO_NATTS)
+	{
+		elog(ERROR, "return type must have %d columns", STORAGE_INFO_NATTS);
+	}
+
+	Relation rel = table_open(relid, AccessShareLock);
+	if (!IsColumnarTableAmTable(relid))
+	{
+		ereport(ERROR, (errmsg("table \"%s\" is not a columnar table",
+							   RelationGetRelationName(rel))));
+	}
+
+	RelationOpenSmgr(rel);
+
+	Datum values[STORAGE_INFO_NATTS] = { 0 };
+	bool nulls[STORAGE_INFO_NATTS] = { 0 };
+
+	/*
+	 * Pass force = true so that we can inspect metapages that are not the
+	 * current version.
+	 *
+	 * NB: ensure the order and number of attributes correspond to DDL
+	 * declaration.
+	 */
+	values[0] = Int32GetDatum(ColumnarStorageGetVersionMajor(rel, true));
+	values[1] = Int32GetDatum(ColumnarStorageGetVersionMinor(rel, true));
+	values[2] = Int64GetDatum(ColumnarStorageGetStorageId(rel, true));
+	values[3] = Int64GetDatum(ColumnarStorageGetReservedStripeId(rel, true));
+	values[4] = Int64GetDatum(ColumnarStorageGetReservedRowNumber(rel, true));
+	values[5] = Int64GetDatum(ColumnarStorageGetReservedOffset(rel, true));
+
+	/* release lock */
+	table_close(rel, AccessShareLock);
+
+	HeapTuple tuple = heap_form_tuple(tupdesc, values, nulls);
+	PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
+}
+
+
 /*
  * MemoryContextTotals adds stats of the given memory context and its
  * subtree to the given counters.
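For reference, a minimal sketch of calling the new UDF, assuming the DDL above has been applied; the table name my_columnar is hypothetical:

    SELECT version_major, version_minor, storage_id,
           reserved_stripe_id, reserved_row_number, reserved_offset
    FROM columnar_storage_info('my_columnar');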

View File: src/backend/columnar/columnar_metadata.c

@@ -1,8 +1,19 @@
 /*-------------------------------------------------------------------------
  *
- * columnar_metadata_tables.c
+ * columnar_metadata.c
  *
- * Copyright (c), Citus Data, Inc.
+ * Copyright (c) Citus Data, Inc.
+ *
+ * Manages metadata for columnar relations in separate, shared metadata tables
+ * in the "columnar" schema.
+ *
+ *   * holds basic stripe information including data size and row counts
+ *   * holds basic chunk and chunk group information like data offsets and
+ *     min/max values (used for Chunk Group Filtering)
+ *   * useful for fast VACUUM operations (e.g. reporting with VACUUM VERBOSE)
+ *   * useful for stats/costing
+ *   * maps logical row numbers to stripe IDs
+ *   * TODO: visibility information
  *
  *-------------------------------------------------------------------------
  */
@@ -14,7 +25,9 @@
 
 #include "citus_version.h"
 #include "columnar/columnar.h"
+#include "columnar/columnar_storage.h"
 #include "columnar/columnar_version_compat.h"
+#include "distributed/listutils.h"
 
 #include <sys/stat.h>
 #include "access/heapam.h"
@@ -30,7 +43,6 @@
 #include "commands/sequence.h"
 #include "commands/trigger.h"
 #include "distributed/metadata_cache.h"
-#include "distributed/resource_lock.h"
 #include "executor/executor.h"
 #include "executor/spi.h"
 #include "miscadmin.h"
@@ -48,28 +60,6 @@
 #include "utils/relfilenodemap.h"
 
-/*
- * Content of the first page in main fork, which stores metadata at file
- * level.
- */
-typedef struct ColumnarMetapage
-{
-	/*
-	 * Store version of file format used, so we can detect files from
-	 * previous versions if we change file format.
-	 */
-	int versionMajor;
-	int versionMinor;
-
-	/*
-	 * Each of the metadata table rows are identified by a storageId.
-	 * We store it also in the main fork so we can link metadata rows
-	 * with data files.
-	 */
-	uint64 storageId;
-} ColumnarMetapage;
-
 typedef struct
 {
 	Relation rel;
@@ -80,14 +70,14 @@ static void InsertStripeMetadataRow(uint64 storageId, StripeMetadata *stripe);
 static void GetHighestUsedAddressAndId(uint64 storageId,
                                        uint64 *highestUsedAddress,
                                        uint64 *highestUsedId);
-static void LockForStripeReservation(Relation rel, LOCKMODE mode);
-static void UnlockForStripeReservation(Relation rel, LOCKMODE mode);
 static List * ReadDataFileStripeList(uint64 storageId, Snapshot snapshot);
+static StripeMetadata * BuildStripeMetadata(Datum *datumArray);
 static uint32 * ReadChunkGroupRowCounts(uint64 storageId, uint64 stripe, uint32
                                         chunkGroupCount);
 static Oid ColumnarStorageIdSequenceRelationId(void);
 static Oid ColumnarStripeRelationId(void);
-static Oid ColumnarStripeIndexRelationId(void);
+static Oid ColumnarStripePKeyIndexRelationId(void);
+static Oid ColumnarStripeFirstRowNumberIndexRelationId(void);
 static Oid ColumnarOptionsRelationId(void);
 static Oid ColumnarOptionsIndexRegclass(void);
 static Oid ColumnarChunkRelationId(void);
@@ -95,6 +85,8 @@ static Oid ColumnarChunkGroupRelationId(void);
 static Oid ColumnarChunkIndexRelationId(void);
 static Oid ColumnarChunkGroupIndexRelationId(void);
 static Oid ColumnarNamespaceId(void);
+static uint64 LookupStorageId(RelFileNode relfilenode);
+static uint64 GetHighestUsedFirstRowNumber(uint64 storageId);
 static void DeleteStorageFromColumnarMetadataTable(Oid metadataTableId,
                                                    AttrNumber storageIdAtrrNumber,
                                                    Oid storageIdIndexId,
@@ -107,8 +99,6 @@ static void FinishModifyRelation(ModifyState *state);
 static EState * create_estate_for_relation(Relation rel);
 static bytea * DatumToBytea(Datum value, Form_pg_attribute attrForm);
 static Datum ByteaToDatum(bytea *bytes, Form_pg_attribute attrForm);
-static ColumnarMetapage * InitMetapage(Relation relation);
-static ColumnarMetapage * ReadMetapage(RelFileNode relfilenode, bool missingOk);
 static bool WriteColumnarOptions(Oid regclass, ColumnarOptions *options, bool overwrite);
 
 PG_FUNCTION_INFO_V1(columnar_relation_storageid);
@@ -140,7 +130,7 @@ typedef FormData_columnar_options *Form_columnar_options;
 
 /* constants for columnar.stripe */
-#define Natts_columnar_stripe 8
+#define Natts_columnar_stripe 9
 #define Anum_columnar_stripe_storageid 1
 #define Anum_columnar_stripe_stripe 2
 #define Anum_columnar_stripe_file_offset 3
@@ -149,6 +139,7 @@ typedef FormData_columnar_options *Form_columnar_options;
 #define Anum_columnar_stripe_chunk_row_count 6
 #define Anum_columnar_stripe_row_count 7
 #define Anum_columnar_stripe_chunk_count 8
+#define Anum_columnar_stripe_first_row_number 9
 
 /* constants for columnar.chunk_group */
 #define Natts_columnar_chunkgroup 4
@@ -423,7 +414,7 @@ SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe, StripeSkipList *chunk
 	uint32 chunkIndex = 0;
 	uint32 columnCount = chunkList->columnCount;
 
-	ColumnarMetapage *metapage = ReadMetapage(relfilenode, false);
+	uint64 storageId = LookupStorageId(relfilenode);
 	Oid columnarChunkOid = ColumnarChunkRelationId();
 	Relation columnarChunk = table_open(columnarChunkOid, RowExclusiveLock);
 	ModifyState *modifyState = StartModifyRelation(columnarChunk);
@@ -436,7 +427,7 @@ SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe, StripeSkipList *chunk
 				&chunkList->chunkSkipNodeArray[columnIndex][chunkIndex];
 
 			Datum values[Natts_columnar_chunk] = {
-				UInt64GetDatum(metapage->storageId),
+				UInt64GetDatum(storageId),
 				Int64GetDatum(stripe),
 				Int32GetDatum(columnIndex + 1),
 				Int32GetDatum(chunkIndex),
@@ -487,7 +478,7 @@ void
 SaveChunkGroups(RelFileNode relfilenode, uint64 stripe,
                 List *chunkGroupRowCounts)
 {
-	ColumnarMetapage *metapage = ReadMetapage(relfilenode, false);
+	uint64 storageId = LookupStorageId(relfilenode);
 	Oid columnarChunkGroupOid = ColumnarChunkGroupRelationId();
 	Relation columnarChunkGroup = table_open(columnarChunkGroupOid, RowExclusiveLock);
 	ModifyState *modifyState = StartModifyRelation(columnarChunkGroup);
@@ -499,7 +490,7 @@ SaveChunkGroups(RelFileNode relfilenode, uint64 stripe,
 	{
 		int64 rowCount = lfirst_int(lc);
 		Datum values[Natts_columnar_chunkgroup] = {
-			UInt64GetDatum(metapage->storageId),
+			UInt64GetDatum(storageId),
 			Int64GetDatum(stripe),
 			Int32GetDatum(chunkId),
 			Int64GetDatum(rowCount)
@@ -530,14 +521,14 @@ ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, TupleDesc tupleDescri
 	uint32 columnCount = tupleDescriptor->natts;
 	ScanKeyData scanKey[2];
 
-	ColumnarMetapage *metapage = ReadMetapage(relfilenode, false);
+	uint64 storageId = LookupStorageId(relfilenode);
 
 	Oid columnarChunkOid = ColumnarChunkRelationId();
 	Relation columnarChunk = table_open(columnarChunkOid, AccessShareLock);
 	Relation index = index_open(ColumnarChunkIndexRelationId(), AccessShareLock);
 
 	ScanKeyInit(&scanKey[0], Anum_columnar_chunk_storageid,
-				BTEqualStrategyNumber, F_OIDEQ, UInt64GetDatum(metapage->storageId));
+				BTEqualStrategyNumber, F_OIDEQ, UInt64GetDatum(storageId));
 	ScanKeyInit(&scanKey[1], Anum_columnar_chunk_stripe,
 				BTEqualStrategyNumber, F_OIDEQ, Int32GetDatum(stripe));
@@ -624,12 +615,99 @@ ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, TupleDesc tupleDescri
 	table_close(columnarChunk, AccessShareLock);
 
 	chunkList->chunkGroupRowCounts =
-		ReadChunkGroupRowCounts(metapage->storageId, stripe, chunkCount);
+		ReadChunkGroupRowCounts(storageId, stripe, chunkCount);
 
 	return chunkList;
 }
 
 
+/*
+ * FindStripeByRowNumber returns StripeMetadata for the stripe that has the
+ * row with rowNumber by doing backward index scan on
+ * stripe_first_row_number_idx. If no such row exists, then returns NULL.
+ */
+StripeMetadata *
+FindStripeByRowNumber(Relation relation, uint64 rowNumber, Snapshot snapshot)
+{
+	StripeMetadata *foundStripeMetadata = NULL;
+
+	uint64 storageId = ColumnarStorageGetStorageId(relation, false);
+	ScanKeyData scanKey[2];
+	ScanKeyInit(&scanKey[0], Anum_columnar_stripe_storageid,
+				BTEqualStrategyNumber, F_OIDEQ, Int32GetDatum(storageId));
+	ScanKeyInit(&scanKey[1], Anum_columnar_stripe_first_row_number,
+				BTLessEqualStrategyNumber, F_INT8LE, UInt64GetDatum(rowNumber));
+
+	Relation columnarStripes = table_open(ColumnarStripeRelationId(), AccessShareLock);
+	Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(),
+								AccessShareLock);
+	SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index,
+															snapshot, 2,
+															scanKey);
+
+	HeapTuple heapTuple = systable_getnext_ordered(scanDescriptor, BackwardScanDirection);
+	if (HeapTupleIsValid(heapTuple))
+	{
+		TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes);
+		Datum datumArray[Natts_columnar_stripe];
+		bool isNullArray[Natts_columnar_stripe];
+		heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray);
+
+		StripeMetadata *stripeMetadata = BuildStripeMetadata(datumArray);
+		if (rowNumber < stripeMetadata->firstRowNumber + stripeMetadata->rowCount)
+		{
+			foundStripeMetadata = stripeMetadata;
+		}
+	}
+
+	systable_endscan_ordered(scanDescriptor);
+	index_close(index, AccessShareLock);
+	table_close(columnarStripes, AccessShareLock);
+
+	return foundStripeMetadata;
+}
+
+
+/*
+ * FindStripeWithHighestRowNumber returns StripeMetadata for the stripe that
+ * has the row with highest rowNumber by doing backward index scan on
+ * stripe_first_row_number_idx. If given relation is empty, then returns NULL.
+ */
+StripeMetadata *
+FindStripeWithHighestRowNumber(Relation relation, Snapshot snapshot)
+{
+	StripeMetadata *stripeWithHighestRowNumber = NULL;
+
+	uint64 storageId = ColumnarStorageGetStorageId(relation, false);
+	ScanKeyData scanKey[1];
+	ScanKeyInit(&scanKey[0], Anum_columnar_stripe_storageid,
+				BTEqualStrategyNumber, F_OIDEQ, Int32GetDatum(storageId));
+
+	Relation columnarStripes = table_open(ColumnarStripeRelationId(), AccessShareLock);
+	Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(),
+								AccessShareLock);
+	SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index,
+															snapshot, 1, scanKey);
+
+	HeapTuple heapTuple = systable_getnext_ordered(scanDescriptor, BackwardScanDirection);
+	if (HeapTupleIsValid(heapTuple))
+	{
+		TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes);
+		Datum datumArray[Natts_columnar_stripe];
+		bool isNullArray[Natts_columnar_stripe];
+		heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray);
+
+		stripeWithHighestRowNumber = BuildStripeMetadata(datumArray);
+	}
+
+	systable_endscan_ordered(scanDescriptor);
+	index_close(index, AccessShareLock);
+	table_close(columnarStripes, AccessShareLock);
+
+	return stripeWithHighestRowNumber;
+}
 /*
  * ReadChunkGroupRowCounts returns an array of row counts of chunk groups for the
  * given stripe.
@@ -704,7 +782,8 @@ InsertStripeMetadataRow(uint64 storageId, StripeMetadata *stripe)
 		Int32GetDatum(stripe->columnCount),
 		Int32GetDatum(stripe->chunkGroupRowCount),
 		Int64GetDatum(stripe->rowCount),
-		Int32GetDatum(stripe->chunkCount)
+		Int32GetDatum(stripe->chunkCount),
+		UInt64GetDatum(stripe->firstRowNumber)
 	};
 
 	Oid columnarStripesOid = ColumnarStripeRelationId();
@@ -729,15 +808,9 @@ InsertStripeMetadataRow(uint64 storageId, StripeMetadata *stripe)
 List *
 StripesForRelfilenode(RelFileNode relfilenode)
 {
-	ColumnarMetapage *metapage = ReadMetapage(relfilenode, true);
-	if (metapage == NULL)
-	{
-		/* empty relation */
-		return NIL;
-	}
+	uint64 storageId = LookupStorageId(relfilenode);
 
-	return ReadDataFileStripeList(metapage->storageId, GetTransactionSnapshot());
+	return ReadDataFileStripeList(storageId, GetTransactionSnapshot());
 }
@@ -752,17 +825,11 @@ StripesForRelfilenode(RelFileNode relfilenode)
 uint64
 GetHighestUsedAddress(RelFileNode relfilenode)
 {
+	uint64 storageId = LookupStorageId(relfilenode);
+
 	uint64 highestUsedAddress = 0;
 	uint64 highestUsedId = 0;
 
-	ColumnarMetapage *metapage = ReadMetapage(relfilenode, true);
-
-	/* empty data file? */
-	if (metapage == NULL)
-	{
-		return 0;
-	}
-
-	GetHighestUsedAddressAndId(metapage->storageId, &highestUsedAddress, &highestUsedId);
+	GetHighestUsedAddressAndId(storageId, &highestUsedAddress, &highestUsedId);
 
 	return highestUsedAddress;
 }
@@ -799,35 +866,6 @@ GetHighestUsedAddressAndId(uint64 storageId,
 }
 
 
-/*
- * LockForStripeReservation acquires a lock for stripe reservation.
- */
-static void
-LockForStripeReservation(Relation rel, LOCKMODE mode)
-{
-	/*
-	 * We use an advisory lock here so we can easily detect these kind of
-	 * locks in IsProcessWaitingForSafeOperations() and don't include them
-	 * in the lock graph.
-	 */
-	LOCKTAG tag;
-	SET_LOCKTAG_COLUMNAR_STRIPE_RESERVATION(tag, rel);
-	LockAcquire(&tag, mode, false, false);
-}
-
-
-/*
- * UnlockForStripeReservation releases the stripe reservation lock.
- */
-static void
-UnlockForStripeReservation(Relation rel, LOCKMODE mode)
-{
-	LOCKTAG tag;
-	SET_LOCKTAG_COLUMNAR_STRIPE_RESERVATION(tag, rel);
-	LockRelease(&tag, mode, false);
-}
-
-
 /*
  * ReserveStripe reserves a stripe of given size for the given relation,
  * and inserts it into columnar.stripe. It is guaranteed that concurrent
@@ -836,50 +874,15 @@ UnlockForStripeReservation(Relation rel, LOCKMODE mode)
 StripeMetadata
 ReserveStripe(Relation rel, uint64 sizeBytes,
               uint64 rowCount, uint64 columnCount,
-              uint64 chunkCount, uint64 chunkGroupRowCount)
+              uint64 chunkCount, uint64 chunkGroupRowCount,
+              uint64 stripeFirstRowNumber)
 {
 	StripeMetadata stripe = { 0 };
-	uint64 currLogicalHigh = 0;
-	uint64 highestId = 0;
 
-	/*
-	 * We take ExclusiveLock here, so two space reservations conflict.
-	 */
-	LOCKMODE lockMode = ExclusiveLock;
-	LockForStripeReservation(rel, lockMode);
-
-	RelFileNode relfilenode = rel->rd_node;
-
-	/*
-	 * If this is the first stripe for this relation, initialize the
-	 * metapage, otherwise use the previously initialized metapage.
-	 */
-	ColumnarMetapage *metapage = ReadMetapage(relfilenode, true);
-	if (metapage == NULL)
-	{
-		metapage = InitMetapage(rel);
-	}
-
-	GetHighestUsedAddressAndId(metapage->storageId, &currLogicalHigh, &highestId);
-	SmgrAddr currSmgrHigh = logical_to_smgr(currLogicalHigh);
-
-	SmgrAddr resSmgrStart = next_block_start(currSmgrHigh);
-	uint64 resLogicalStart = smgr_to_logical(resSmgrStart);
-
-	uint64 resLogicalEnd = resLogicalStart + sizeBytes - 1;
-	SmgrAddr resSmgrEnd = logical_to_smgr(resLogicalEnd);
-
-	RelationOpenSmgr(rel);
-	uint64 nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
-
-	while (resSmgrEnd.blockno >= nblocks)
-	{
-		Buffer newBuffer = ReadBuffer(rel, P_NEW);
-		ReleaseBuffer(newBuffer);
-		nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
-	}
-
-	RelationCloseSmgr(rel);
+	uint64 storageId = ColumnarStorageGetStorageId(rel, false);
+
+	uint64 stripeId = ColumnarStorageReserveStripe(rel);
+	uint64 resLogicalStart = ColumnarStorageReserveData(rel, sizeBytes);
 
 	stripe.fileOffset = resLogicalStart;
 	stripe.dataLength = sizeBytes;
@@ -887,11 +890,10 @@ ReserveStripe(Relation rel, uint64 sizeBytes,
 	stripe.chunkGroupRowCount = chunkGroupRowCount;
 	stripe.columnCount = columnCount;
 	stripe.rowCount = rowCount;
-	stripe.id = highestId + 1;
+	stripe.id = stripeId;
+	stripe.firstRowNumber = stripeFirstRowNumber;
 
-	InsertStripeMetadataRow(metapage->storageId, &stripe);
-
-	UnlockForStripeReservation(rel, lockMode);
+	InsertStripeMetadataRow(storageId, &stripe);
 
 	return stripe;
 }
@@ -914,7 +916,8 @@ ReadDataFileStripeList(uint64 storageId, Snapshot snapshot)
 
 	Oid columnarStripesOid = ColumnarStripeRelationId();
 	Relation columnarStripes = table_open(columnarStripesOid, AccessShareLock);
-	Relation index = index_open(ColumnarStripeIndexRelationId(), AccessShareLock);
+	Relation index = index_open(ColumnarStripeFirstRowNumberIndexRelationId(),
+								AccessShareLock);
 	TupleDesc tupleDescriptor = RelationGetDescr(columnarStripes);
 
 	SysScanDesc scanDescriptor = systable_beginscan_ordered(columnarStripes, index,
@@ -927,22 +930,7 @@ ReadDataFileStripeList(uint64 storageId, Snapshot snapshot)
 		bool isNullArray[Natts_columnar_stripe];
 
 		heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray);
-
-		StripeMetadata *stripeMetadata = palloc0(sizeof(StripeMetadata));
-		stripeMetadata->id = DatumGetInt64(datumArray[Anum_columnar_stripe_stripe - 1]);
-		stripeMetadata->fileOffset = DatumGetInt64(
-			datumArray[Anum_columnar_stripe_file_offset - 1]);
-		stripeMetadata->dataLength = DatumGetInt64(
-			datumArray[Anum_columnar_stripe_data_length - 1]);
-		stripeMetadata->columnCount = DatumGetInt32(
-			datumArray[Anum_columnar_stripe_column_count - 1]);
-		stripeMetadata->chunkCount = DatumGetInt32(
-			datumArray[Anum_columnar_stripe_chunk_count - 1]);
-		stripeMetadata->chunkGroupRowCount = DatumGetInt32(
-			datumArray[Anum_columnar_stripe_chunk_row_count - 1]);
-		stripeMetadata->rowCount = DatumGetInt64(
-			datumArray[Anum_columnar_stripe_row_count - 1]);
+		StripeMetadata *stripeMetadata = BuildStripeMetadata(datumArray);
 
 		stripeMetadataList = lappend(stripeMetadataList, stripeMetadata);
 	}
@@ -954,6 +942,32 @@ ReadDataFileStripeList(uint64 storageId, Snapshot snapshot)
 }
 
 
+/*
+ * BuildStripeMetadata builds a StripeMetadata object from given datumArray.
+ */
+static StripeMetadata *
+BuildStripeMetadata(Datum *datumArray)
+{
+	StripeMetadata *stripeMetadata = palloc0(sizeof(StripeMetadata));
+	stripeMetadata->id = DatumGetInt64(datumArray[Anum_columnar_stripe_stripe - 1]);
+	stripeMetadata->fileOffset = DatumGetInt64(
+		datumArray[Anum_columnar_stripe_file_offset - 1]);
+	stripeMetadata->dataLength = DatumGetInt64(
+		datumArray[Anum_columnar_stripe_data_length - 1]);
+	stripeMetadata->columnCount = DatumGetInt32(
+		datumArray[Anum_columnar_stripe_column_count - 1]);
+	stripeMetadata->chunkCount = DatumGetInt32(
+		datumArray[Anum_columnar_stripe_chunk_count - 1]);
+	stripeMetadata->chunkGroupRowCount = DatumGetInt32(
+		datumArray[Anum_columnar_stripe_chunk_row_count - 1]);
+	stripeMetadata->rowCount = DatumGetInt64(
+		datumArray[Anum_columnar_stripe_row_count - 1]);
+	stripeMetadata->firstRowNumber = DatumGetUInt64(
+		datumArray[Anum_columnar_stripe_first_row_number - 1]);
+
+	return stripeMetadata;
+}
+
+
 /*
  * DeleteMetadataRows removes the rows with given relfilenode from columnar
  * metadata tables.
@@ -970,28 +984,20 @@ DeleteMetadataRows(RelFileNode relfilenode)
 		return;
 	}
 
-	ColumnarMetapage *metapage = ReadMetapage(relfilenode, true);
-	if (metapage == NULL)
-	{
-		/*
-		 * No data has been written to this storage yet, so there is no
-		 * associated metadata yet.
-		 */
-		return;
-	}
+	uint64 storageId = LookupStorageId(relfilenode);
 
 	DeleteStorageFromColumnarMetadataTable(ColumnarStripeRelationId(),
 	                                       Anum_columnar_stripe_storageid,
-	                                       ColumnarStripeIndexRelationId(),
-	                                       metapage->storageId);
+	                                       ColumnarStripePKeyIndexRelationId(),
+	                                       storageId);
 	DeleteStorageFromColumnarMetadataTable(ColumnarChunkGroupRelationId(),
 	                                       Anum_columnar_chunkgroup_storageid,
 	                                       ColumnarChunkGroupIndexRelationId(),
-	                                       metapage->storageId);
+	                                       storageId);
 	DeleteStorageFromColumnarMetadataTable(ColumnarChunkRelationId(),
 	                                       Anum_columnar_chunk_storageid,
 	                                       ColumnarChunkIndexRelationId(),
-	                                       metapage->storageId);
+	                                       storageId);
 }
@@ -1226,16 +1232,28 @@ ColumnarStripeRelationId(void)
 /*
- * ColumnarStripeIndexRelationId returns relation id of columnar.stripe_pkey.
+ * ColumnarStripePKeyIndexRelationId returns relation id of columnar.stripe_pkey.
  * TODO: should we cache this similar to citus?
  */
 static Oid
-ColumnarStripeIndexRelationId(void)
+ColumnarStripePKeyIndexRelationId(void)
 {
 	return get_relname_relid("stripe_pkey", ColumnarNamespaceId());
 }
 
 
+/*
+ * ColumnarStripeFirstRowNumberIndexRelationId returns relation id of
+ * columnar.stripe_first_row_number_idx.
+ * TODO: should we cache this similar to citus?
+ */
+static Oid
+ColumnarStripeFirstRowNumberIndexRelationId(void)
+{
+	return get_relname_relid("stripe_first_row_number_idx", ColumnarNamespaceId());
+}
+
+
 /*
  * ColumnarOptionsRelationId returns relation id of columnar.options.
  */
@@ -1312,75 +1330,31 @@ ColumnarNamespaceId(void)
 /*
- * ReadMetapage reads metapage for the given relfilenode. It returns
- * false if the relation doesn't have a meta page yet.
+ * LookupStorageId reads the storage metapage to find the storage ID for
+ * the given relfilenode. It errors out if the relation doesn't have a
+ * metapage yet.
  */
-static ColumnarMetapage *
-ReadMetapage(RelFileNode relfilenode, bool missingOk)
+static uint64
+LookupStorageId(RelFileNode relfilenode)
 {
-	StringInfo metapageBuffer = NULL;
 	Oid relationId = RelidByRelfilenode(relfilenode.spcNode,
 	                                    relfilenode.relNode);
-	if (OidIsValid(relationId))
-	{
-		Relation relation = relation_open(relationId, NoLock);
-
-		RelationOpenSmgr(relation);
-		int nblocks = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
-		RelationCloseSmgr(relation);
-
-		if (nblocks != 0)
-		{
-			metapageBuffer = ReadFromSmgr(relation, 0, sizeof(ColumnarMetapage));
-		}
-
-		relation_close(relation, NoLock);
-	}
-
-	if (metapageBuffer == NULL)
-	{
-		if (!missingOk)
-		{
-			elog(ERROR, "columnar metapage was not found");
-		}
-
-		return NULL;
-	}
-
-	ColumnarMetapage *metapage = palloc0(sizeof(ColumnarMetapage));
-	memcpy_s((void *) metapage, sizeof(ColumnarMetapage),
-			 metapageBuffer->data, sizeof(ColumnarMetapage));
-
-	return metapage;
+	Relation relation = relation_open(relationId, AccessShareLock);
+	uint64 storageId = ColumnarStorageGetStorageId(relation, false);
+	table_close(relation, AccessShareLock);
+
+	return storageId;
 }
 /*
- * InitMetapage initializes metapage for the given relation.
+ * ColumnarMetadataNewStorageId - create a new, unique storage id and return
+ * it.
  */
-static ColumnarMetapage *
-InitMetapage(Relation relation)
+uint64
+ColumnarMetadataNewStorageId()
 {
-	/*
-	 * If we init metapage during upgrade, we might override the
-	 * pre-upgrade storage id which will render pre-upgrade data
-	 * invisible.
-	 */
-	Assert(!IsBinaryUpgrade);
-
-	ColumnarMetapage *metapage = palloc0(sizeof(ColumnarMetapage));
-	metapage->storageId = nextval_internal(ColumnarStorageIdSequenceRelationId(), false);
-	metapage->versionMajor = COLUMNAR_VERSION_MAJOR;
-	metapage->versionMinor = COLUMNAR_VERSION_MINOR;
-
-	/* create the first block */
-	Buffer newBuffer = ReadBuffer(relation, P_NEW);
-	ReleaseBuffer(newBuffer);
-
-	Assert(sizeof(ColumnarMetapage) <= BLCKSZ - SizeOfPageHeaderData);
-	WriteToSmgr(relation, 0, (char *) metapage, sizeof(ColumnarMetapage));
-
-	return metapage;
+	return nextval_internal(ColumnarStorageIdSequenceRelationId(), false);
 }
@@ -1391,20 +1365,85 @@ InitMetapage(Relation relation)
 Datum
 columnar_relation_storageid(PG_FUNCTION_ARGS)
 {
-	uint64 storageId = -1;
-
 	Oid relationId = PG_GETARG_OID(0);
 	Relation relation = relation_open(relationId, AccessShareLock);
-	if (IsColumnarTableAmTable(relationId))
+	if (!IsColumnarTableAmTable(relationId))
 	{
-		ColumnarMetapage *metadata = ReadMetapage(relation->rd_node, true);
-		if (metadata != NULL)
-		{
-			storageId = metadata->storageId;
-		}
+		elog(ERROR, "relation \"%s\" is not a columnar table",
+			 RelationGetRelationName(relation));
 	}
 
+	uint64 storageId = ColumnarStorageGetStorageId(relation, false);
+
 	relation_close(relation, AccessShareLock);
 	PG_RETURN_INT64(storageId);
 }
+
+
+/*
+ * ColumnarStorageUpdateIfNeeded - upgrade columnar storage to the current
+ * version by using information from the metadata tables.
+ */
+void
+ColumnarStorageUpdateIfNeeded(Relation rel, bool isUpgrade)
+{
+	if (ColumnarStorageIsCurrent(rel))
+	{
+		return;
+	}
+
+	RelationOpenSmgr(rel);
+	BlockNumber nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
+	if (nblocks < 2)
+	{
+		ColumnarStorageInit(rel->rd_smgr, ColumnarMetadataNewStorageId());
+		return;
+	}
+
+	uint64 storageId = ColumnarStorageGetStorageId(rel, true);
+
+	uint64 highestId;
+	uint64 highestOffset;
+	GetHighestUsedAddressAndId(storageId, &highestOffset, &highestId);
+
+	uint64 reservedStripeId = highestId + 1;
+	uint64 reservedOffset = highestOffset + 1;
+	uint64 reservedRowNumber = GetHighestUsedFirstRowNumber(storageId) + 1;
+	ColumnarStorageUpdateCurrent(rel, isUpgrade, reservedStripeId,
+								 reservedRowNumber, reservedOffset);
+}
+
+
+/*
+ * GetHighestUsedFirstRowNumber returns the highest used first_row_number
+ * for given storageId. Returns COLUMNAR_INVALID_ROW_NUMBER if storage with
+ * storageId has no stripes.
+ *
+ * Note that normally we would use ColumnarStorageGetReservedRowNumber
+ * to decide that. However, this function is designed to be used when
+ * building the metapage itself during upgrades.
+ */
+static uint64
+GetHighestUsedFirstRowNumber(uint64 storageId)
+{
+	List *stripeMetadataList = ReadDataFileStripeList(storageId,
+													  GetTransactionSnapshot());
+	if (list_length(stripeMetadataList) == 0)
+	{
+		return COLUMNAR_INVALID_ROW_NUMBER;
+	}
+
+	/* XXX: Better to have an invalid value for StripeMetadata.rowCount too */
+	uint64 stripeRowCount = -1;
+	uint64 highestFirstRowNumber = COLUMNAR_INVALID_ROW_NUMBER;
+
+	StripeMetadata *stripeMetadata = NULL;
+	foreach_ptr(stripeMetadata, stripeMetadataList)
+	{
+		highestFirstRowNumber = Max(highestFirstRowNumber,
+									stripeMetadata->firstRowNumber);
+		stripeRowCount = stripeMetadata->rowCount;
+	}
+
+	return highestFirstRowNumber + stripeRowCount - 1;
+}
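The backward scan in FindStripeByRowNumber above is roughly equivalent to the following SQL, shown here for intuition only (the C code scans the index directly via systable_beginscan_ordered, and the column names are the ones implied by the Anum_columnar_stripe_* constants):

    SELECT * FROM columnar.stripe
    WHERE storage_id = $1 AND first_row_number <= $2
    ORDER BY first_row_number DESC
    LIMIT 1;
    -- the caller then double-checks that $2 < first_row_number + row_count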

View File: src/backend/columnar/columnar_reader.c

@@ -34,6 +34,8 @@
 #include "utils/rel.h"
 
 #include "columnar/columnar.h"
+#include "columnar/columnar_storage.h"
+#include "columnar/columnar_tableam.h"
 #include "columnar/columnar_version_compat.h"
 
 typedef struct ChunkGroupReadState
@@ -84,6 +86,14 @@ struct ColumnarReadState
 
 /* static function declarations */
 static MemoryContext CreateStripeReadMemoryContext(void);
+static void ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
+                                     StripeMetadata *stripeMetadata,
+                                     uint64 rowNumber, Datum *columnValues,
+                                     bool *columnNulls);
+static void ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState,
+                                         StripeMetadata *stripeMetadata,
+                                         uint64 stripeRowOffset, Datum *columnValues,
+                                         bool *columnNulls);
 static bool StripeReadInProgress(ColumnarReadState *readState);
 static bool HasUnreadStripe(ColumnarReadState *readState);
 static StripeReadState * BeginStripeRead(StripeMetadata *stripeMetadata, Relation rel,
@@ -194,11 +204,12 @@ CreateStripeReadMemoryContext()
 
 /*
  * ColumnarReadNextRow tries to read a row from the columnar table. On success, it sets
- * column values and nulls, and returns true. If there are no more rows to read,
- * the function returns false.
+ * column values, column nulls and rowNumber (if passed to be non-NULL), and returns true.
+ * If there are no more rows to read, the function returns false.
  */
 bool
-ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *columnNulls)
+ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *columnNulls,
+                    uint64 *rowNumber)
 {
 	while (true)
 	{
@@ -226,6 +237,14 @@ ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *col
 			continue;
 		}
 
+		if (rowNumber)
+		{
+			StripeMetadata *stripeMetadata = list_nth(readState->stripeList,
+													  readState->currentStripe);
+			*rowNumber = stripeMetadata->firstRowNumber +
+						 readState->stripeReadState->currentRow - 1;
+		}
+
 		return true;
 	}
@@ -233,6 +252,104 @@ ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *col
 }
 
 
+/*
+ * ColumnarReadRowByRowNumber reads row with rowNumber from given relation
+ * into columnValues and columnNulls, and returns true. If no such row
+ * exists, then returns false.
+ */
+bool
+ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
+                           List *neededColumnList, Datum *columnValues,
+                           bool *columnNulls, Snapshot snapshot)
+{
+	StripeMetadata *stripeMetadata = FindStripeByRowNumber(relation, rowNumber, snapshot);
+	if (stripeMetadata == NULL)
+	{
+		/* no such row exists */
+		return false;
+	}
+
+	TupleDesc relationTupleDesc = RelationGetDescr(relation);
+	List *whereClauseList = NIL;
+	List *whereClauseVars = NIL;
+	MemoryContext stripeReadContext = CreateStripeReadMemoryContext();
+	StripeReadState *stripeReadState = BeginStripeRead(stripeMetadata,
+	                                                   relation,
+	                                                   relationTupleDesc,
+	                                                   neededColumnList,
+	                                                   whereClauseList,
+	                                                   whereClauseVars,
+	                                                   stripeReadContext);
+
+	ReadStripeRowByRowNumber(stripeReadState, stripeMetadata, rowNumber,
+	                         columnValues, columnNulls);
+
+	EndStripeRead(stripeReadState);
+	MemoryContextReset(stripeReadContext);
+
+	return true;
+}
+
+
+/*
+ * ReadStripeRowByRowNumber reads row with rowNumber from given
+ * stripeReadState into columnValues and columnNulls.
+ * Errors out if no such row exists in the stripe being read.
+ */
+static void
+ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
+                         StripeMetadata *stripeMetadata,
+                         uint64 rowNumber, Datum *columnValues,
+                         bool *columnNulls)
+{
+	if (rowNumber < stripeMetadata->firstRowNumber)
+	{
+		/* not expected but be on the safe side */
+		ereport(ERROR, (errmsg("row offset cannot be negative")));
+	}
+
+	/* find the exact chunk group to be read */
+	uint64 stripeRowOffset = rowNumber - stripeMetadata->firstRowNumber;
+	stripeReadState->chunkGroupIndex = stripeRowOffset /
+	                                   stripeMetadata->chunkGroupRowCount;
+	stripeReadState->chunkGroupReadState = BeginChunkGroupRead(
+		stripeReadState->stripeBuffers,
+		stripeReadState->chunkGroupIndex,
+		stripeReadState->tupleDescriptor,
+		stripeReadState->projectedColumnList,
+		stripeReadState->stripeReadContext);
+
+	ReadChunkGroupRowByRowOffset(stripeReadState->chunkGroupReadState,
+	                             stripeMetadata, stripeRowOffset,
+	                             columnValues, columnNulls);
+
+	EndChunkGroupRead(stripeReadState->chunkGroupReadState);
+	stripeReadState->chunkGroupReadState = NULL;
+}
+
+
+/*
+ * ReadChunkGroupRowByRowOffset reads row with stripeRowOffset from given
+ * chunkGroupReadState into columnValues and columnNulls.
+ * Errors out if no such row exists in the chunk group being read.
+ */
+static void
+ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState,
+                             StripeMetadata *stripeMetadata,
+                             uint64 stripeRowOffset, Datum *columnValues,
+                             bool *columnNulls)
+{
+	/* set the exact row number to be read from given chunk group */
+	chunkGroupReadState->currentRow = stripeRowOffset %
+	                                  stripeMetadata->chunkGroupRowCount;
+	if (!ReadChunkGroupNextRow(chunkGroupReadState, columnValues, columnNulls))
+	{
+		/* not expected but be on the safe side */
+		ereport(ERROR, (errmsg("could not find the row in stripe")));
+	}
+}
+
+
 /*
  * StripeReadInProgress returns true if we already started reading a stripe.
  */
@@ -667,8 +784,12 @@ LoadColumnBuffers(Relation relation, ColumnChunkSkipNode *chunkSkipNodeArray,
 	{
 		ColumnChunkSkipNode *chunkSkipNode = &chunkSkipNodeArray[chunkIndex];
 		uint64 existsOffset = stripeOffset + chunkSkipNode->existsChunkOffset;
-		StringInfo rawExistsBuffer = ReadFromSmgr(relation, existsOffset,
-												  chunkSkipNode->existsLength);
+		StringInfo rawExistsBuffer = makeStringInfo();
+
+		enlargeStringInfo(rawExistsBuffer, chunkSkipNode->existsLength);
+		rawExistsBuffer->len = chunkSkipNode->existsLength;
+		ColumnarStorageRead(relation, existsOffset, rawExistsBuffer->data,
+							chunkSkipNode->existsLength);
 
 		chunkBuffersArray[chunkIndex]->existsBuffer = rawExistsBuffer;
 	}
@@ -679,8 +800,12 @@ LoadColumnBuffers(Relation relation, ColumnChunkSkipNode *chunkSkipNodeArray,
 		ColumnChunkSkipNode *chunkSkipNode = &chunkSkipNodeArray[chunkIndex];
 		CompressionType compressionType = chunkSkipNode->valueCompressionType;
 		uint64 valueOffset = stripeOffset + chunkSkipNode->valueChunkOffset;
-		StringInfo rawValueBuffer = ReadFromSmgr(relation, valueOffset,
-												 chunkSkipNode->valueLength);
+		StringInfo rawValueBuffer = makeStringInfo();
+
+		enlargeStringInfo(rawValueBuffer, chunkSkipNode->valueLength);
+		rawValueBuffer->len = chunkSkipNode->valueLength;
+		ColumnarStorageRead(relation, valueOffset, rawValueBuffer->data,
+							chunkSkipNode->valueLength);
 
 		chunkBuffersArray[chunkIndex]->valueBuffer = rawValueBuffer;
 		chunkBuffersArray[chunkIndex]->valueCompressionType = compressionType;
@@ -1269,30 +1394,3 @@ ColumnDefaultValue(TupleConstr *tupleConstraints, Form_pg_attribute attributeFor
 										   "does not evaluate to constant value")));
 	}
 }
-
-
-StringInfo
-ReadFromSmgr(Relation rel, uint64 offset, uint32 size)
-{
-	StringInfo resultBuffer = makeStringInfo();
-	uint64 read = 0;
-
-	enlargeStringInfo(resultBuffer, size);
-	resultBuffer->len = size;
-
-	while (read < size)
-	{
-		SmgrAddr addr = logical_to_smgr(offset + read);
-
-		Buffer buffer = ReadBuffer(rel, addr.blockno);
-		Page page = BufferGetPage(buffer);
-		PageHeader phdr = (PageHeader) page;
-
-		uint32 to_read = Min(size - read, phdr->pd_upper - addr.offset);
-		memcpy_s(resultBuffer->data + read, size - read, page + addr.offset, to_read);
-		ReleaseBuffer(buffer);
-		read += to_read;
-	}
-
-	return resultBuffer;
-}
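With the new rowNumber out-parameter, a caller loop looks roughly like this (a sketch; readState, columnValues and columnNulls come from the reader's existing begin-read API):

    uint64 rowNumber = 0;
    while (ColumnarReadNextRow(readState, columnValues, columnNulls, &rowNumber))
    {
        /*
         * rowNumber addresses the row across the whole relation
         * (stripe firstRowNumber + offset within the stripe), which is
         * what the index support needs to map TID-style identifiers to rows.
         */
    }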

View File: src/backend/columnar/columnar_storage.c (new file)

@@ -0,0 +1,809 @@
/*-------------------------------------------------------------------------
*
* columnar_storage.c
*
* Copyright (c) Citus Data, Inc.
*
* Low-level storage layer for columnar.
* - Translates columnar read/write operations on logical offsets into operations on pages/blocks.
* - Emits WAL.
* - Reads/writes the columnar metapage.
* - Reserves data offsets, stripe numbers, and row offsets.
* - Truncation.
*
* Higher-level columnar operations deal with logical offsets and large
* contiguous buffers of data that need to be stored. But the buffer manager
* and WAL depend on formatted pages with headers, so these large buffers need
* to be written across many pages. This module translates the contiguous
* buffers into individual block reads/writes, and performs WAL when
* necessary.
*
* Storage layout: a metapage in block 0, followed by an empty page in block
* 1, followed by logical data starting at the first byte after the page
* header in block 2 (having logical offset ColumnarFirstLogicalOffset). (XXX:
* Block 1 is left empty for no particular reason. Reconsider?). A columnar
* table should always have at least 2 blocks.
*
* Reservation is done with a relation extension lock, and designed for
* concurrency, so the callers only need an ordinary lock on the
* relation. Initializing the metapage or truncating the relation require that
* the caller holds an AccessExclusiveLock. (XXX: New reservations of data are
* aligned onto a new page for no particular reason. Reconsider?).
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "safe_lib.h"
#include "catalog/storage.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "columnar/columnar.h"
#include "columnar/columnar_storage.h"
/*
* Content of the first page in main fork, which stores metadata at file
* level.
*/
typedef struct ColumnarMetapage
{
/*
* Store version of file format used, so we can detect files from
* previous versions if we change file format.
*/
uint32 versionMajor;
uint32 versionMinor;
/*
* Each of the metadata table rows are identified by a storageId.
* We store it also in the main fork so we can link metadata rows
* with data files.
*/
uint64 storageId;
uint64 reservedStripeId; /* first unused stripe id */
uint64 reservedRowNumber; /* first unused row number */
uint64 reservedOffset; /* first unused byte offset */
} ColumnarMetapage;
/* represents a "physical" block+offset address */
typedef struct PhysicalAddr
{
BlockNumber blockno;
uint32 offset;
} PhysicalAddr;
#define COLUMNAR_METAPAGE_BLOCKNO 0
#define COLUMNAR_EMPTY_BLOCKNO 1
#define COLUMNAR_INVALID_STRIPE_ID 0
#define COLUMNAR_FIRST_STRIPE_ID 1
#define OLD_METAPAGE_VERSION_HINT "Use \"VACUUM\" to upgrade the columnar table format " \
"version or run \"ALTER EXTENSION citus UPDATE\"."
/*
* Map logical offsets to a physical page and offset where the data is kept.
*/
static inline PhysicalAddr
LogicalToPhysical(uint64 logicalOffset)
{
PhysicalAddr addr;
addr.blockno = logicalOffset / COLUMNAR_BYTES_PER_PAGE;
addr.offset = SizeOfPageHeaderData + (logicalOffset % COLUMNAR_BYTES_PER_PAGE);
return addr;
}
/*
* Map a physical page and offset address to a logical address.
*/
static inline uint64
PhysicalToLogical(PhysicalAddr addr)
{
return COLUMNAR_BYTES_PER_PAGE * addr.blockno + addr.offset - SizeOfPageHeaderData;
}
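/*
 * Worked example of the mapping above, assuming the default 8 KB BLCKSZ
 * and the standard 24-byte page header (so COLUMNAR_BYTES_PER_PAGE is
 * presumably 8192 - 24 = 8168):
 *
 *   LogicalToPhysical(16336) = { blockno = 2, offset = 24 }
 *   PhysicalToLogical({ 2, 24 }) = 8168 * 2 + 24 - 24 = 16336
 *
 * i.e. logical offset 16336 is the first usable byte of block 2, matching
 * the storage layout described in the file header
 * (ColumnarFirstLogicalOffset).
 */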
static void ColumnarOverwriteMetapage(Relation relation,
ColumnarMetapage columnarMetapage);
static ColumnarMetapage ColumnarMetapageRead(Relation rel, bool force);
static void ReadFromBlock(Relation rel, BlockNumber blockno, uint32 offset,
char *buf, uint32 len, bool force);
static void WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset,
char *buf, uint32 len, bool clear);
static uint64 AlignReservation(uint64 prevReservation);
static bool ColumnarMetapageIsCurrent(ColumnarMetapage *metapage);
static bool ColumnarMetapageIsOlder(ColumnarMetapage *metapage);
static bool ColumnarMetapageIsNewer(ColumnarMetapage *metapage);
static void ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage);
/*
* ColumnarStorageInit - initialize a new metapage in an empty relation
* with the given storageId.
*
* Caller must hold AccessExclusiveLock on the relation.
*/
void
ColumnarStorageInit(SMgrRelation srel, uint64 storageId)
{
BlockNumber nblocks = smgrnblocks(srel, MAIN_FORKNUM);
if (nblocks > 0)
{
elog(ERROR,
"attempted to initialize metapage, but %d pages already exist",
nblocks);
}
/* create two pages */
PGAlignedBlock block;
Page page = block.data;
/* write metapage */
PageInit(page, BLCKSZ, 0);
PageHeader phdr = (PageHeader) page;
ColumnarMetapage metapage = { 0 };
metapage.storageId = storageId;
metapage.versionMajor = COLUMNAR_VERSION_MAJOR;
metapage.versionMinor = COLUMNAR_VERSION_MINOR;
metapage.reservedStripeId = COLUMNAR_FIRST_STRIPE_ID;
metapage.reservedRowNumber = COLUMNAR_FIRST_ROW_NUMBER;
metapage.reservedOffset = ColumnarFirstLogicalOffset;
memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower,
(char *) &metapage, sizeof(ColumnarMetapage));
phdr->pd_lower += sizeof(ColumnarMetapage);
PageSetChecksumInplace(page, COLUMNAR_METAPAGE_BLOCKNO);
smgrwrite(srel, MAIN_FORKNUM, COLUMNAR_METAPAGE_BLOCKNO, page, true);
log_newpage(&srel->smgr_rnode.node, MAIN_FORKNUM,
COLUMNAR_METAPAGE_BLOCKNO, page, true);
/* write empty page */
PageInit(page, BLCKSZ, 0);
PageSetChecksumInplace(page, COLUMNAR_EMPTY_BLOCKNO);
smgrwrite(srel, MAIN_FORKNUM, COLUMNAR_EMPTY_BLOCKNO, page, true);
log_newpage(&srel->smgr_rnode.node, MAIN_FORKNUM,
COLUMNAR_EMPTY_BLOCKNO, page, true);
/*
* An immediate sync is required even if we xlog'd the page, because the
* write did not go through shared_buffers and therefore a concurrent
* checkpoint may have moved the redo pointer past our xlog record.
*/
smgrimmedsync(srel, MAIN_FORKNUM);
}
/*
* ColumnarStorageUpdateCurrent - update the metapage to the current
* version. No effect if the version already matches. If 'upgrade' is true,
* throw an error if metapage version is newer; if 'upgrade' is false, it's a
* downgrade, so throw an error if the metapage version is older.
*
* NB: caller must ensure that metapage already exists, which might not be the
* case on 10.0.
*/
void
ColumnarStorageUpdateCurrent(Relation rel, bool upgrade, uint64 reservedStripeId,
uint64 reservedRowNumber, uint64 reservedOffset)
{
LockRelationForExtension(rel, ExclusiveLock);
ColumnarMetapage metapage = ColumnarMetapageRead(rel, true);
if (ColumnarMetapageIsCurrent(&metapage))
{
/* nothing to do */
return;
}
if (upgrade && ColumnarMetapageIsNewer(&metapage))
{
elog(ERROR, "found newer columnar metapage while upgrading");
}
if (!upgrade && ColumnarMetapageIsOlder(&metapage))
{
elog(ERROR, "found older columnar metapage while downgrading");
}
metapage.versionMajor = COLUMNAR_VERSION_MAJOR;
metapage.versionMinor = COLUMNAR_VERSION_MINOR;
/* storageId remains the same */
metapage.reservedStripeId = reservedStripeId;
metapage.reservedRowNumber = reservedRowNumber;
metapage.reservedOffset = reservedOffset;
ColumnarOverwriteMetapage(rel, metapage);
UnlockRelationForExtension(rel, ExclusiveLock);
}
/*
* ColumnarStorageGetVersionMajor - return major version from the metapage.
*
* Throw an error if the metapage is not the current version, unless
* 'force' is true.
*/
uint64
ColumnarStorageGetVersionMajor(Relation rel, bool force)
{
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
return metapage.versionMajor;
}
/*
* ColumnarStorageGetVersionMinor - return minor version from the metapage.
*
* Throw an error if the metapage is not the current version, unless
* 'force' is true.
*/
uint64
ColumnarStorageGetVersionMinor(Relation rel, bool force)
{
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
return metapage.versionMinor;
}
/*
* ColumnarStorageGetStorageId - return storage ID from the metapage.
*
* Throw an error if the metapage is not the current version, unless
* 'force' is true.
*/
uint64
ColumnarStorageGetStorageId(Relation rel, bool force)
{
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
return metapage.storageId;
}
/*
* ColumnarStorageGetReservedStripeId - return reserved stripe ID from the
* metapage.
*
* Throw an error if the metapage is not the current version, unless
* 'force' is true.
*/
uint64
ColumnarStorageGetReservedStripeId(Relation rel, bool force)
{
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
return metapage.reservedStripeId;
}
/*
* ColumnarStorageGetReservedRowNumber - return reserved row number from the
* metapage.
*
* Throw an error if the metapage is not the current version, unless
* 'force' is true.
*/
uint64
ColumnarStorageGetReservedRowNumber(Relation rel, bool force)
{
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
return metapage.reservedRowNumber;
}
/*
* ColumnarStorageGetReservedOffset - return reserved offset from the metapage.
*
* Throw an error if the metapage is not the current version, unless
* 'force' is true.
*/
uint64
ColumnarStorageGetReservedOffset(Relation rel, bool force)
{
ColumnarMetapage metapage = ColumnarMetapageRead(rel, force);
return metapage.reservedOffset;
}
/*
 * ColumnarStorageIsCurrent - return true if metapage exists and is
 * the current version.
*/
bool
ColumnarStorageIsCurrent(Relation rel)
{
RelationOpenSmgr(rel);
BlockNumber nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
if (nblocks < 2)
{
return false;
}
ColumnarMetapage metapage = ColumnarMetapageRead(rel, true);
return ColumnarMetapageIsCurrent(&metapage);
}
/*
* ColumnarStorageReserveRowNumber returns reservedRowNumber and advances
* it for next row number reservation.
*/
uint64
ColumnarStorageReserveRowNumber(Relation rel, uint64 nrows)
{
LockRelationForExtension(rel, ExclusiveLock);
ColumnarMetapage metapage = ColumnarMetapageRead(rel, false);
uint64 firstRowNumber = metapage.reservedRowNumber;
metapage.reservedRowNumber += nrows;
ColumnarOverwriteMetapage(rel, metapage);
UnlockRelationForExtension(rel, ExclusiveLock);
return firstRowNumber;
}
/*
* ColumnarStorageReserveStripe returns stripeId and advances it for next
* stripeId reservation.
* Note that this function doesn't handle row number reservation.
* This is because, unlike stripeId reservation, we immediately reserve
* row number during writes, not when flushing stripes to disk.
* See ColumnarStorageReserveRowNumber function.
*/
uint64
ColumnarStorageReserveStripe(Relation rel)
{
LockRelationForExtension(rel, ExclusiveLock);
ColumnarMetapage metapage = ColumnarMetapageRead(rel, false);
uint64 stripeId = metapage.reservedStripeId;
metapage.reservedStripeId++;
ColumnarOverwriteMetapage(rel, metapage);
UnlockRelationForExtension(rel, ExclusiveLock);
return stripeId;
}
/*
* ColumnarStorageReserveData - reserve logical data offsets for writing.
*/
uint64
ColumnarStorageReserveData(Relation rel, uint64 amount)
{
if (amount == 0)
{
return ColumnarInvalidLogicalOffset;
}
LockRelationForExtension(rel, ExclusiveLock);
ColumnarMetapage metapage = ColumnarMetapageRead(rel, false);
uint64 alignedReservation = AlignReservation(metapage.reservedOffset);
uint64 nextReservation = alignedReservation + amount;
metapage.reservedOffset = nextReservation;
/* write new reservation */
ColumnarOverwriteMetapage(rel, metapage);
/* last used PhysicalAddr of new reservation */
PhysicalAddr final = LogicalToPhysical(nextReservation - 1);
/* extend with new pages */
RelationOpenSmgr(rel);
BlockNumber nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
while (nblocks <= final.blockno)
{
Buffer newBuffer = ReadBuffer(rel, P_NEW);
Assert(BufferGetBlockNumber(newBuffer) == nblocks);
ReleaseBuffer(newBuffer);
nblocks++;
}
UnlockRelationForExtension(rel, ExclusiveLock);
return alignedReservation;
}
/*
* ColumnarStorageRead - map the logical offset to a block and offset, then
* read the buffer from multiple blocks if necessary.
*/
void
ColumnarStorageRead(Relation rel, uint64 logicalOffset, char *data, uint32 amount)
{
/* if there's no work to do, succeed even with invalid offset */
if (amount == 0)
{
return;
}
if (!ColumnarLogicalOffsetIsValid(logicalOffset))
{
elog(ERROR,
"attempted columnar read on relation %d from invalid logical offset: "
UINT64_FORMAT,
rel->rd_id, logicalOffset);
}
uint64 read = 0;
while (read < amount)
{
PhysicalAddr addr = LogicalToPhysical(logicalOffset + read);
uint32 to_read = Min(amount - read, BLCKSZ - addr.offset);
ReadFromBlock(rel, addr.blockno, addr.offset, data + read, to_read,
false);
read += to_read;
}
}
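/*
 * A sketch of the logical-to-physical mapping that the read/write loops
 * above rely on; the real LogicalToPhysical is defined earlier in this
 * file. This assumes each page stores BLCKSZ - SizeOfPageHeaderData
 * usable bytes, consistent with ColumnarFirstLogicalOffset in
 * columnar_storage.h.
 */
static inline PhysicalAddr
LogicalToPhysicalSketch(uint64 logicalOffset)
{
	uint64 usableBytesPerPage = BLCKSZ - SizeOfPageHeaderData;

	PhysicalAddr addr = {
		.blockno = logicalOffset / usableBytesPerPage,
		.offset = SizeOfPageHeaderData + (logicalOffset % usableBytesPerPage)
	};

	return addr;
}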
/*
* ColumnarStorageWrite - map the logical offset to a block and offset, then
* write the buffer across multiple blocks if necessary.
*/
void
ColumnarStorageWrite(Relation rel, uint64 logicalOffset, char *data, uint32 amount)
{
/* if there's no work to do, succeed even with invalid offset */
if (amount == 0)
{
return;
}
if (!ColumnarLogicalOffsetIsValid(logicalOffset))
{
elog(ERROR,
"attempted columnar write on relation %d to invalid logical offset: "
UINT64_FORMAT,
rel->rd_id, logicalOffset);
}
uint64 written = 0;
while (written < amount)
{
PhysicalAddr addr = LogicalToPhysical(logicalOffset + written);
uint64 to_write = Min(amount - written, BLCKSZ - addr.offset);
WriteToBlock(rel, addr.blockno, addr.offset, data + written, to_write,
false);
written += to_write;
}
}
/*
* ColumnarStorageTruncate - truncate the columnar storage such that
* newDataReservation will be the first unused logical offset available. Free
* pages at the end of the relation.
*
* Caller must hold AccessExclusiveLock on the relation.
*
* Returns true if pages were truncated; false otherwise.
*/
bool
ColumnarStorageTruncate(Relation rel, uint64 newDataReservation)
{
if (!ColumnarLogicalOffsetIsValid(newDataReservation))
{
elog(ERROR,
"attempted to truncate relation %d to invalid logical offset: " UINT64_FORMAT,
rel->rd_id, newDataReservation);
}
RelationOpenSmgr(rel);
BlockNumber old_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
if (old_rel_pages == 0)
{
/* nothing to do */
return false;
}
LockRelationForExtension(rel, ExclusiveLock);
ColumnarMetapage metapage = ColumnarMetapageRead(rel, false);
if (metapage.reservedOffset < newDataReservation)
{
elog(ERROR,
"attempted to truncate relation %d to offset " UINT64_FORMAT \
" which is higher than existing offset " UINT64_FORMAT,
rel->rd_id, newDataReservation, metapage.reservedOffset);
}
if (metapage.reservedOffset == newDataReservation)
{
/* nothing to do */
UnlockRelationForExtension(rel, ExclusiveLock);
return false;
}
metapage.reservedOffset = newDataReservation;
/* write new reservation */
ColumnarOverwriteMetapage(rel, metapage);
UnlockRelationForExtension(rel, ExclusiveLock);
PhysicalAddr final = LogicalToPhysical(newDataReservation - 1);
BlockNumber new_rel_pages = final.blockno + 1;
Assert(new_rel_pages <= old_rel_pages);
/*
* Truncate the storage. Note that RelationTruncate() takes care of
* Write Ahead Logging.
*/
if (new_rel_pages < old_rel_pages)
{
RelationTruncate(rel, new_rel_pages);
return true;
}
return false;
}
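/*
 * A condensed usage sketch of ColumnarStorageTruncate, mirroring the
 * VACUUM truncation path later in this patch: derive newDataReservation
 * from the highest used logical address, never going below the first
 * valid data offset. TruncateToHighestUsedAddress is an illustrative
 * helper; the caller must hold AccessExclusiveLock, as noted above.
 */
static void
TruncateToHighestUsedAddress(Relation rel)
{
	uint64 newDataReservation = Max(GetHighestUsedAddress(rel->rd_node) + 1,
									ColumnarFirstLogicalOffset);

	/* returns true only if trailing pages were actually freed */
	(void) ColumnarStorageTruncate(rel, newDataReservation);
}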
/*
* ColumnarOverwriteMetapage writes given columnarMetapage back to metapage
* for given relation.
*/
static void
ColumnarOverwriteMetapage(Relation relation, ColumnarMetapage columnarMetapage)
{
/* clear metapage because we are overwriting */
bool clear = true;
WriteToBlock(relation, COLUMNAR_METAPAGE_BLOCKNO, SizeOfPageHeaderData,
(char *) &columnarMetapage, sizeof(ColumnarMetapage), clear);
}
/*
* ColumnarMetapageRead - read the current contents of the metapage. Error if
* it does not exist. Throw an error if the metapage is not the current
* version, unless 'force' is true.
*
* NB: it's safe to read a different version of a metapage because we
* guarantee that fields will only be added and existing fields will never be
* changed. However, it's important that we don't depend on new fields being
* set properly when we read an old metapage; an old metapage should only be
* read for the purposes of upgrading or error checking.
*/
static ColumnarMetapage
ColumnarMetapageRead(Relation rel, bool force)
{
RelationOpenSmgr(rel);
BlockNumber nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
if (nblocks == 0)
{
/*
 * We only expect this to happen when upgrading citus.so. This is because,
 * in the current version of columnar, we create the metapage immediately,
 * i.e., right after creating the table.
 * However, in older versions, we created metapages lazily, i.e.,
 * when ingesting data into a columnar table.
*/
ereport(ERROR, (errmsg("columnar metapage for relation \"%s\" does not exist",
RelationGetRelationName(rel)),
errhint(OLD_METAPAGE_VERSION_HINT)));
}
/*
 * Regardless of the "force" parameter, always force-read the metapage
 * block. The metapage version is then checked in
 * ColumnarMetapageCheckVersion depending on "force".
*/
bool forceReadBlock = true;
ColumnarMetapage metapage;
ReadFromBlock(rel, COLUMNAR_METAPAGE_BLOCKNO, SizeOfPageHeaderData,
(char *) &metapage, sizeof(ColumnarMetapage), forceReadBlock);
if (!force)
{
ColumnarMetapageCheckVersion(rel, &metapage);
}
return metapage;
}
/*
* ReadFromBlock - read bytes from a page at the given offset. If 'force' is
* true, don't check pd_lower; useful when reading a metapage of unknown
* version.
*/
static void
ReadFromBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf,
uint32 len, bool force)
{
Buffer buffer = ReadBuffer(rel, blockno);
Page page = BufferGetPage(buffer);
PageHeader phdr = (PageHeader) page;
if (BLCKSZ < offset + len || (!force && (phdr->pd_lower < offset + len)))
{
elog(ERROR,
"attempt to read columnar data of length %d from offset %d of block %d of relation %d",
len, offset, blockno, rel->rd_id);
}
memcpy_s(buf, len, page + offset, len);
ReleaseBuffer(buffer);
}
/*
* WriteToBlock - append data to a block, initializing if necessary, and emit
* WAL. If 'clear' is true, always clear the data on the page and reinitialize
* it first, and offset must be SizeOfPageHeaderData. Otherwise, offset must
* be equal to pd_lower and pd_lower will be set to the end of the written
* data.
*/
static void
WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf,
uint32 len, bool clear)
{
Buffer buffer = ReadBuffer(rel, blockno);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
Page page = BufferGetPage(buffer);
PageHeader phdr = (PageHeader) page;
if (PageIsNew(page) || clear)
{
PageInit(page, BLCKSZ, 0);
}
if (phdr->pd_lower != offset || phdr->pd_upper - offset < len)
{
elog(ERROR,
"attempt to write columnar data of length %d to offset %d of block %d of relation %d",
len, offset, blockno, rel->rd_id);
}
START_CRIT_SECTION();
memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, buf, len);
phdr->pd_lower += len;
MarkBufferDirty(buffer);
if (RelationNeedsWAL(rel))
{
XLogBeginInsert();
/*
 * Since columnar mostly writes whole pages, we force the transmission
 * of the whole page image in the buffer.
*/
XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE);
XLogRecPtr recptr = XLogInsert(RM_GENERIC_ID, 0);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
UnlockReleaseBuffer(buffer);
}
/*
* AlignReservation - given an unused logical byte offset, align it so that it
* falls at the start of a page.
*
* XXX: Reconsider whether we want/need to do this at all.
*/
static uint64
AlignReservation(uint64 prevReservation)
{
PhysicalAddr prevAddr = LogicalToPhysical(prevReservation);
uint64 alignedReservation = prevReservation;
if (prevAddr.offset != SizeOfPageHeaderData)
{
/* not aligned; align on beginning of next page */
PhysicalAddr initial = { 0 };
initial.blockno = prevAddr.blockno + 1;
initial.offset = SizeOfPageHeaderData;
alignedReservation = PhysicalToLogical(initial);
}
Assert(alignedReservation >= prevReservation);
return alignedReservation;
}
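/*
 * Worked example, assuming BLCKSZ = 8192 and SizeOfPageHeaderData = 24,
 * i.e. 8168 usable bytes per page:
 *
 *   AlignReservation(16336) == 16336  (first usable byte of block 2,
 *                                      already aligned)
 *   AlignReservation(16400) == 24504  (mid-page in block 2, so aligned to
 *                                      the first usable byte of block 3)
 */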
/*
* ColumnarMetapageIsCurrent - is the metapage at the latest version?
*/
static bool
ColumnarMetapageIsCurrent(ColumnarMetapage *metapage)
{
return (metapage->versionMajor == COLUMNAR_VERSION_MAJOR &&
metapage->versionMinor == COLUMNAR_VERSION_MINOR);
}
/*
* ColumnarMetapageIsOlder - is the metapage older than the current version?
*/
static bool
ColumnarMetapageIsOlder(ColumnarMetapage *metapage)
{
return (metapage->versionMajor < COLUMNAR_VERSION_MAJOR ||
(metapage->versionMajor == COLUMNAR_VERSION_MAJOR &&
(int) metapage->versionMinor < (int) COLUMNAR_VERSION_MINOR));
}
/*
* ColumnarMetapageIsNewer - is the metapage newer than the current version?
*/
static bool
ColumnarMetapageIsNewer(ColumnarMetapage *metapage)
{
return (metapage->versionMajor > COLUMNAR_VERSION_MAJOR ||
(metapage->versionMajor == COLUMNAR_VERSION_MAJOR &&
metapage->versionMinor > COLUMNAR_VERSION_MINOR));
}
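/*
 * Worked example: this patch bumps the columnar format version from 1.7
 * to 2.0, so a metapage stamped 1.7 satisfies ColumnarMetapageIsOlder,
 * one stamped 2.0 satisfies ColumnarMetapageIsCurrent, and a hypothetical
 * 2.1 metapage would satisfy ColumnarMetapageIsNewer.
 */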
/*
* ColumnarMetapageCheckVersion - throw an error if accessing old
* version of metapage.
*/
static void
ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage)
{
if (!ColumnarMetapageIsCurrent(metapage))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg(
"attempted to access relation \"%s\", which uses an older columnar format",
RelationGetRelationName(rel)),
errdetail(
"Columnar format version %d.%d is required, \"%s\" has version %d.%d.",
COLUMNAR_VERSION_MAJOR, COLUMNAR_VERSION_MINOR,
RelationGetRelationName(rel),
metapage->versionMajor, metapage->versionMinor),
errhint(OLD_METAPAGE_VERSION_HINT)));
}
}

View File

@ -51,10 +51,12 @@
#include "columnar/columnar.h" #include "columnar/columnar.h"
#include "columnar/columnar_customscan.h" #include "columnar/columnar_customscan.h"
#include "columnar/columnar_storage.h"
#include "columnar/columnar_tableam.h" #include "columnar/columnar_tableam.h"
#include "columnar/columnar_version_compat.h" #include "columnar/columnar_version_compat.h"
#include "distributed/commands.h" #include "distributed/commands.h"
#include "distributed/commands/utility_hook.h" #include "distributed/commands/utility_hook.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h" #include "distributed/metadata_cache.h"
/* /*
@ -81,12 +83,6 @@ typedef struct ColumnarScanDescData
MemoryContext scanContext; MemoryContext scanContext;
Bitmapset *attr_needed; Bitmapset *attr_needed;
List *scanQual; List *scanQual;
/*
* ANALYZE requires an item pointer for sorting. We keep track of row
* number so we can construct an item pointer based on that.
*/
uint64 rowNumber;
} ColumnarScanDescData; } ColumnarScanDescData;
typedef struct ColumnarScanDescData *ColumnarScanDesc; typedef struct ColumnarScanDescData *ColumnarScanDesc;
@ -115,6 +111,21 @@ static void TruncateColumnar(Relation rel, int elevel);
static HeapTuple ColumnarSlotCopyHeapTuple(TupleTableSlot *slot); static HeapTuple ColumnarSlotCopyHeapTuple(TupleTableSlot *slot);
static void ColumnarCheckLogicalReplication(Relation rel); static void ColumnarCheckLogicalReplication(Relation rel);
static Datum * detoast_values(TupleDesc tupleDesc, Datum *orig_values, bool *isnull); static Datum * detoast_values(TupleDesc tupleDesc, Datum *orig_values, bool *isnull);
static ItemPointerData row_number_to_tid(uint64 rowNumber);
static uint64 tid_to_row_number(ItemPointerData tid);
static void ErrorIfInvalidRowNumber(uint64 rowNumber);
static void ColumnarReportTotalVirtualBlocks(Relation relation, Snapshot snapshot,
int progressArrIndex);
static BlockNumber ColumnarGetNumberOfVirtualBlocks(Relation relation, Snapshot snapshot);
static ItemPointerData ColumnarGetHighestItemPointer(Relation relation,
Snapshot snapshot);
static double ColumnarReadRowsIntoIndex(TableScanDesc scan,
Relation indexRelation,
IndexInfo *indexInfo,
bool progress,
IndexBuildCallback indexCallback,
void *indexCallbackState,
EState *estate, ExprState *predicate);
/* Custom tuple slot ops used for columnar. Initialized in columnar_tableam_init(). */ /* Custom tuple slot ops used for columnar. Initialized in columnar_tableam_init(). */
static TupleTableSlotOps TTSOpsColumnar; static TupleTableSlotOps TTSOpsColumnar;
@ -264,8 +275,9 @@ columnar_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlo
ExecClearTuple(slot); ExecClearTuple(slot);
uint64 rowNumber;
bool nextRowFound = ColumnarReadNextRow(scan->cs_readState, slot->tts_values, bool nextRowFound = ColumnarReadNextRow(scan->cs_readState, slot->tts_values,
slot->tts_isnull); slot->tts_isnull, &rowNumber);
if (!nextRowFound) if (!nextRowFound)
{ {
@ -274,65 +286,130 @@ columnar_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlo
ExecStoreVirtualTuple(slot); ExecStoreVirtualTuple(slot);
/* slot->tts_tid = row_number_to_tid(rowNumber);
* Set slot's item pointer block & offset to non-zero. These are
* used just for sorting in acquire_sample_rows(), so rowNumber
* is good enough. See ColumnarSlotCopyHeapTuple for more info.
*
* offset is 16-bits, so use the first 15 bits for offset and
* rest as block number.
*/
ItemPointerSetBlockNumber(&(slot->tts_tid), scan->rowNumber / (32 * 1024) + 1);
ItemPointerSetOffsetNumber(&(slot->tts_tid), scan->rowNumber % (32 * 1024) + 1);
scan->rowNumber++;
return true; return true;
} }
/*
* row_number_to_tid maps given rowNumber to ItemPointerData.
*/
static ItemPointerData
row_number_to_tid(uint64 rowNumber)
{
ErrorIfInvalidRowNumber(rowNumber);
ItemPointerData tid = { 0 };
ItemPointerSetBlockNumber(&tid, rowNumber / VALID_ITEMPOINTER_OFFSETS);
ItemPointerSetOffsetNumber(&tid, rowNumber % VALID_ITEMPOINTER_OFFSETS +
FirstOffsetNumber);
return tid;
}
/*
* tid_to_row_number maps given ItemPointerData to rowNumber.
*/
static uint64
tid_to_row_number(ItemPointerData tid)
{
uint64 rowNumber = ItemPointerGetBlockNumber(&tid) * VALID_ITEMPOINTER_OFFSETS +
ItemPointerGetOffsetNumber(&tid) - FirstOffsetNumber;
ErrorIfInvalidRowNumber(rowNumber);
return rowNumber;
}
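/*
 * The two helpers above are inverses of each other. A sketch of the round
 * trip, where VALID_ITEMPOINTER_OFFSETS (defined in columnar_tableam.h,
 * not shown in this diff) is the number of offset values usable per
 * "virtual" block:
 *
 *   tid_to_row_number(row_number_to_tid(rowNumber)) == rowNumber
 *
 * e.g. rowNumber = VALID_ITEMPOINTER_OFFSETS + 5 maps to the tid
 * (blockNumber = 1, offsetNumber = FirstOffsetNumber + 5) and back.
 */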
/*
* ErrorIfInvalidRowNumber errors out if given rowNumber is invalid.
*/
static void
ErrorIfInvalidRowNumber(uint64 rowNumber)
{
if (rowNumber == COLUMNAR_INVALID_ROW_NUMBER)
{
/* not expected but be on the safe side */
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
errmsg("unexpected row number for columnar table")));
}
else if (rowNumber > COLUMNAR_MAX_ROW_NUMBER)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("columnar tables can't have row numbers "
"greater than " UINT64_FORMAT,
(uint64) COLUMNAR_MAX_ROW_NUMBER),
errhint("Consider using VACUUM FULL for your table")));
}
}
static Size static Size
columnar_parallelscan_estimate(Relation rel) columnar_parallelscan_estimate(Relation rel)
{ {
elog(ERROR, "columnar_parallelscan_estimate not implemented"); return sizeof(ParallelBlockTableScanDescData);
} }
static Size static Size
columnar_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan) columnar_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
{ {
elog(ERROR, "columnar_parallelscan_initialize not implemented"); ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
bpscan->base.phs_relid = RelationGetRelid(rel);
bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
bpscan->base.phs_syncscan = synchronize_seqscans &&
!RelationUsesLocalBuffers(rel) &&
bpscan->phs_nblocks > NBuffers / 4;
SpinLockInit(&bpscan->phs_mutex);
bpscan->phs_startblock = InvalidBlockNumber;
pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
return sizeof(ParallelBlockTableScanDescData);
} }
static void static void
columnar_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) columnar_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
{ {
elog(ERROR, "columnar_parallelscan_reinitialize not implemented"); ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
} }
static IndexFetchTableData * static IndexFetchTableData *
columnar_index_fetch_begin(Relation rel) columnar_index_fetch_begin(Relation rel)
{ {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), Oid relfilenode = rel->rd_node.relNode;
errmsg("indexes not supported for columnar tables"))); if (PendingWritesInUpperTransactions(relfilenode, GetCurrentSubTransactionId()))
{
/* XXX: maybe we can just flush the data and continue */
elog(ERROR, "cannot read from index when there is unflushed data in "
"upper transactions");
}
FlushWriteStateForRelfilenode(relfilenode, GetCurrentSubTransactionId());
IndexFetchTableData *scan = palloc0(sizeof(IndexFetchTableData));
scan->rel = rel;
return scan;
} }
static void static void
columnar_index_fetch_reset(IndexFetchTableData *scan) columnar_index_fetch_reset(IndexFetchTableData *scan)
{ {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), /* no-op */
errmsg("indexes not supported for columnar tables")));
} }
static void static void
columnar_index_fetch_end(IndexFetchTableData *scan) columnar_index_fetch_end(IndexFetchTableData *scan)
{ {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), columnar_index_fetch_reset(scan);
errmsg("indexes not supported for columnar tables"))); pfree(scan);
} }
@ -343,8 +420,37 @@ columnar_index_fetch_tuple(struct IndexFetchTableData *scan,
TupleTableSlot *slot, TupleTableSlot *slot,
bool *call_again, bool *all_dead) bool *call_again, bool *all_dead)
{ {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), /* no HOT chains are possible in columnar, directly set it to false */
errmsg("indexes not supported for columnar tables"))); *call_again = false;
/*
 * No dead tuples are possible in columnar; set it to false if it's
 * passed as non-NULL.
*/
if (all_dead)
{
*all_dead = false;
}
ExecClearTuple(slot);
/* we need all columns */
int natts = scan->rel->rd_att->natts;
Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1);
TupleDesc relationTupleDesc = RelationGetDescr(scan->rel);
List *relationColumnList = NeededColumnsList(relationTupleDesc, attr_needed);
uint64 rowNumber = tid_to_row_number(*tid);
if (!ColumnarReadRowByRowNumber(scan->rel, rowNumber, relationColumnList,
slot->tts_values, slot->tts_isnull, snapshot))
{
return false;
}
slot->tts_tableOid = RelationGetRelid(scan->rel);
slot->tts_tid = *tid;
ExecStoreVirtualTuple(slot);
return true;
} }
@ -411,7 +517,8 @@ columnar_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
Datum *values = detoast_values(slot->tts_tupleDescriptor, Datum *values = detoast_values(slot->tts_tupleDescriptor,
slot->tts_values, slot->tts_isnull); slot->tts_values, slot->tts_isnull);
ColumnarWriteRow(writeState, values, slot->tts_isnull); uint64 writtenRowNumber = ColumnarWriteRow(writeState, values, slot->tts_isnull);
slot->tts_tid = row_number_to_tid(writtenRowNumber);
MemoryContextSwitchTo(oldContext); MemoryContextSwitchTo(oldContext);
MemoryContextReset(ColumnarWritePerTupleContext(writeState)); MemoryContextReset(ColumnarWritePerTupleContext(writeState));
@ -457,7 +564,10 @@ columnar_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
Datum *values = detoast_values(tupleSlot->tts_tupleDescriptor, Datum *values = detoast_values(tupleSlot->tts_tupleDescriptor,
tupleSlot->tts_values, tupleSlot->tts_isnull); tupleSlot->tts_values, tupleSlot->tts_isnull);
ColumnarWriteRow(writeState, values, tupleSlot->tts_isnull); uint64 writtenRowNumber = ColumnarWriteRow(writeState, values,
tupleSlot->tts_isnull);
tupleSlot->tts_tid = row_number_to_tid(writtenRowNumber);
MemoryContextReset(ColumnarWritePerTupleContext(writeState)); MemoryContextReset(ColumnarWritePerTupleContext(writeState));
} }
@ -516,17 +626,24 @@ columnar_relation_set_new_filenode(Relation rel,
errmsg("unlogged columnar tables are not supported"))); errmsg("unlogged columnar tables are not supported")));
} }
Oid oldRelfilenode = rel->rd_node.relNode; /*
* If existing and new relfilenode are different, that means the existing
* storage was dropped and we also need to clean up the metadata and
* state. If they are equal, this is a new relation object and we don't
* need to clean anything.
*/
if (rel->rd_node.relNode != newrnode->relNode)
{
MarkRelfilenodeDropped(rel->rd_node.relNode, GetCurrentSubTransactionId());
MarkRelfilenodeDropped(oldRelfilenode, GetCurrentSubTransactionId()); DeleteMetadataRows(rel->rd_node);
}
/* delete old relfilenode metadata */
DeleteMetadataRows(rel->rd_node);
*freezeXid = RecentXmin; *freezeXid = RecentXmin;
*minmulti = GetOldestMultiXactId(); *minmulti = GetOldestMultiXactId();
SMgrRelation srel = RelationCreateStorage(*newrnode, persistence); SMgrRelation srel = RelationCreateStorage(*newrnode, persistence);
ColumnarStorageInit(srel, ColumnarMetadataNewStorageId());
InitColumnarOptions(rel->rd_id); InitColumnarOptions(rel->rd_id);
smgrclose(srel); smgrclose(srel);
@ -554,7 +671,9 @@ columnar_relation_nontransactional_truncate(Relation rel)
*/ */
RelationTruncate(rel, 0); RelationTruncate(rel, 0);
/* we will lazily initialize new metadata in first stripe reservation */ uint64 storageId = ColumnarMetadataNewStorageId();
RelationOpenSmgr(rel);
ColumnarStorageInit(rel->rd_smgr, storageId);
} }
@ -588,7 +707,8 @@ columnar_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
if (OldIndex != NULL || use_sort) if (OldIndex != NULL || use_sort)
{ {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("indexes not supported for columnar tables"))); errmsg("clustering columnar tables using indexes is "
"not supported")));
} }
/* /*
@ -619,7 +739,8 @@ columnar_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
*num_tuples = 0; *num_tuples = 0;
while (ColumnarReadNextRow(readState, values, nulls)) /* we don't need to know rowNumber here */
while (ColumnarReadNextRow(readState, values, nulls, NULL))
{ {
ColumnarWriteRow(writeState, values, nulls); ColumnarWriteRow(writeState, values, nulls);
(*num_tuples)++; (*num_tuples)++;
@ -667,6 +788,14 @@ static void
columnar_vacuum_rel(Relation rel, VacuumParams *params, columnar_vacuum_rel(Relation rel, VacuumParams *params,
BufferAccessStrategy bstrategy) BufferAccessStrategy bstrategy)
{ {
/*
 * If the metapage version of the relation is older, we hint users to
 * VACUUM the relation in ColumnarMetapageCheckVersion. So, if needed,
 * upgrade the metapage before doing anything else.
*/
bool isUpgrade = true;
ColumnarStorageUpdateIfNeeded(rel, isUpgrade);
int elevel = (params->options & VACOPT_VERBOSE) ? INFO : DEBUG2; int elevel = (params->options & VACOPT_VERBOSE) ? INFO : DEBUG2;
/* this should have been resolved by vacuum.c until now */ /* this should have been resolved by vacuum.c until now */
@ -840,34 +969,25 @@ TruncateColumnar(Relation rel, int elevel)
return; return;
} }
RelationOpenSmgr(rel);
BlockNumber old_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
RelationCloseSmgr(rel);
/* /*
* Due to the AccessExclusive lock there's no danger that * Due to the AccessExclusive lock there's no danger that
* new stripes be added beyond highestPhysicalAddress while * new stripes be added beyond highestPhysicalAddress while
* we're truncating. * we're truncating.
*/ */
SmgrAddr highestPhysicalAddress = uint64 newDataReservation = Max(GetHighestUsedAddress(rel->rd_node) + 1,
logical_to_smgr(GetHighestUsedAddress(rel->rd_node)); ColumnarFirstLogicalOffset);
/* RelationOpenSmgr(rel);
* Unlock and return if truncation won't reduce data file's size. BlockNumber old_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
*/
BlockNumber new_rel_pages = Min(old_rel_pages, if (!ColumnarStorageTruncate(rel, newDataReservation))
highestPhysicalAddress.blockno + 1);
if (new_rel_pages == old_rel_pages)
{ {
UnlockRelation(rel, AccessExclusiveLock); UnlockRelation(rel, AccessExclusiveLock);
return; return;
} }
/* RelationOpenSmgr(rel);
* Truncate the storage. Note that RelationTruncate() takes care of BlockNumber new_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
* Write Ahead Logging.
*/
RelationTruncate(rel, new_rel_pages);
/* /*
* We can release the exclusive lock as soon as we have truncated. * We can release the exclusive lock as soon as we have truncated.
@ -964,7 +1084,7 @@ columnar_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
static double static double
columnar_index_build_range_scan(Relation heapRelation, columnar_index_build_range_scan(Relation columnarRelation,
Relation indexRelation, Relation indexRelation,
IndexInfo *indexInfo, IndexInfo *indexInfo,
bool allow_sync, bool allow_sync,
@ -976,8 +1096,278 @@ columnar_index_build_range_scan(Relation heapRelation,
void *callback_state, void *callback_state,
TableScanDesc scan) TableScanDesc scan)
{ {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), if (start_blockno != 0 || numblocks != InvalidBlockNumber)
errmsg("indexes not supported for columnar tables"))); {
/*
* Columnar utility hook already errors out for BRIN indexes on columnar
* tables, but be on the safe side.
*/
ereport(ERROR, (errmsg("BRIN indexes on columnar tables are not supported")));
}
if (indexInfo->ii_Concurrent)
{
/* we already don't allow CONCURRENTLY syntax but be on the safe side */
ereport(ERROR, (errmsg("concurrent index builds are not supported "
"for columnar tables")));
}
if (scan)
{
/*
* Scan is initialized iff postgres decided to build the index using
* parallel workers. In this case, we simply return for parallel
* workers since we don't support parallel scan on columnar tables.
*/
if (IsBackgroundWorker)
{
ereport(DEBUG4, (errmsg("ignoring parallel worker when building "
"index since parallel scan on columnar "
"tables is not supported")));
return 0;
}
ereport(NOTICE, (errmsg("falling back to serial index build since "
"parallel scan on columnar tables is not "
"supported")));
}
/*
* In a normal index build, we use SnapshotAny to retrieve all tuples. In
* a concurrent build or during bootstrap, we take a regular MVCC snapshot
* and index whatever's live according to that.
*/
TransactionId OldestXmin = InvalidTransactionId;
/*
* We already don't allow concurrent index builds so ii_Concurrent
* will always be false, but let's keep the code close to heapAM.
*/
if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
{
/* ignore lazy VACUUM's */
OldestXmin = GetOldestXmin(columnarRelation, PROCARRAY_FLAGS_VACUUM);
}
Snapshot snapshot = { 0 };
bool snapshotRegisteredByUs = false;
if (!scan)
{
/*
* For serial index build, we begin our own scan. We may also need to
* register a snapshot whose lifetime is under our direct control.
*/
if (!TransactionIdIsValid(OldestXmin))
{
snapshot = RegisterSnapshot(GetTransactionSnapshot());
snapshotRegisteredByUs = true;
}
else
{
snapshot = SnapshotAny;
}
int nkeys = 0;
ScanKeyData *scanKey = NULL;
bool allowAccessStrategy = true;
scan = table_beginscan_strat(columnarRelation, snapshot, nkeys, scanKey,
allowAccessStrategy, allow_sync);
}
else
{
/*
 * For a parallel index build, we don't register/unregister our own
 * snapshot since the snapshot is taken from the parallel scan. Note that
 * even though we don't support parallel index builds, we still continue
 * building the index via the main backend, and we should still rely on
 * the snapshot provided by the parallel scan.
*/
snapshot = scan->rs_snapshot;
}
if (progress)
{
ColumnarReportTotalVirtualBlocks(columnarRelation, snapshot,
PROGRESS_SCAN_BLOCKS_TOTAL);
}
/*
* Set up execution state for predicate, if any.
* Note that this is only useful for partial indexes.
*/
EState *estate = CreateExecutorState();
ExprContext *econtext = GetPerTupleExprContext(estate);
econtext->ecxt_scantuple = table_slot_create(columnarRelation, NULL);
ExprState *predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
double reltuples = ColumnarReadRowsIntoIndex(scan, indexRelation, indexInfo,
progress, callback, callback_state,
estate, predicate);
table_endscan(scan);
if (progress)
{
/* report the last "virtual" block as "done" */
ColumnarReportTotalVirtualBlocks(columnarRelation, snapshot,
PROGRESS_SCAN_BLOCKS_DONE);
}
if (snapshotRegisteredByUs)
{
UnregisterSnapshot(snapshot);
}
ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
FreeExecutorState(estate);
indexInfo->ii_ExpressionsState = NIL;
indexInfo->ii_PredicateState = NULL;
return reltuples;
}
/*
 * ColumnarReportTotalVirtualBlocks reports index build progress based on
 * the number of "virtual" blocks that the given relation has.
 * The "progressArrIndex" argument determines which entry in the
 * st_progress_param array should be updated; we only expect
 * PROGRESS_SCAN_BLOCKS_TOTAL or PROGRESS_SCAN_BLOCKS_DONE, to specify
 * whether the calculated number of "virtual" blocks is reported as the
 * "total" to scan or as "done".
*/
static void
ColumnarReportTotalVirtualBlocks(Relation relation, Snapshot snapshot,
int progressArrIndex)
{
/*
 * Indeed, columnar tables might have gaps between row numbers, e.g.,
 * due to aborted transactions. Also, ItemPointer BlockNumbers
 * for columnar tables don't correspond to actual disk blocks
 * as in heapAM. For this reason, we call them "virtual" blocks. At
 * the moment, we believe it is better to report our progress based on
 * this "virtual" block concept instead of reporting nothing.
*/
Assert(progressArrIndex == PROGRESS_SCAN_BLOCKS_TOTAL ||
progressArrIndex == PROGRESS_SCAN_BLOCKS_DONE);
BlockNumber nvirtualBlocks =
ColumnarGetNumberOfVirtualBlocks(relation, snapshot);
pgstat_progress_update_param(progressArrIndex, nvirtualBlocks);
}
/*
 * ColumnarGetNumberOfVirtualBlocks returns the total number of "virtual"
 * blocks that the given columnar table has, based on ItemPointer BlockNumbers.
*/
static BlockNumber
ColumnarGetNumberOfVirtualBlocks(Relation relation, Snapshot snapshot)
{
ItemPointerData highestItemPointer =
ColumnarGetHighestItemPointer(relation, snapshot);
if (!ItemPointerIsValid(&highestItemPointer))
{
/* table is empty according to our snapshot */
return 0;
}
/*
* Since BlockNumber is 0-based, increment it by 1 to find the total
* number of "virtual" blocks.
*/
return ItemPointerGetBlockNumber(&highestItemPointer) + 1;
}
/*
 * ColumnarGetHighestItemPointer returns the ItemPointerData of the tuple
 * with the highest tid in the given relation.
 * If the relation is empty, it returns an invalid item pointer.
*/
static ItemPointerData
ColumnarGetHighestItemPointer(Relation relation, Snapshot snapshot)
{
StripeMetadata *stripeWithHighestRowNumber =
FindStripeWithHighestRowNumber(relation, snapshot);
if (stripeWithHighestRowNumber == NULL)
{
/* table is empty according to our snapshot */
ItemPointerData invalidItemPtr;
ItemPointerSetInvalid(&invalidItemPtr);
return invalidItemPtr;
}
uint64 highestRowNumber = stripeWithHighestRowNumber->firstRowNumber +
stripeWithHighestRowNumber->rowCount - 1;
return row_number_to_tid(highestRowNumber);
}
/*
 * ColumnarReadRowsIntoIndex builds indexRelation tuples by reading the
 * underlying relation via the given "scan" and returns the number of
 * tuples scanned to build the indexRelation.
*/
static double
ColumnarReadRowsIntoIndex(TableScanDesc scan, Relation indexRelation,
IndexInfo *indexInfo, bool progress,
IndexBuildCallback indexCallback,
void *indexCallbackState, EState *estate,
ExprState *predicate)
{
double reltuples = 0;
BlockNumber lastReportedBlockNumber = InvalidBlockNumber;
ExprContext *econtext = GetPerTupleExprContext(estate);
TupleTableSlot *slot = econtext->ecxt_scantuple;
while (columnar_getnextslot(scan, ForwardScanDirection, slot))
{
CHECK_FOR_INTERRUPTS();
BlockNumber currentBlockNumber = ItemPointerGetBlockNumber(&slot->tts_tid);
if (progress && lastReportedBlockNumber != currentBlockNumber)
{
/*
 * columnar_getnextslot guarantees that the returned tuple always
 * has a greater ItemPointer than the ones we fetched before,
 * so we directly use its BlockNumber to report our progress.
*/
Assert(lastReportedBlockNumber == InvalidBlockNumber ||
currentBlockNumber >= lastReportedBlockNumber);
pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
currentBlockNumber);
lastReportedBlockNumber = currentBlockNumber;
}
MemoryContextReset(econtext->ecxt_per_tuple_memory);
if (predicate != NULL && !ExecQual(predicate, econtext))
{
/* for partial indexes, discard tuples that don't satisfy the predicate */
continue;
}
Datum indexValues[INDEX_MAX_KEYS];
bool indexNulls[INDEX_MAX_KEYS];
FormIndexDatum(indexInfo, slot, estate, indexValues, indexNulls);
ItemPointerData itemPointerData = slot->tts_tid;
/* currently, columnar tables can't have dead tuples */
bool tupleIsAlive = true;
#if PG_VERSION_NUM >= PG_VERSION_13
indexCallback(indexRelation, &itemPointerData, indexValues, indexNulls,
tupleIsAlive, indexCallbackState);
#else
HeapTuple scanTuple = ExecCopySlotHeapTuple(slot);
scanTuple->t_self = itemPointerData;
indexCallback(indexRelation, scanTuple, indexValues, indexNulls,
tupleIsAlive, indexCallbackState);
#endif
reltuples++;
}
return reltuples;
} }
@ -988,8 +1378,15 @@ columnar_index_validate_scan(Relation heapRelation,
Snapshot snapshot, Snapshot snapshot,
ValidateIndexState *state) ValidateIndexState *state)
{ {
/*
* This is only called for concurrent index builds,
* see table_index_validate_scan.
 * Note that we already error out for concurrent index
 * builds in the utility hook, but be on the safe side.
*/
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("indexes not supported for columnar tables"))); errmsg("concurrent index builds are not supported for "
"columnar tables")));
} }
@ -1179,13 +1576,7 @@ ColumnarSlotCopyHeapTuple(TupleTableSlot *slot)
slot->tts_values, slot->tts_values,
slot->tts_isnull); slot->tts_isnull);
/* /* slot->tts_tid is filled in columnar_getnextslot */
* We need to set item pointer, since implementation of ANALYZE
* requires it. See the qsort in acquire_sample_rows() and
* also compare_rows in backend/commands/analyze.c.
*
* slot->tts_tid is filled in columnar_getnextslot.
*/
tuple->t_self = slot->tts_tid; tuple->t_self = slot->tts_tid;
return tuple; return tuple;
@ -1315,24 +1706,34 @@ ColumnarProcessUtility(PlannedStmt *pstmt,
{ {
IndexStmt *indexStmt = (IndexStmt *) parsetree; IndexStmt *indexStmt = (IndexStmt *) parsetree;
/* Relation rel = relation_openrv(indexStmt->relation,
* We should reject CREATE INDEX CONCURRENTLY before DefineIndex() is GetCreateIndexRelationLockMode(indexStmt));
* called. Erroring in callbacks called from DefineIndex() will create if (rel->rd_tableam == GetColumnarTableAmRoutine())
* the index and mark it as INVALID, which will cause segfault during
* inserts.
*/
if (indexStmt->concurrent)
{ {
Relation rel = relation_openrv(indexStmt->relation, /*
ShareUpdateExclusiveLock); * We should reject CREATE INDEX CONCURRENTLY before DefineIndex() is
if (rel->rd_tableam == GetColumnarTableAmRoutine()) * called. Erroring in callbacks called from DefineIndex() will create
* the index and mark it as INVALID, which will cause segfault during
* inserts.
*/
if (indexStmt->concurrent)
{ {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("indexes not supported for columnar tables"))); errmsg("concurrent index commands are not "
"supported for columnar tables")));
} }
RelationClose(rel); /* for now, we don't support index access methods other than btree & hash */
if (strncmp(indexStmt->accessMethod, "btree", NAMEDATALEN) != 0 &&
strncmp(indexStmt->accessMethod, "hash", NAMEDATALEN) != 0)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("only btree and hash indexes are supported on "
"columnar tables ")));
}
} }
RelationClose(rel);
} }
PrevProcessUtilityHook(pstmt, queryString, context, PrevProcessUtilityHook(pstmt, queryString, context,
@ -1374,6 +1775,17 @@ static const TableAmRoutine columnar_am_methods = {
.scan_rescan = columnar_rescan, .scan_rescan = columnar_rescan,
.scan_getnextslot = columnar_getnextslot, .scan_getnextslot = columnar_getnextslot,
/*
 * Postgres calls the following three callbacks during index builds, if it
 * decides to use parallel workers when building the index. We don't
 * support parallel scans on columnar tables, but we still want to fall
 * back to a serial index build. For this reason, we both skip parallel
 * workers in columnar_index_build_range_scan and provide basic
 * implementations for those callbacks based on their corresponding
 * implementations in heapAM.
 * Note that for regular query plans, we already ignore parallel paths via
 * ColumnarSetRelPathlistHook.
*/
.parallelscan_estimate = columnar_parallelscan_estimate, .parallelscan_estimate = columnar_parallelscan_estimate,
.parallelscan_initialize = columnar_parallelscan_initialize, .parallelscan_initialize = columnar_parallelscan_initialize,
.parallelscan_reinitialize = columnar_parallelscan_reinitialize, .parallelscan_reinitialize = columnar_parallelscan_reinitialize,
@ -1840,3 +2252,75 @@ alter_columnar_table_reset(PG_FUNCTION_ARGS)
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
/*
* upgrade_columnar_storage - upgrade columnar storage to the current
* version.
*
* DDL:
* CREATE OR REPLACE FUNCTION upgrade_columnar_storage(rel regclass)
* RETURNS VOID
* STRICT
* LANGUAGE c AS 'MODULE_PATHNAME', 'upgrade_columnar_storage';
*/
PG_FUNCTION_INFO_V1(upgrade_columnar_storage);
Datum
upgrade_columnar_storage(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
/*
 * An ACCESS EXCLUSIVE LOCK is not required by the low-level routines, so
 * we could take only an ACCESS SHARE LOCK. But all access to non-current
 * columnar tables will fail anyway, so it's better to take an ACCESS
 * EXCLUSIVE LOCK now.
*/
Relation rel = table_open(relid, AccessExclusiveLock);
if (!IsColumnarTableAmTable(relid))
{
ereport(ERROR, (errmsg("table %s is not a columnar table",
quote_identifier(RelationGetRelationName(rel)))));
}
ColumnarStorageUpdateIfNeeded(rel, true);
table_close(rel, AccessExclusiveLock);
PG_RETURN_VOID();
}
/*
* downgrade_columnar_storage - downgrade columnar storage to the
* current version.
*
* DDL:
* CREATE OR REPLACE FUNCTION downgrade_columnar_storage(rel regclass)
* RETURNS VOID
* STRICT
* LANGUAGE c AS 'MODULE_PATHNAME', 'downgrade_columnar_storage';
*/
PG_FUNCTION_INFO_V1(downgrade_columnar_storage);
Datum
downgrade_columnar_storage(PG_FUNCTION_ARGS)
{
Oid relid = PG_GETARG_OID(0);
/*
 * An ACCESS EXCLUSIVE LOCK is not required by the low-level routines, so
 * we could take only an ACCESS SHARE LOCK. But all access to non-current
 * columnar tables will fail anyway, so it's better to take an ACCESS
 * EXCLUSIVE LOCK now.
*/
Relation rel = table_open(relid, AccessExclusiveLock);
if (!IsColumnarTableAmTable(relid))
{
ereport(ERROR, (errmsg("table %s is not a columnar table",
quote_identifier(RelationGetRelationName(rel)))));
}
ColumnarStorageUpdateIfNeeded(rel, false);
table_close(rel, AccessExclusiveLock);
PG_RETURN_VOID();
}

View File

@ -30,6 +30,7 @@
#include "utils/relfilenodemap.h" #include "utils/relfilenodemap.h"
#include "columnar/columnar.h" #include "columnar/columnar.h"
#include "columnar/columnar_storage.h"
#include "columnar/columnar_version_compat.h" #include "columnar/columnar_version_compat.h"
struct ColumnarWriteState struct ColumnarWriteState
@ -42,6 +43,7 @@ struct ColumnarWriteState
MemoryContext perTupleContext; MemoryContext perTupleContext;
StripeBuffers *stripeBuffers; StripeBuffers *stripeBuffers;
StripeSkipList *stripeSkipList; StripeSkipList *stripeSkipList;
uint64 stripeFirstRowNumber;
ColumnarOptions options; ColumnarOptions options;
ChunkData *chunkData; ChunkData *chunkData;
@ -128,6 +130,7 @@ ColumnarBeginWrite(RelFileNode relfilenode,
writeState->comparisonFunctionArray = comparisonFunctionArray; writeState->comparisonFunctionArray = comparisonFunctionArray;
writeState->stripeBuffers = NULL; writeState->stripeBuffers = NULL;
writeState->stripeSkipList = NULL; writeState->stripeSkipList = NULL;
writeState->stripeFirstRowNumber = COLUMNAR_INVALID_ROW_NUMBER;
writeState->stripeWriteContext = stripeWriteContext; writeState->stripeWriteContext = stripeWriteContext;
writeState->chunkData = chunkData; writeState->chunkData = chunkData;
writeState->compressionBuffer = NULL; writeState->compressionBuffer = NULL;
@ -146,8 +149,10 @@ ColumnarBeginWrite(RelFileNode relfilenode,
* corresponding skip nodes. Then, whole chunk data is compressed at every * corresponding skip nodes. Then, whole chunk data is compressed at every
* rowChunkCount insertion. Then, if row count exceeds stripeMaxRowCount, we flush * rowChunkCount insertion. Then, if row count exceeds stripeMaxRowCount, we flush
* the stripe, and add its metadata to the table footer. * the stripe, and add its metadata to the table footer.
*
* Returns the "row number" assigned to written row.
*/ */
void uint64
ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *columnNulls) ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *columnNulls)
{ {
uint32 columnIndex = 0; uint32 columnIndex = 0;
@ -169,6 +174,14 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu
writeState->stripeSkipList = stripeSkipList; writeState->stripeSkipList = stripeSkipList;
writeState->compressionBuffer = makeStringInfo(); writeState->compressionBuffer = makeStringInfo();
Oid relationId = RelidByRelfilenode(writeState->relfilenode.spcNode,
writeState->relfilenode.relNode);
Relation relation = relation_open(relationId, NoLock);
writeState->stripeFirstRowNumber =
ColumnarStorageReserveRowNumber(relation,
options->stripeRowCount);
relation_close(relation, NoLock);
/* /*
* serializedValueBuffer lives in stripe write memory context so it needs to be * serializedValueBuffer lives in stripe write memory context so it needs to be
* initialized when the stripe is created. * initialized when the stripe is created.
@ -225,6 +238,7 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu
SerializeChunkData(writeState, chunkIndex, chunkRowCount); SerializeChunkData(writeState, chunkIndex, chunkRowCount);
} }
uint64 writtenRowNumber = writeState->stripeFirstRowNumber + stripeBuffers->rowCount;
stripeBuffers->rowCount++; stripeBuffers->rowCount++;
if (stripeBuffers->rowCount >= options->stripeRowCount) if (stripeBuffers->rowCount >= options->stripeRowCount)
{ {
@ -232,6 +246,8 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu
} }
MemoryContextSwitchTo(oldContext); MemoryContextSwitchTo(oldContext);
return writtenRowNumber;
} }
@ -351,80 +367,6 @@ CreateEmptyStripeSkipList(uint32 stripeMaxRowCount, uint32 chunkRowCount,
} }
void
WriteToSmgr(Relation rel, uint64 logicalOffset, char *data, uint32 dataLength)
{
uint64 remaining = dataLength;
Buffer buffer;
while (remaining > 0)
{
SmgrAddr addr = logical_to_smgr(logicalOffset);
RelationOpenSmgr(rel);
BlockNumber nblocks PG_USED_FOR_ASSERTS_ONLY =
smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
Assert(addr.blockno < nblocks);
RelationCloseSmgr(rel);
buffer = ReadBuffer(rel, addr.blockno);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
Page page = BufferGetPage(buffer);
PageHeader phdr = (PageHeader) page;
if (PageIsNew(page))
{
PageInit(page, BLCKSZ, 0);
}
/*
* After a transaction has been rolled-back, we might be
* over-writing the rolledback write, so phdr->pd_lower can be
* different from addr.offset.
*
* We reset pd_lower to reset the rolledback write.
*/
if (phdr->pd_lower > addr.offset)
{
ereport(DEBUG1, (errmsg("over-writing page %u", addr.blockno),
errdetail("This can happen after a roll-back.")));
phdr->pd_lower = addr.offset;
}
Assert(phdr->pd_lower == addr.offset);
START_CRIT_SECTION();
uint64 to_write = Min(phdr->pd_upper - phdr->pd_lower, remaining);
memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, data, to_write);
phdr->pd_lower += to_write;
MarkBufferDirty(buffer);
if (RelationNeedsWAL(rel))
{
XLogBeginInsert();
/*
* Since columnar will mostly write whole pages we force the transmission of the
* whole image in the buffer
*/
XLogRegisterBuffer(0, buffer, REGBUF_FORCE_IMAGE);
XLogRecPtr recptr = XLogInsert(RM_GENERIC_ID, 0);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
UnlockReleaseBuffer(buffer);
data += to_write;
remaining -= to_write;
logicalOffset += to_write;
}
}
/* /*
* FlushStripe flushes current stripe data into the file. The function first ensures * FlushStripe flushes current stripe data into the file. The function first ensures
* the last data chunk for each column is properly serialized and compressed. Then, * the last data chunk for each column is properly serialized and compressed. Then,
@ -502,7 +444,7 @@ FlushStripe(ColumnarWriteState *writeState)
stripeMetadata = ReserveStripe(relation, stripeSize, stripeMetadata = ReserveStripe(relation, stripeSize,
stripeRowCount, columnCount, chunkCount, stripeRowCount, columnCount, chunkCount,
chunkRowCount); chunkRowCount, writeState->stripeFirstRowNumber);
uint64 currentFileOffset = stripeMetadata.fileOffset; uint64 currentFileOffset = stripeMetadata.fileOffset;
@ -527,8 +469,8 @@ FlushStripe(ColumnarWriteState *writeState)
columnBuffers->chunkBuffersArray[chunkIndex]; columnBuffers->chunkBuffersArray[chunkIndex];
StringInfo existsBuffer = chunkBuffers->existsBuffer; StringInfo existsBuffer = chunkBuffers->existsBuffer;
WriteToSmgr(relation, currentFileOffset, ColumnarStorageWrite(relation, currentFileOffset,
existsBuffer->data, existsBuffer->len); existsBuffer->data, existsBuffer->len);
currentFileOffset += existsBuffer->len; currentFileOffset += existsBuffer->len;
} }
@ -538,8 +480,8 @@ FlushStripe(ColumnarWriteState *writeState)
columnBuffers->chunkBuffersArray[chunkIndex]; columnBuffers->chunkBuffersArray[chunkIndex];
StringInfo valueBuffer = chunkBuffers->valueBuffer; StringInfo valueBuffer = chunkBuffers->valueBuffer;
WriteToSmgr(relation, currentFileOffset, ColumnarStorageWrite(relation, currentFileOffset,
valueBuffer->data, valueBuffer->len); valueBuffer->data, valueBuffer->len);
currentFileOffset += valueBuffer->len; currentFileOffset += valueBuffer->len;
} }
} }

View File

@ -0,0 +1,32 @@
/* columnar--10.1-1--10.2-1.sql */
-- For a proper mapping between tid & (stripe, row_num), add a new column to
-- columnar.stripe and define a BTREE index on this column.
-- Also include storage_id column for per-relation scans.
ALTER TABLE columnar.stripe ADD COLUMN first_row_number bigint;
CREATE INDEX stripe_first_row_number_idx ON columnar.stripe USING BTREE(storage_id, first_row_number);
-- Populate first_row_number column of columnar.stripe table.
--
-- For simplicity, we calculate the MAX(row_count) value across all the stripes
-- of all the columnar tables and then use it to populate the first_row_number
-- column. This introduces some gaps; however, we are okay with that since
-- it's already the case with regular INSERT/COPY commands.
DO $$
DECLARE
max_row_count bigint;
-- this should be equal to columnar_storage.h/COLUMNAR_FIRST_ROW_NUMBER
COLUMNAR_FIRST_ROW_NUMBER constant bigint := 1;
BEGIN
SELECT MAX(row_count) INTO max_row_count FROM columnar.stripe;
UPDATE columnar.stripe SET first_row_number = COLUMNAR_FIRST_ROW_NUMBER +
(stripe_num - 1) * max_row_count;
END;
$$;
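-- Illustrative example of the backfill above: if the largest stripe has
-- row_count = 150000, then the stripe with stripe_num 1 gets
-- first_row_number = 1, stripe_num 2 gets 150001, stripe_num 3 gets
-- 300001, and so on. The gaps left by smaller stripes are harmless for
-- the reasons noted above.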
#include "udfs/upgrade_columnar_storage/10.2-1.sql"
#include "udfs/downgrade_columnar_storage/10.2-1.sql"
-- upgrade storage for all columnar relations
SELECT citus_internal.upgrade_columnar_storage(c.oid) FROM pg_class c, pg_am a
WHERE c.relam = a.oid AND amname = 'columnar';

View File

@ -0,0 +1,12 @@
/* columnar--10.2-1--10.1-1.sql */
-- downgrade storage for all columnar relations
SELECT citus_internal.downgrade_columnar_storage(c.oid) FROM pg_class c, pg_am a
WHERE c.relam = a.oid AND amname = 'columnar';
DROP FUNCTION citus_internal.upgrade_columnar_storage(regclass);
DROP FUNCTION citus_internal.downgrade_columnar_storage(regclass);
-- drop "first_row_number" column and the index defined on it
DROP INDEX columnar.stripe_first_row_number_idx;
ALTER TABLE columnar.stripe DROP COLUMN first_row_number;

View File

@ -0,0 +1,7 @@
CREATE OR REPLACE FUNCTION citus_internal.downgrade_columnar_storage(rel regclass)
RETURNS VOID
STRICT
LANGUAGE c AS 'MODULE_PATHNAME', $$downgrade_columnar_storage$$;
COMMENT ON FUNCTION citus_internal.downgrade_columnar_storage(regclass)
IS 'function to downgrade the columnar storage, if necessary';

View File

@ -0,0 +1,7 @@
CREATE OR REPLACE FUNCTION citus_internal.downgrade_columnar_storage(rel regclass)
RETURNS VOID
STRICT
LANGUAGE c AS 'MODULE_PATHNAME', $$downgrade_columnar_storage$$;
COMMENT ON FUNCTION citus_internal.downgrade_columnar_storage(regclass)
IS 'function to downgrade the columnar storage, if necessary';

View File

@ -0,0 +1,7 @@
CREATE OR REPLACE FUNCTION citus_internal.upgrade_columnar_storage(rel regclass)
RETURNS VOID
STRICT
LANGUAGE c AS 'MODULE_PATHNAME', $$upgrade_columnar_storage$$;
COMMENT ON FUNCTION citus_internal.upgrade_columnar_storage(regclass)
IS 'function to upgrade the columnar storage, if necessary';

View File

@ -0,0 +1,7 @@
CREATE OR REPLACE FUNCTION citus_internal.upgrade_columnar_storage(rel regclass)
RETURNS VOID
STRICT
LANGUAGE c AS 'MODULE_PATHNAME', $$upgrade_columnar_storage$$;
COMMENT ON FUNCTION citus_internal.upgrade_columnar_storage(regclass)
IS 'function to upgrade the columnar storage, if necessary';

View File

@ -62,7 +62,6 @@ static List * GenerateIndexParameters(IndexStmt *createIndexStatement);
static DDLJob * GenerateCreateIndexDDLJob(IndexStmt *createIndexStatement, static DDLJob * GenerateCreateIndexDDLJob(IndexStmt *createIndexStatement,
const char *createIndexCommand); const char *createIndexCommand);
static Oid CreateIndexStmtGetRelationId(IndexStmt *createIndexStatement); static Oid CreateIndexStmtGetRelationId(IndexStmt *createIndexStatement);
static LOCKMODE GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement);
static List * CreateIndexTaskList(IndexStmt *indexStmt); static List * CreateIndexTaskList(IndexStmt *indexStmt);
static List * CreateReindexTaskList(Oid relationId, ReindexStmt *reindexStmt); static List * CreateReindexTaskList(Oid relationId, ReindexStmt *reindexStmt);
static void RangeVarCallbackForDropIndex(const RangeVar *rel, Oid relOid, Oid oldRelOid, static void RangeVarCallbackForDropIndex(const RangeVar *rel, Oid relOid, Oid oldRelOid,
@ -503,7 +502,7 @@ CreateIndexStmtGetRelationId(IndexStmt *createIndexStatement)
* GetCreateIndexRelationLockMode returns required lock mode to open the * GetCreateIndexRelationLockMode returns required lock mode to open the
* relation that given CREATE INDEX command operates on. * relation that given CREATE INDEX command operates on.
*/ */
static LOCKMODE LOCKMODE
GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement) GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement)
{ {
if (createIndexStatement->concurrent) if (createIndexStatement->concurrent)

View File

@ -1,4 +1,3 @@
-- citus--10.1-1--10.2-1 -- citus--10.1-1--10.2-1
-- bump version to 10.2-1 #include "../../columnar/sql/columnar--10.1-1--10.2-1.sql"

View File

@ -1,2 +1,3 @@
-- citus--10.2-1--10.1-1 -- citus--10.2-1--10.1-1
-- this is an empty downgrade path since citus--10.1-1--10.2-1.sql is empty for now
#include "../../../columnar/sql/downgrades/columnar--10.2-1--10.1-1.sql"

View File

@ -25,7 +25,6 @@
#include "distributed/lock_graph.h" #include "distributed/lock_graph.h"
#include "distributed/metadata_cache.h" #include "distributed/metadata_cache.h"
#include "distributed/remote_commands.h" #include "distributed/remote_commands.h"
#include "distributed/resource_lock.h"
#include "distributed/tuplestore.h" #include "distributed/tuplestore.h"
#include "storage/proc.h" #include "storage/proc.h"
#include "utils/builtins.h" #include "utils/builtins.h"
@ -472,18 +471,9 @@ IsProcessWaitingForSafeOperations(PGPROC *proc)
PROCLOCK *waitProcLock = proc->waitProcLock; PROCLOCK *waitProcLock = proc->waitProcLock;
LOCK *waitLock = waitProcLock->tag.myLock; LOCK *waitLock = waitProcLock->tag.myLock;
/*
* Stripe reservation locks are temporary & don't hold until end of
* transaction, so we shouldn't include them in the lock graph.
*/
bool stripeReservationLock =
waitLock->tag.locktag_type == LOCKTAG_ADVISORY &&
waitLock->tag.locktag_field4 == ADV_LOCKTAG_CLASS_COLUMNAR_STRIPE_RESERVATION;
return waitLock->tag.locktag_type == LOCKTAG_RELATION_EXTEND || return waitLock->tag.locktag_type == LOCKTAG_RELATION_EXTEND ||
waitLock->tag.locktag_type == LOCKTAG_PAGE || waitLock->tag.locktag_type == LOCKTAG_PAGE ||
waitLock->tag.locktag_type == LOCKTAG_SPECULATIVE_TOKEN || waitLock->tag.locktag_type == LOCKTAG_SPECULATIVE_TOKEN;
stripeReservationLock;
} }

View File

@ -39,8 +39,8 @@
#define COMPRESSION_LEVEL_MAX 19 #define COMPRESSION_LEVEL_MAX 19
/* Columnar file signature */ /* Columnar file signature */
#define COLUMNAR_VERSION_MAJOR 1 #define COLUMNAR_VERSION_MAJOR 2
#define COLUMNAR_VERSION_MINOR 7 #define COLUMNAR_VERSION_MINOR 0
/* miscellaneous defines */ /* miscellaneous defines */
#define COLUMNAR_TUPLE_COST_MULTIPLIER 10 #define COLUMNAR_TUPLE_COST_MULTIPLIER 10
@ -201,8 +201,8 @@ extern CompressionType ParseCompressionType(const char *compressionTypeString);
extern ColumnarWriteState * ColumnarBeginWrite(RelFileNode relfilenode, extern ColumnarWriteState * ColumnarBeginWrite(RelFileNode relfilenode,
ColumnarOptions options, ColumnarOptions options,
TupleDesc tupleDescriptor); TupleDesc tupleDescriptor);
extern void ColumnarWriteRow(ColumnarWriteState *state, Datum *columnValues, extern uint64 ColumnarWriteRow(ColumnarWriteState *state, Datum *columnValues,
bool *columnNulls); bool *columnNulls);
extern void ColumnarFlushPendingWrites(ColumnarWriteState *state); extern void ColumnarFlushPendingWrites(ColumnarWriteState *state);
extern void ColumnarEndWrite(ColumnarWriteState *state); extern void ColumnarEndWrite(ColumnarWriteState *state);
extern bool ContainsPendingWrites(ColumnarWriteState *state); extern bool ContainsPendingWrites(ColumnarWriteState *state);
@ -214,8 +214,11 @@ extern ColumnarReadState * ColumnarBeginRead(Relation relation,
List *projectedColumnList, List *projectedColumnList,
List *qualConditions); List *qualConditions);
extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues, extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues,
bool *columnNulls); bool *columnNulls, uint64 *rowNumber);
extern void ColumnarRescan(ColumnarReadState *readState); extern void ColumnarRescan(ColumnarReadState *readState);
extern bool ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
List *neededColumnList, Datum *columnValues,
bool *columnNulls, Snapshot snapshot);
extern void ColumnarEndRead(ColumnarReadState *state); extern void ColumnarEndRead(ColumnarReadState *state);
extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state); extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state);
@ -233,17 +236,16 @@ extern void InitColumnarOptions(Oid regclass);
extern void SetColumnarOptions(Oid regclass, ColumnarOptions *options); extern void SetColumnarOptions(Oid regclass, ColumnarOptions *options);
extern bool DeleteColumnarTableOptions(Oid regclass, bool missingOk); extern bool DeleteColumnarTableOptions(Oid regclass, bool missingOk);
extern bool ReadColumnarOptions(Oid regclass, ColumnarOptions *options); extern bool ReadColumnarOptions(Oid regclass, ColumnarOptions *options);
extern void WriteToSmgr(Relation relation, uint64 logicalOffset,
char *data, uint32 dataLength);
extern StringInfo ReadFromSmgr(Relation rel, uint64 offset, uint32 size);
extern bool IsColumnarTableAmTable(Oid relationId); extern bool IsColumnarTableAmTable(Oid relationId);
/* columnar_metadata_tables.c */ /* columnar_metadata_tables.c */
extern void DeleteMetadataRows(RelFileNode relfilenode); extern void DeleteMetadataRows(RelFileNode relfilenode);
extern uint64 ColumnarMetadataNewStorageId(void);
extern uint64 GetHighestUsedAddress(RelFileNode relfilenode); extern uint64 GetHighestUsedAddress(RelFileNode relfilenode);
extern StripeMetadata ReserveStripe(Relation rel, uint64 size, extern StripeMetadata ReserveStripe(Relation rel, uint64 size,
uint64 rowCount, uint64 columnCount, uint64 rowCount, uint64 columnCount,
uint64 chunkCount, uint64 chunkGroupRowCount); uint64 chunkCount, uint64 chunkGroupRowCount,
uint64 stripeFirstRowNumber);
extern void SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe, extern void SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe,
StripeSkipList *stripeSkipList, StripeSkipList *stripeSkipList,
TupleDesc tupleDescriptor); TupleDesc tupleDescriptor);
@ -252,6 +254,10 @@ extern void SaveChunkGroups(RelFileNode relfilenode, uint64 stripe,
extern StripeSkipList * ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, extern StripeSkipList * ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe,
TupleDesc tupleDescriptor, TupleDesc tupleDescriptor,
uint32 chunkCount); uint32 chunkCount);
extern StripeMetadata * FindStripeByRowNumber(Relation relation, uint64 rowNumber,
Snapshot snapshot);
extern StripeMetadata * FindStripeWithHighestRowNumber(Relation relation,
Snapshot snapshot);
extern Datum columnar_relation_storageid(PG_FUNCTION_ARGS);
@@ -271,51 +277,5 @@ extern bool PendingWritesInUpperTransactions(Oid relfilenode,
SubTransactionId currentSubXid);
extern MemoryContext GetWriteContextForDebug(void);
typedef struct SmgrAddr
{
BlockNumber blockno;
uint32 offset;
} SmgrAddr;
/*
* Map logical offsets (as tracked in the metadata) to a physical page and
* offset where the data is kept.
*/
static inline SmgrAddr
logical_to_smgr(uint64 logicalOffset)
{
SmgrAddr addr;
addr.blockno = logicalOffset / COLUMNAR_BYTES_PER_PAGE;
addr.offset = SizeOfPageHeaderData + (logicalOffset % COLUMNAR_BYTES_PER_PAGE);
return addr;
}
/*
* Map a physical page and offset address to a logical address.
*/
static inline uint64
smgr_to_logical(SmgrAddr addr)
{
return COLUMNAR_BYTES_PER_PAGE * addr.blockno + addr.offset - SizeOfPageHeaderData;
}
/*
* Get the first usable address of next block.
*/
static inline SmgrAddr
next_block_start(SmgrAddr addr)
{
SmgrAddr result = {
.blockno = addr.blockno + 1,
.offset = SizeOfPageHeaderData
};
return result;
}
#endif /* COLUMNAR_H */
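The SmgrAddr helpers removed above (in favor of the new columnar_storage layer) encode the mapping between logical byte offsets and physical (block, offset) pairs. A minimal standalone sketch of that arithmetic, assuming a stock PostgreSQL build with BLCKSZ = 8192 and a 24-byte page header (both assumptions, not taken from this diff), verifies the round trip and derives the 16336 value that shows up as reserved_offset for empty tables in the regression outputs below:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BLCKSZ 8192                      /* assumed default block size */
#define SIZE_OF_PAGE_HEADER 24           /* assumed SizeOfPageHeaderData */
#define COLUMNAR_BYTES_PER_PAGE (BLCKSZ - SIZE_OF_PAGE_HEADER)

typedef struct SmgrAddr
{
	uint64_t blockno;
	uint32_t offset;
} SmgrAddr;

static SmgrAddr
logical_to_smgr(uint64_t logicalOffset)
{
	SmgrAddr addr;
	addr.blockno = logicalOffset / COLUMNAR_BYTES_PER_PAGE;
	addr.offset = SIZE_OF_PAGE_HEADER + (logicalOffset % COLUMNAR_BYTES_PER_PAGE);
	return addr;
}

static uint64_t
smgr_to_logical(SmgrAddr addr)
{
	return (uint64_t) COLUMNAR_BYTES_PER_PAGE * addr.blockno
		   + addr.offset - SIZE_OF_PAGE_HEADER;
}

int
main(void)
{
	/* the mapping round-trips for any logical offset */
	for (uint64_t off = 0; off < 3 * COLUMNAR_BYTES_PER_PAGE; off += 123)
	{
		assert(smgr_to_logical(logical_to_smgr(off)) == off);
	}

	/* (8192 - 24) * 2 = 16336: the first logical offset past the two
	 * reserved metapage blocks, i.e. reserved_offset of an empty table */
	printf("%d\n", 2 * COLUMNAR_BYTES_PER_PAGE);
	return 0;
}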
@@ -25,8 +25,10 @@ typedef struct StripeMetadata
uint32 chunkGroupRowCount;
uint64 rowCount;
uint64 id;
uint64 firstRowNumber;
} StripeMetadata;
extern List * StripesForRelfilenode(RelFileNode relfilenode);
extern void ColumnarStorageUpdateIfNeeded(Relation rel, bool isUpgrade);
#endif /* COLUMNAR_METADATA_H */
@@ -0,0 +1,64 @@
/*-------------------------------------------------------------------------
*
* columnar_storage.h
*
* Type and function declarations for storage of columnar data in blocks.
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#ifndef COLUMNAR_STORAGE_H
#define COLUMNAR_STORAGE_H
#include "postgres.h"
#include "storage/smgr.h"
#include "utils/rel.h"
#include "columnar/columnar_tableam.h"
#define COLUMNAR_INVALID_ROW_NUMBER ((uint64) 0)
#define COLUMNAR_FIRST_ROW_NUMBER ((uint64) 1)
#define COLUMNAR_MAX_ROW_NUMBER ((uint64) \
(COLUMNAR_FIRST_ROW_NUMBER + \
VALID_ITEMPOINTER_OFFSETS * \
VALID_BLOCKNUMBERS))
/*
* Logical offsets never fall on the first two physical pages. See
* comments in columnar_storage.c.
*/
#define ColumnarInvalidLogicalOffset 0
#define ColumnarFirstLogicalOffset ((BLCKSZ - SizeOfPageHeaderData) * 2)
#define ColumnarLogicalOffsetIsValid(X) ((X) >= ColumnarFirstLogicalOffset)
extern void ColumnarStorageInit(SMgrRelation srel, uint64 storageId);
extern bool ColumnarStorageIsCurrent(Relation rel);
extern void ColumnarStorageUpdateCurrent(Relation rel, bool upgrade,
uint64 reservedStripeId,
uint64 reservedRowNumber,
uint64 reservedOffset);
extern uint64 ColumnarStorageGetVersionMajor(Relation rel, bool force);
extern uint64 ColumnarStorageGetVersionMinor(Relation rel, bool force);
extern uint64 ColumnarStorageGetStorageId(Relation rel, bool force);
extern uint64 ColumnarStorageGetReservedStripeId(Relation rel, bool force);
extern uint64 ColumnarStorageGetReservedRowNumber(Relation rel, bool force);
extern uint64 ColumnarStorageGetReservedOffset(Relation rel, bool force);
extern uint64 ColumnarStorageReserveData(Relation rel, uint64 amount);
extern uint64 ColumnarStorageReserveRowNumber(Relation rel, uint64 nrows);
extern uint64 ColumnarStorageReserveStripe(Relation rel);
extern void ColumnarStorageRead(Relation rel, uint64 logicalOffset,
char *data, uint32 amount);
extern void ColumnarStorageWrite(Relation rel, uint64 logicalOffset,
char *data, uint32 amount);
extern bool ColumnarStorageTruncate(Relation rel, uint64 newDataReservation);
#endif /* COLUMNAR_STORAGE_H */
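Taken together, the reservation functions above serialize concurrent writers through the metapage. A hedged sketch (not code from this PR; error handling and the metadata-table bookkeeping are omitted, and the helper name is made up) of how a stripe write could drive this API:

#include "postgres.h"

#include "columnar/columnar_storage.h"

/*
 * Hypothetical flow: reserve a stripe id, a row-number range and data
 * space through the metapage, then write the stripe bytes at the logical
 * offset that was handed back.
 */
static uint64
WriteStripeSketch(Relation rel, char *data, uint32 dataLength, uint64 rowCount)
{
	uint64 stripeId = ColumnarStorageReserveStripe(rel);
	uint64 firstRowNumber = ColumnarStorageReserveRowNumber(rel, rowCount);
	uint64 logicalOffset = ColumnarStorageReserveData(rel, dataLength);

	ColumnarStorageWrite(rel, logicalOffset, data, dataLength);

	elog(DEBUG1, "stripe " UINT64_FORMAT " starts at row " UINT64_FORMAT,
		 stripeId, firstRowNumber);

	/* a real caller would persist these in the columnar.stripe metadata */
	return firstRowNumber;
}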
@@ -8,6 +8,40 @@
#include "distributed/coordinator_protocol.h"
/*
* Number of valid ItemPointer offsets for "row number" <> "ItemPointer"
* mapping.
*
* Postgres has some asserts calling either ItemPointerIsValid or
* OffsetNumberIsValid. That constrains itemPointer.offsetNumber
* for columnar tables to the following interval:
* [FirstOffsetNumber, MaxOffsetNumber].
*
* However, bitmap scan logic assumes that itemPointer.offsetNumber cannot
* be larger than MaxHeapTuplesPerPage (see tbm_add_tuples).
*
* For this reason, we restrict itemPointer.offsetNumber
* to the following interval: [FirstOffsetNumber, MaxHeapTuplesPerPage].
*/
#define VALID_ITEMPOINTER_OFFSETS \
((uint64) (MaxHeapTuplesPerPage - FirstOffsetNumber + 1))
/*
* Number of valid ItemPointer block numbers for "row number" <> "ItemPointer"
* mapping.
*
* Similar to VALID_ITEMPOINTER_OFFSETS, due to asserts around
* itemPointer.blockNumber, we can only use values up to and including
* MaxBlockNumber.
* Note that postgres doesn't restrict blockNumber to a lower boundary.
*
* For this reason, we restrict itemPointer.blockNumber
* to the following interval: [0, MaxBlockNumber].
*/
#define VALID_BLOCKNUMBERS ((uint64) (MaxBlockNumber + 1))
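With these two constants, a 1-based row number maps onto a synthetic ItemPointer by plain division and remainder. A sketch of one mapping consistent with the intervals above (the function names are illustrative, not necessarily what the table AM uses; COLUMNAR_FIRST_ROW_NUMBER is 1 per columnar_storage.h):

#include "postgres.h"

#include "access/htup_details.h"	/* MaxHeapTuplesPerPage */
#include "storage/itemptr.h"
#include "storage/off.h"

#define VALID_ITEMPOINTER_OFFSETS \
	((uint64) (MaxHeapTuplesPerPage - FirstOffsetNumber + 1))

static ItemPointerData
RowNumberToItemPointer(uint64 rowNumber)
{
	uint64 index = rowNumber - 1;	/* COLUMNAR_FIRST_ROW_NUMBER == 1 */
	ItemPointerData tid;

	/* offsets stay within [FirstOffsetNumber, MaxHeapTuplesPerPage] */
	ItemPointerSet(&tid,
				   (BlockNumber) (index / VALID_ITEMPOINTER_OFFSETS),
				   (OffsetNumber) (index % VALID_ITEMPOINTER_OFFSETS +
								   FirstOffsetNumber));
	return tid;
}

static uint64
ItemPointerToRowNumber(ItemPointerData tid)
{
	return (uint64) ItemPointerGetBlockNumber(&tid) * VALID_ITEMPOINTER_OFFSETS +
		   (ItemPointerGetOffsetNumber(&tid) - FirstOffsetNumber) + 1;
}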
const TableAmRoutine * GetColumnarTableAmRoutine(void);
extern void columnar_tableam_init(void);
extern void columnar_tableam_finish(void);
@@ -275,6 +275,7 @@ extern char * ChooseIndexName(const char *tabname, Oid namespaceId,
bool primary, bool isconstraint);
extern char * ChooseIndexNameAddition(List *colnames);
extern List * ChooseIndexColumnNames(List *indexElems);
extern LOCKMODE GetCreateIndexRelationLockMode(IndexStmt *createIndexStatement);
extern List * PreprocessReindexStmt(Node *ReindexStatement,
const char *ReindexCommand,
ProcessUtilityContext processUtilityContext);
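GetCreateIndexRelationLockMode, declared above, lets the DDL-processing code pick the relation lock for index builds. A hedged sketch of the distinction it captures (the body below is an assumption based on standard PostgreSQL lock levels, not copied from this PR):

#include "postgres.h"

#include "nodes/parsenodes.h"
#include "storage/lockdefs.h"

/* regular CREATE INDEX blocks writes with ShareLock, while CREATE INDEX
 * CONCURRENTLY only takes ShareUpdateExclusiveLock */
static LOCKMODE
GetCreateIndexRelationLockModeSketch(IndexStmt *createIndexStatement)
{
	return createIndexStatement->concurrent ?
		   ShareUpdateExclusiveLock : ShareLock;
}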
@@ -39,10 +39,7 @@ typedef enum AdvisoryLocktagClass
ADV_LOCKTAG_CLASS_CITUS_REBALANCE_COLOCATION = 7,
ADV_LOCKTAG_CLASS_CITUS_COLOCATED_SHARDS_METADATA = 8,
ADV_LOCKTAG_CLASS_CITUS_OPERATIONS = 9,
ADV_LOCKTAG_CLASS_CITUS_PLACEMENT_CLEANUP = 10
/* Columnar lock types */
ADV_LOCKTAG_CLASS_COLUMNAR_STRIPE_RESERVATION = 11
} AdvisoryLocktagClass;
/* CitusOperations has constants for citus operations */
@@ -102,13 +99,6 @@ typedef enum CitusOperations
(uint32) operationId, \
ADV_LOCKTAG_CLASS_CITUS_OPERATIONS)
#define SET_LOCKTAG_COLUMNAR_STRIPE_RESERVATION(tag, relation) \
SET_LOCKTAG_ADVISORY(tag, \
relation->rd_lockInfo.lockRelId.dbId, \
relation->rd_lockInfo.lockRelId.relId, \
0, \
ADV_LOCKTAG_CLASS_COLUMNAR_STRIPE_RESERVATION)
/* reuse advisory lock, but with different, unused field 4 (10)
* Also it has the database hardcoded to MyDatabaseId, to ensure the locks
* are local to each database */
@@ -3,3 +3,4 @@
test: upgrade_basic_after
test: upgrade_partition_constraints_after
test: upgrade_pg_dist_object_test_after
test: upgrade_columnar_metapage_after
@@ -3,3 +3,4 @@
test: upgrade_basic_before
test: upgrade_partition_constraints_before
test: upgrade_pg_dist_object_test_before
test: upgrade_columnar_metapage_before
@@ -4,7 +4,7 @@ test: multi_test_catalog_views
test: columnar_create
test: columnar_load
test: columnar_query columnar_first_row_number
test: columnar_analyze
test: columnar_data_types
test: columnar_drop
@@ -12,6 +12,14 @@ WITH sample_data AS (VALUES
INSERT INTO test_alter_table SELECT * FROM sample_data;
-- drop a column
ALTER TABLE test_alter_table DROP COLUMN a;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_alter_table');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 2 | 150001 | 16402
(1 row)
-- test analyze
ANALYZE test_alter_table;
-- verify select queries run as expected
@@ -59,6 +67,14 @@ SELECT * FROM test_alter_table;
3 | 5 | 8
(5 rows)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_alter_table');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 4 | 450001 | 32724
(1 row)
-- add a fixed-length column with default value
ALTER TABLE test_alter_table ADD COLUMN e int default 3;
SELECT * from test_alter_table;
@@ -83,6 +99,14 @@ SELECT * from test_alter_table;
1 | 2 | 4 | 8
(6 rows)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_alter_table');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 5 | 600001 | 40906
(1 row)
-- add a variable-length column with default value
ALTER TABLE test_alter_table ADD COLUMN f text DEFAULT 'TEXT ME';
SELECT * from test_alter_table;
@@ -231,7 +255,6 @@ insert into atacc1 values(1);
alter table atacc1
add column b float8 not null default random(),
add primary key(a);
ERROR: indexes not supported for columnar tables
-- Add a generated column with an expression value
create table test_gen_ex (x int) using columnar;
INSERT INTO test_gen_ex VALUES (1), (2), (3);
@@ -366,30 +389,30 @@ SELECT * FROM products ORDER BY 1;
3 | pen | 2
(3 rows)
-- Add a UNIQUE constraint
CREATE TABLE products_unique (
product_no integer UNIQUE,
name text,
price numeric
) USING columnar;
ERROR: indexes not supported for columnar tables
ALTER TABLE products ADD COLUMN store_id text UNIQUE;
-- Add a PRIMARY KEY constraint
CREATE TABLE products_primary (
product_no integer PRIMARY KEY,
name text,
price numeric
) USING columnar;
BEGIN;
ALTER TABLE products DROP COLUMN store_id;
ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY;
ERROR: column "store_id" contains null values
ROLLBACK;
-- Add an EXCLUSION constraint (should fail)
CREATE TABLE circles (
c circle,
EXCLUDE USING gist (c WITH &&)
) USING columnar;
ERROR: only btree and hash indexes are supported on columnar tables
-- Row level security
CREATE TABLE public.row_level_security_col (id int, pgUser CHARACTER VARYING) USING columnar;
CREATE USER user1;
@@ -5,11 +5,14 @@
CREATE TABLE contestant (handle TEXT, birthdate DATE, rating INT,
percentile FLOAT, country CHAR(3), achievements TEXT[])
USING columnar;
SELECT alter_columnar_table_set('contestant', compression => 'none');
alter_columnar_table_set
---------------------------------------------------------------------
(1 row)
CREATE INDEX contestant_idx on contestant(handle);
-- Create zstd compressed table
CREATE TABLE contestant_compressed (handle TEXT, birthdate DATE, rating INT,
percentile FLOAT, country CHAR(3), achievements TEXT[])
USING columnar;
@@ -52,6 +52,23 @@ select count(*) from t_compressed;
0
(1 row)
-- check storage
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_compressed');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 1 | 1 | 16336
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_uncompressed');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 1 | 1 | 16336
(1 row)
-- explain
explain (costs off, summary off, timing off) select * from t_uncompressed;
QUERY PLAN
@@ -68,16 +85,16 @@ explain (costs off, summary off, timing off) select * from t_compressed;
-- vacuum
vacuum verbose t_compressed;
INFO: statistics for "t_compressed":
storage id: xxxxx
total file size: 16384, total data size: 0
compression rate: 1.00x
total row count: 0, stripe count: 0, average rows per stripe: 0
chunk count: 0, containing data for dropped columns: 0
vacuum verbose t_uncompressed;
INFO: statistics for "t_uncompressed":
storage id: xxxxx
total file size: 16384, total data size: 0
compression rate: 1.00x
total row count: 0, stripe count: 0, average rows per stripe: 0
chunk count: 0, containing data for dropped columns: 0
@@ -85,6 +102,23 @@ chunk count: 0, containing data for dropped columns: 0
-- vacuum full
vacuum full t_compressed;
vacuum full t_uncompressed;
-- check storage
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_compressed');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 1 | 1 | 16336
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_uncompressed');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 1 | 1 | 16336
(1 row)
-- analyze
analyze t_uncompressed;
analyze t_compressed;
@@ -94,6 +128,23 @@ truncate t_compressed;
-- alter type
alter table t_uncompressed alter column a type text;
alter table t_compressed alter column a type text;
-- check storage
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_compressed');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 1 | 1 | 16336
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_uncompressed');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 1 | 1 | 16336
(1 row)
-- verify cost of scanning an empty table is zero, not NaN
explain table t_uncompressed;
QUERY PLAN
@@ -0,0 +1,56 @@
CREATE SCHEMA columnar_first_row_number;
SET search_path TO columnar_first_row_number;
CREATE TABLE col_table_1 (a int) USING columnar;
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 10) i;
BEGIN;
-- we don't reuse the same first_row_number even if the xact is rolled back
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 11) i;
ROLLBACK;
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 12) i;
SELECT alter_columnar_table_set('col_table_1', stripe_row_limit => 1000);
alter_columnar_table_set
---------------------------------------------------------------------
(1 row)
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 2350) i;
SELECT row_count, first_row_number FROM columnar.stripe a
WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
ORDER BY stripe_num;
row_count | first_row_number
---------------------------------------------------------------------
10 | 1
12 | 300001
1000 | 450001
1000 | 451001
350 | 452001
(5 rows)
VACUUM FULL col_table_1;
-- show that we properly update first_row_number after VACUUM FULL
SELECT row_count, first_row_number FROM columnar.stripe a
WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
ORDER BY stripe_num;
row_count | first_row_number
---------------------------------------------------------------------
1000 | 1
1000 | 1001
372 | 2001
(3 rows)
TRUNCATE col_table_1;
BEGIN;
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 16) i;
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 16) i;
COMMIT;
-- show that we start with first_row_number=1 after TRUNCATE
SELECT row_count, first_row_number FROM columnar.stripe a
WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
ORDER BY stripe_num;
row_count | first_row_number
---------------------------------------------------------------------
32 | 1
(1 row)
SET client_min_messages TO ERROR;
DROP SCHEMA columnar_first_row_number CASCADE;
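The first_row_number values in this file follow from two rules: each write reserves row numbers in stripe_row_limit-sized blocks (150000 by default), and a reservation is never reused even when the reserving transaction rolls back. A back-of-the-envelope check under those assumptions:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t stripeRowLimit = 150000;	/* assumed default */
	uint64_t next = 1;			/* COLUMNAR_FIRST_ROW_NUMBER */

	uint64_t insert1 = next;	/* 10 rows -> first_row_number 1 */
	next += stripeRowLimit;
	uint64_t rolledBack = next;	/* 11 rows, rolled back, range still consumed */
	next += stripeRowLimit;
	uint64_t insert2 = next;	/* 12 rows -> first_row_number 300001 */

	printf("%llu %llu %llu\n",	/* prints: 1 150001 300001 */
		   (unsigned long long) insert1,
		   (unsigned long long) rolledBack,
		   (unsigned long long) insert2);
	return 0;
}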
@@ -10,7 +10,7 @@ SET search_path TO columnar_indexes, public;
--
create table t(a int, b int) using columnar;
create index CONCURRENTLY t_idx on t(a, b);
ERROR: concurrent index commands are not supported for columnar tables
\d t
Table "columnar_indexes.t"
Column | Type | Collation | Nullable | Default
@@ -32,16 +32,15 @@ SELECT * FROM t;
1 | 2
(1 row)
-- create index without the concurrent option. We should
-- error out during index creation.
create index t_idx on t(a, b);
ERROR: indexes not supported for columnar tables
\d t
Table "columnar_indexes.t"
Column | Type | Collation | Nullable | Default
---------------------------------------------------------------------
a | integer | | |
b | integer | | |
Indexes:
"t_idx" btree (a, b)
explain insert into t values (1, 2);
QUERY PLAN
@@ -58,5 +57,347 @@ SELECT * FROM t;
3 | 4
(2 rows)
-- make sure that we test index scan
set columnar.enable_custom_scan to 'off';
set enable_seqscan to off;
CREATE table columnar_table (a INT, b int) USING columnar;
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(0, 16000) i;
-- unique --
BEGIN;
INSERT INTO columnar_table VALUES (100000000);
SAVEPOINT s1;
-- errors out due to unflushed data in upper transaction
CREATE UNIQUE INDEX ON columnar_table (a);
ERROR: cannot read from table when there is unflushed data in upper transactions
ROLLBACK;
CREATE UNIQUE INDEX ON columnar_table (a);
BEGIN;
INSERT INTO columnar_table VALUES (16050);
SAVEPOINT s1;
-- index scan errors out due to unflushed data in upper transaction
SELECT a FROM columnar_table WHERE a = 16050;
ERROR: cannot read from index when there is unflushed data in upper transactions
ROLLBACK;
EXPLAIN (COSTS OFF) SELECT * FROM columnar_table WHERE a=6456;
QUERY PLAN
---------------------------------------------------------------------
Index Scan using columnar_table_a_idx on columnar_table
Index Cond: (a = 6456)
(2 rows)
EXPLAIN (COSTS OFF) SELECT a FROM columnar_table WHERE a=6456;
QUERY PLAN
---------------------------------------------------------------------
Index Only Scan using columnar_table_a_idx on columnar_table
Index Cond: (a = 6456)
(2 rows)
SELECT (SELECT a FROM columnar_table WHERE a=6456 limit 1)=6456;
?column?
---------------------------------------------------------------------
t
(1 row)
SELECT (SELECT b FROM columnar_table WHERE a=6456 limit 1)=6456*2;
?column?
---------------------------------------------------------------------
t
(1 row)
-- even if a=16050 doesn't exist, we try to insert it twice so this should error out
INSERT INTO columnar_table VALUES (16050), (16050);
ERROR: duplicate key value violates unique constraint "columnar_table_a_idx"
DETAIL: Key (a)=(16050) already exists.
-- should work
INSERT INTO columnar_table VALUES (16050);
-- check edge cases around stripe boundaries, error out
INSERT INTO columnar_table VALUES (16050);
ERROR: duplicate key value violates unique constraint "columnar_table_a_idx"
DETAIL: Key (a)=(16050) already exists.
INSERT INTO columnar_table VALUES (15999);
ERROR: duplicate key value violates unique constraint "columnar_table_a_idx"
DETAIL: Key (a)=(15999) already exists.
DROP INDEX columnar_table_a_idx;
CREATE TABLE partial_unique_idx_test (a INT, b INT) USING columnar;
CREATE UNIQUE INDEX ON partial_unique_idx_test (a)
WHERE b > 500;
-- should work since b <= 500 and our partial index doesn't check this interval
INSERT INTO partial_unique_idx_test VALUES (1, 2), (1, 2);
-- should work since our partial index wouldn't cover the tuples that we inserted above
INSERT INTO partial_unique_idx_test VALUES (1, 800);
INSERT INTO partial_unique_idx_test VALUES (4, 600);
-- should error out due to (4, 600)
INSERT INTO partial_unique_idx_test VALUES (4, 700);
ERROR: duplicate key value violates unique constraint "partial_unique_idx_test_a_idx"
DETAIL: Key (a)=(4) already exists.
-- btree --
CREATE INDEX ON columnar_table (a);
SELECT (SELECT SUM(b) FROM columnar_table WHERE a>700 and a<965)=439560;
?column?
---------------------------------------------------------------------
t
(1 row)
CREATE INDEX ON columnar_table (b)
WHERE (b > 30000 AND b < 33000);
-- partial index should be way smaller than the non-partial index
SELECT pg_total_relation_size('columnar_table_b_idx') * 5 <
pg_total_relation_size('columnar_table_a_idx');
?column?
---------------------------------------------------------------------
t
(1 row)
-- can't use index scan due to partial index boundaries
EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30000;
QUERY PLAN
---------------------------------------------------------------------
Seq Scan on columnar_table
Filter: (b = 30000)
(2 rows)
-- can use index scan
EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30001;
QUERY PLAN
---------------------------------------------------------------------
Index Only Scan using columnar_table_b_idx on columnar_table
Index Cond: (b = 30001)
(2 rows)
-- some more rows
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 17000) i;
DROP INDEX columnar_table_a_idx;
TRUNCATE columnar_table;
-- pkey --
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 16499) i;
ALTER TABLE columnar_table ADD PRIMARY KEY (a);
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16500, 17000) i;
BEGIN;
INSERT INTO columnar_table (a) SELECT 1;
ROLLBACK;
-- should work
INSERT INTO columnar_table (a) SELECT 1;
-- error out
INSERT INTO columnar_table VALUES (16100), (16101);
ERROR: duplicate key value violates unique constraint "columnar_table_pkey"
DETAIL: Key (a)=(16100) already exists.
INSERT INTO columnar_table VALUES (16999);
ERROR: duplicate key value violates unique constraint "columnar_table_pkey"
DETAIL: Key (a)=(16999) already exists.
BEGIN;
REINDEX INDEX columnar_table_pkey;
-- should error even after reindex
INSERT INTO columnar_table VALUES (16999);
ERROR: duplicate key value violates unique constraint "columnar_table_pkey"
DETAIL: Key (a)=(16999) already exists.
ROLLBACK;
VACUUM FULL columnar_table;
-- show that we don't support clustering columnar tables using indexes
CLUSTER columnar_table USING columnar_table_pkey;
ERROR: clustering columnar tables using indexes is not supported
ALTER TABLE columnar_table CLUSTER ON columnar_table_pkey;
CLUSTER columnar_table;
ERROR: clustering columnar tables using indexes is not supported
-- should error even after vacuum
INSERT INTO columnar_table VALUES (16999);
ERROR: duplicate key value violates unique constraint "columnar_table_pkey"
DETAIL: Key (a)=(16999) already exists.
TRUNCATE columnar_table;
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i;
SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
?column?
---------------------------------------------------------------------
t
(1 row)
TRUNCATE columnar_table;
ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
-- hash --
INSERT INTO columnar_table (a, b) SELECT i*2,i FROM generate_series(1, 8000) i;
CREATE INDEX hash_idx ON columnar_table USING HASH (b);
BEGIN;
CREATE INDEX hash_idx_fill_factor ON columnar_table USING HASH (b) WITH (fillfactor=10);
-- same hash index with lower fillfactor should be way bigger
SELECT pg_total_relation_size ('hash_idx_fill_factor') >
pg_total_relation_size ('hash_idx') * 5;
?column?
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
BEGIN;
INSERT INTO columnar_table (a, b) SELECT i*3,i FROM generate_series(1, 8000) i;
ROLLBACK;
INSERT INTO columnar_table (a, b) SELECT i*4,i FROM generate_series(1, 8000) i;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
?column?
---------------------------------------------------------------------
t
(1 row)
BEGIN;
REINDEX TABLE columnar_table;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
?column?
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
VACUUM FULL columnar_table;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
?column?
---------------------------------------------------------------------
t
(1 row)
-- exclusion constraints --
CREATE TABLE exclusion_test (c1 INT,c2 INT, c3 INT, c4 BOX,
EXCLUDE USING btree (c1 WITH =) INCLUDE(c3,c4) WHERE (c1 < 10)) USING columnar;
-- error out since "c1" is "1" for all rows to be inserted
INSERT INTO exclusion_test SELECT 1, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
ERROR: conflicting key value violates exclusion constraint "exclusion_test_c1_c3_c4_excl"
DETAIL: Key (c1)=(1) conflicts with existing key (c1)=(1).
BEGIN;
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
ROLLBACK;
-- should work
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
BEGIN;
-- should work thanks to "where" clause in exclusion constraint
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
ROLLBACK;
REINDEX TABLE exclusion_test;
-- should still work after reindex
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
-- make sure that we respect INCLUDE syntax --
CREATE TABLE include_test (a INT, b BIGINT, c BIGINT, d BIGINT) USING columnar;
INSERT INTO include_test SELECT i, i, i, i FROM generate_series (1, 1000) i;
CREATE UNIQUE INDEX unique_a ON include_test (a);
-- cannot use index only scan
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
QUERY PLAN
---------------------------------------------------------------------
Index Scan using unique_a on include_test
Index Cond: (a = 500)
(2 rows)
CREATE UNIQUE INDEX unique_a_include_b_c_d ON include_test (a) INCLUDE(b, c, d);
-- same unique index that includes other columns should be way bigger
SELECT pg_total_relation_size ('unique_a') * 1.5 <
pg_total_relation_size ('unique_a_include_b_c_d');
?column?
---------------------------------------------------------------------
t
(1 row)
DROP INDEX unique_a;
-- should use index only scan since unique_a_include_b_c_d includes column "b" too
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
QUERY PLAN
---------------------------------------------------------------------
Index Only Scan using unique_a_include_b_c_d on include_test
Index Cond: (a = 500)
(2 rows)
BEGIN;
SET enable_indexonlyscan = OFF;
-- show that we respect enable_indexonlyscan GUC
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
QUERY PLAN
---------------------------------------------------------------------
Index Scan using unique_a_include_b_c_d on include_test
Index Cond: (a = 500)
(2 rows)
ROLLBACK;
-- make sure that we read the correct value for "b" when doing index only scan
SELECT b=980 FROM include_test WHERE a = 980;
?column?
---------------------------------------------------------------------
t
(1 row)
-- some tests with distributed & partitioned tables --
CREATE TABLE dist_part_table(
dist_col INT,
part_col TIMESTAMPTZ,
col1 TEXT
) PARTITION BY RANGE (part_col);
-- create an index before creating a columnar partition
CREATE INDEX dist_part_table_btree ON dist_part_table (col1);
-- columnar partition
CREATE TABLE p0 PARTITION OF dist_part_table
FOR VALUES FROM ('2020-01-01') TO ('2020-02-01')
USING columnar;
SELECT create_distributed_table('dist_part_table', 'dist_col');
create_distributed_table
---------------------------------------------------------------------
(1 row)
-- columnar partition
CREATE TABLE p1 PARTITION OF dist_part_table
FOR VALUES FROM ('2020-02-01') TO ('2020-03-01')
USING columnar;
-- row partition
CREATE TABLE p2 PARTITION OF dist_part_table
FOR VALUES FROM ('2020-03-01') TO ('2020-04-01');
INSERT INTO dist_part_table VALUES (1, '2020-03-15', 'str1', POINT(1, 1));
ERROR: INSERT has more expressions than target columns
-- insert into columnar partitions
INSERT INTO dist_part_table VALUES (1, '2020-01-15', 'str2', POINT(2, 2));
ERROR: INSERT has more expressions than target columns
INSERT INTO dist_part_table VALUES (1, '2020-02-15', 'str3', POINT(3, 3));
ERROR: INSERT has more expressions than target columns
-- create another index after creating a columnar partition
CREATE UNIQUE INDEX dist_part_table_unique ON dist_part_table (dist_col, part_col);
-- verify that indexes are created on columnar partitions
SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p0';
?column?
---------------------------------------------------------------------
t
(1 row)
SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p1';
?column?
---------------------------------------------------------------------
t
(1 row)
-- unsupported index types --
-- gin --
CREATE TABLE testjsonb (j JSONB) USING columnar;
INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(1,10) i;
CREATE INDEX jidx ON testjsonb USING GIN (j);
ERROR: only btree and hash indexes are supported on columnar tables
INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(15,20) i;
-- gist --
CREATE TABLE gist_point_tbl(id INT4, p POINT) USING columnar;
INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(1, 10) g;
CREATE INDEX gist_pointidx ON gist_point_tbl USING gist(p);
ERROR: only btree and hash indexes are supported on columnar tables
INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(10, 20) g;
-- sp gist --
CREATE TABLE box_temp (f1 box) USING columnar;
INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
CREATE INDEX box_spgist ON box_temp USING spgist (f1);
ERROR: only btree and hash indexes are supported on columnar tables
INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
-- brin --
CREATE TABLE brin_summarize (value int) USING columnar;
CREATE INDEX brin_summarize_idx ON brin_summarize USING brin (value) WITH (pages_per_range=2);
ERROR: only btree and hash indexes are supported on columnar tables
-- Show that we safely fallback to serial index build.
CREATE TABLE parallel_scan_test(a int) USING columnar WITH ( parallel_workers = 2 );
INSERT INTO parallel_scan_test SELECT i FROM generate_series(1,10) i;
CREATE INDEX ON parallel_scan_test (a);
NOTICE: falling back to serial index build since parallel scan on columnar tables is not supported
VACUUM FULL parallel_scan_test;
NOTICE: falling back to serial index build since parallel scan on columnar tables is not supported
REINDEX TABLE parallel_scan_test;
NOTICE: falling back to serial index build since parallel scan on columnar tables is not supported
SET client_min_messages TO WARNING;
DROP SCHEMA columnar_indexes CASCADE;
@@ -45,6 +45,14 @@ select count(*) from test_insert_command;
3
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_insert_command');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 4 | 450001 | 32686
(1 row)
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
consistent
---------------------------------------------------------------------
@@ -141,6 +149,14 @@ FROM test_toast_columnar;
5004 | 5004 | 5004 | 5004
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_toast_columnar');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 2 | 150001 | 16428
(1 row)
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
consistent
---------------------------------------------------------------------
@@ -173,6 +189,14 @@ INSERT INTO zero_col_heap SELECT * FROM zero_col_heap;
INSERT INTO zero_col_heap SELECT * FROM zero_col_heap;
INSERT INTO zero_col_heap SELECT * FROM zero_col_heap;
INSERT INTO zero_col SELECT * FROM zero_col_heap;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('zero_col');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 6 | 750001 | 16336
(1 row)
SELECT relname, stripe_num, chunk_group_count, row_count FROM columnar.stripe a, pg_class b
WHERE columnar_test_helpers.columnar_relation_storageid(b.oid)=a.storage_id AND relname = 'zero_col'
ORDER BY 1,2,3,4;
@@ -14,6 +14,14 @@ SELECT count(*) FROM t;
0
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 1 | 150001 | 16336
(1 row)
-- check that stripe metadata has also been rolled back
SELECT count(*) FROM t_stripes;
count
@@ -46,6 +54,14 @@ SELECT count(*) FROM t; -- force flush
SAVEPOINT s1;
INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 3 | 600001 | 24606
(1 row)
SELECT count(*) FROM t;
count
---------------------------------------------------------------------
@@ -68,6 +84,14 @@ SELECT count(*) FROM t;
INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i;
COMMIT;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 5 | 750001 | 40942
(1 row)
SELECT count(*) FROM t;
count
---------------------------------------------------------------------
@@ -3,6 +3,16 @@ SET search_path TO columnar_test_helpers;
CREATE FUNCTION columnar_relation_storageid(relid oid) RETURNS bigint
LANGUAGE C STABLE STRICT
AS 'citus', $$columnar_relation_storageid$$;
CREATE OR REPLACE FUNCTION columnar_storage_info(
rel regclass,
version_major OUT int4,
version_minor OUT int4,
storage_id OUT int8,
reserved_stripe_id OUT int8,
reserved_row_number OUT int8,
reserved_offset OUT int8)
STRICT
LANGUAGE c AS 'citus', $$columnar_storage_info$$;
CREATE FUNCTION compression_type_supported(type text) RETURNS boolean
AS $$
BEGIN
@@ -43,7 +43,23 @@ SELECT * FROM columnar_test_helpers.chunk_group_consistency;
t
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('columnar_truncate_test');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 2 | 150001 | 16438
(1 row)
TRUNCATE TABLE columnar_truncate_test;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('columnar_truncate_test');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 1 | 1 | 16336
(1 row)
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
consistent
---------------------------------------------------------------------
@@ -77,7 +93,7 @@ SELECT count(*) FROM columnar_truncate_test_compressed;
SELECT pg_relation_size('columnar_truncate_test_compressed');
pg_relation_size
---------------------------------------------------------------------
16384
(1 row)
INSERT INTO columnar_truncate_test select a, a from generate_series(1, 10) a;
@@ -25,6 +25,14 @@ SELECT count(*) FROM t_stripes;
3
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 4 | 450001 | 32756
(1 row)
-- vacuum full should merge stripes together
VACUUM FULL t;
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
@@ -45,6 +53,14 @@ SELECT count(*) FROM t_stripes;
1
(1 row)
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 2 | 150001 | 16584
(1 row)
-- test the case when all data cannot fit into a single stripe
SELECT alter_columnar_table_set('t', stripe_row_limit => 1000);
alter_columnar_table_set
@@ -66,6 +82,14 @@ SELECT count(*) FROM t_stripes;
(1 row)
VACUUM FULL t;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 4 | 3001 | 53382
(1 row)
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
consistent
---------------------------------------------------------------------
@@ -215,6 +239,14 @@ compression rate: 1.25x
total row count: 5530, stripe count: 5, average rows per stripe: 1106
chunk count: 7, containing data for dropped columns: 0, none compressed: 5, pglz compressed: 2
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 16 | 21001 | 50686
(1 row)
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
consistent
---------------------------------------------------------------------
@@ -140,3 +140,45 @@ a b
11
12
13
starting permutation: s1-truncate s1-begin s1-insert-10000-rows s2-begin s2-insert s2-commit s1-commit s1-verify-metadata
step s1-truncate:
TRUNCATE test_insert_concurrency;
step s1-begin:
BEGIN;
step s1-insert-10000-rows:
INSERT INTO test_insert_concurrency SELECT i, 2 * i FROM generate_series(1, 10000) i;
step s2-begin:
BEGIN;
step s2-insert:
INSERT INTO test_insert_concurrency SELECT i, 2 * i FROM generate_series(4, 6) i;
step s2-commit:
COMMIT;
step s1-commit:
COMMIT;
step s1-verify-metadata:
WITH test_insert_concurrency_stripes AS (
SELECT first_row_number, stripe_num, row_count
FROM columnar.stripe a, pg_class b
WHERE columnar_relation_storageid(b.oid)=a.storage_id AND
relname = 'test_insert_concurrency'
)
SELECT
-- verify that table has two stripes ..
count(*) = 2 AND
-- .. and those stripes look like:
sum(case when stripe_num = 1 AND first_row_number = 150001 AND row_count = 3 then 1 end) = 1 AND
sum(case when stripe_num = 2 AND first_row_number = 1 AND row_count = 10000 then 1 end) = 1
AS stripe_metadata_for_test_insert_concurrency_ok
FROM test_insert_concurrency_stripes;
stripe_metadata_for_test_insert_concurrency_ok
t
@@ -595,9 +595,11 @@ SELECT * FROM print_extension_changes();
-- Snapshot of state at 10.2-1
ALTER EXTENSION citus UPDATE TO '10.2-1';
SELECT * FROM print_extension_changes();
previous_object | current_object
---------------------------------------------------------------------
| function citus_internal.downgrade_columnar_storage(regclass) void
| function citus_internal.upgrade_columnar_storage(regclass) void
(2 rows)
DROP TABLE prev_objects, extension_diff;
-- show running version
@@ -259,7 +259,7 @@ ABORT;
-- all 5 commands below should throw no permission errors
-- read columnar metadata table
SELECT * FROM columnar.stripe;
storage_id | stripe_num | file_offset | data_length | column_count | chunk_row_count | row_count | chunk_group_count | first_row_number
---------------------------------------------------------------------
(0 rows)
@@ -0,0 +1,122 @@
\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"`
SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND
substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0
AS upgrade_test_old_citus_version_ge_10_0;
upgrade_test_old_citus_version_ge_10_0
---------------------------------------------------------------------
t
(1 row)
\gset
\if :upgrade_test_old_citus_version_ge_10_0
\else
\q
\endif
-- it's not the best practice to define this here, but we don't want to include
-- columnar_test_helpers in upgrade test schedule
CREATE OR REPLACE FUNCTION columnar_storage_info(
rel regclass,
version_major OUT int4,
version_minor OUT int4,
storage_id OUT int8,
reserved_stripe_id OUT int8,
reserved_row_number OUT int8,
reserved_offset OUT int8)
STRICT
LANGUAGE c AS 'citus', 'columnar_storage_info';
CREATE VIEW columnar_table_stripe_info AS
SELECT columnar_table_storageids.relname relname,
columnar.stripe.stripe_num stripe_num,
columnar.stripe.row_count row_count,
columnar.stripe.first_row_number first_row_number
FROM columnar.stripe,
(
SELECT c.oid relid, c.relname relname, (columnar_storage_info(c.oid)).storage_id relstorageid
FROM pg_class c, pg_am a
WHERE c.relam = a.oid AND amname = 'columnar'
) columnar_table_storageids
WHERE relstorageid = columnar.stripe.storage_id;
SET search_path TO upgrade_columnar_metapage, public;
-- show that first_row_number values are equal to MAX(row_count) * stripe_num + COLUMNAR_FIRST_ROW_NUMBER
SELECT * FROM columnar_table_stripe_info ORDER BY relname, stripe_num;
relname | stripe_num | row_count | first_row_number
---------------------------------------------------------------------
columnar_table_1 | 1 | 150000 | 1
columnar_table_1 | 2 | 10000 | 150001
columnar_table_2 | 1 | 1000 | 1
columnar_table_2 | 2 | 901 | 150001
columnar_table_3 | 1 | 2 | 1
(5 rows)
-- should work since we upgrade metapages when upgrading schema version
INSERT INTO columnar_table_1 VALUES (3);
-- state of stripe metadata for columnar_table_1 after post-upgrade insert
SELECT * FROM columnar_table_stripe_info WHERE relname = 'columnar_table_1' ORDER BY stripe_num;
relname | stripe_num | row_count | first_row_number
---------------------------------------------------------------------
columnar_table_1 | 1 | 150000 | 1
columnar_table_1 | 2 | 10000 | 150001
columnar_table_1 | 3 | 1 | 160001
(3 rows)
-- show that all columnar relation's metapage's are upgraded to "2.0"
SELECT count(*)=0
FROM (SELECT (columnar_storage_info(c.oid)).* t
FROM pg_class c, pg_am a
WHERE c.relam = a.oid AND amname = 'columnar') t
WHERE t.version_major != 2 and t.version_minor != 0;
?column?
---------------------------------------------------------------------
t
(1 row)
-- print metapage for two of the tables
SELECT columnar_storage_info('columnar_table_1');
columnar_storage_info
---------------------------------------------------------------------
(2,0,10000000000,4,310001,481936)
(1 row)
SELECT columnar_storage_info('columnar_table_2');
columnar_storage_info
---------------------------------------------------------------------
(2,0,10000000001,3,150902,26694)
(1 row)
-- show that no_data_columnar_table also has metapage after upgrade
SELECT columnar_storage_info('no_data_columnar_table');
columnar_storage_info
---------------------------------------------------------------------
(2,0,10000000003,1,1,16336)
(1 row)
-- table is already upgraded, make sure that upgrade_columnar_metapage is no-op
SELECT citus_internal.upgrade_columnar_storage(c.oid)
FROM pg_class c, pg_am a
WHERE c.relam = a.oid AND amname = 'columnar' and relname = 'columnar_table_2';
upgrade_columnar_storage
---------------------------------------------------------------------
(1 row)
SELECT columnar_storage_info('columnar_table_2');
columnar_storage_info
---------------------------------------------------------------------
(2,0,10000000001,3,150902,26694)
(1 row)
VACUUM FULL columnar_table_2;
-- print metapage and stripe metadata after post-upgrade vacuum full
SELECT columnar_storage_info('columnar_table_2');
columnar_storage_info
---------------------------------------------------------------------
(2,0,10000000004,3,2001,26694)
(1 row)
SELECT * FROM columnar_table_stripe_info WHERE relname = 'columnar_table_2' ORDER BY stripe_num;
relname | stripe_num | row_count | first_row_number
---------------------------------------------------------------------
columnar_table_2 | 1 | 1000 | 1
columnar_table_2 | 2 | 901 | 1001
(2 rows)
@@ -0,0 +1,13 @@
\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"`
SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND
substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0
AS upgrade_test_old_citus_version_ge_10_0;
upgrade_test_old_citus_version_ge_10_0
---------------------------------------------------------------------
f
(1 row)
\gset
\if :upgrade_test_old_citus_version_ge_10_0
\else
\q
@@ -0,0 +1,31 @@
\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"`
SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND
substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0
AS upgrade_test_old_citus_version_ge_10_0;
upgrade_test_old_citus_version_ge_10_0
---------------------------------------------------------------------
t
(1 row)
\gset
\if :upgrade_test_old_citus_version_ge_10_0
\else
\q
\endif
CREATE SCHEMA upgrade_columnar_metapage;
SET search_path TO upgrade_columnar_metapage, public;
CREATE TABLE columnar_table_1(a INT, b INT) USING columnar;
INSERT INTO columnar_table_1 SELECT i FROM generate_series(160001, 320000) i;
CREATE TABLE columnar_table_2(b INT) USING columnar;
SELECT alter_columnar_table_set('columnar_table_2',
chunk_group_row_limit => 100,
stripe_row_limit => 1000);
alter_columnar_table_set
---------------------------------------------------------------------
(1 row)
INSERT INTO columnar_table_2 SELECT i FROM generate_series(1600, 3500) i;
CREATE TABLE columnar_table_3(b INT) USING columnar;
INSERT INTO columnar_table_3 VALUES (1), (2);
CREATE TABLE no_data_columnar_table(a INT, b INT, c TEXT) USING columnar;
@@ -0,0 +1,13 @@
\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"`
SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND
substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0
AS upgrade_test_old_citus_version_ge_10_0;
upgrade_test_old_citus_version_ge_10_0
---------------------------------------------------------------------
f
(1 row)
\gset
\if :upgrade_test_old_citus_version_ge_10_0
\else
\q
@@ -57,6 +57,7 @@ ORDER BY 1;
function citus_extradata_container(internal)
function citus_finish_pg_upgrade()
function citus_get_active_worker_nodes()
function citus_internal.downgrade_columnar_storage(regclass)
function citus_internal.find_groupid_for_node(text,integer)
function citus_internal.pg_dist_node_trigger_func()
function citus_internal.pg_dist_rebalance_strategy_trigger_func()
@@ -64,6 +65,7 @@ ORDER BY 1;
function citus_internal.refresh_isolation_tester_prepared_statement()
function citus_internal.replace_isolation_tester_func()
function citus_internal.restore_isolation_tester_func()
function citus_internal.upgrade_columnar_storage(regclass)
function citus_isolation_test_session_is_blocked(integer,integer[])
function citus_json_concatenate(json,json)
function citus_json_concatenate_final(json)
@ -246,5 +248,5 @@ ORDER BY 1;
view citus_worker_stat_activity
view pg_dist_shard_placement
view time_partitions
(232 rows)

View File

@ -53,6 +53,7 @@ ORDER BY 1;
function citus_extradata_container(internal)
function citus_finish_pg_upgrade()
function citus_get_active_worker_nodes()
function citus_internal.downgrade_columnar_storage(regclass)
function citus_internal.find_groupid_for_node(text,integer)
function citus_internal.pg_dist_node_trigger_func()
function citus_internal.pg_dist_rebalance_strategy_enterprise_check()
@ -61,6 +62,7 @@ ORDER BY 1;
function citus_internal.refresh_isolation_tester_prepared_statement()
function citus_internal.replace_isolation_tester_func()
function citus_internal.restore_isolation_tester_func()
function citus_internal.upgrade_columnar_storage(regclass)
function citus_isolation_test_session_is_blocked(integer,integer[])
function citus_json_concatenate(json,json)
function citus_json_concatenate_final(json)
@ -238,5 +240,5 @@ ORDER BY 1;
view citus_worker_stat_activity
view pg_dist_shard_placement
view time_partitions
(224 rows)

View File

@ -15,14 +15,20 @@ COPY contestant FROM '@abs_srcdir@/data/contestants.1.csv' WITH CSV;
-- COPY into uncompressed table from program
COPY contestant FROM PROGRAM 'cat @abs_srcdir@/data/contestants.2.csv' WITH CSV;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('contestant');
-- COPY into compressed table
set columnar.compression = 'pglz';
COPY contestant_compressed FROM '@abs_srcdir@/data/contestants.1.csv' WITH CSV;
-- COPY into uncompressed table from program
COPY contestant_compressed FROM PROGRAM 'cat @abs_srcdir@/data/contestants.2.csv'
WITH CSV;
set columnar.compression to default;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('contestant_compressed');
-- Test column list
CREATE TABLE famous_constants (id int, name text, value real)

View File

@ -14,13 +14,27 @@ DETAIL: command not found
COPY contestant FROM '@abs_srcdir@/data/contestants.1.csv' WITH CSV;
-- COPY into uncompressed table from program
COPY contestant FROM PROGRAM 'cat @abs_srcdir@/data/contestants.2.csv' WITH CSV;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('contestant');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 3 | 300001 | 24742
(1 row)
-- COPY into compressed table
set columnar.compression = 'pglz';
COPY contestant_compressed FROM '@abs_srcdir@/data/contestants.1.csv' WITH CSV;
-- COPY into uncompressed table from program
COPY contestant_compressed FROM PROGRAM 'cat @abs_srcdir@/data/contestants.2.csv'
WITH CSV;
set columnar.compression to default;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('contestant_compressed');
version_major | version_minor | reserved_stripe_id | reserved_row_number | reserved_offset
---------------------------------------------------------------------
2 | 0 | 3 | 300001 | 24704
(1 row)
-- Test column list
CREATE TABLE famous_constants (id int, name text, value real)
USING columnar;

View File

@ -1,6 +1,10 @@
setup
{
CREATE TABLE test_insert_concurrency (a int, b int) USING columnar;
CREATE OR REPLACE FUNCTION columnar_relation_storageid(relid oid) RETURNS bigint
LANGUAGE C STABLE STRICT
AS 'citus', $$columnar_relation_storageid$$;
}
teardown
@ -20,6 +24,11 @@ step "s1-insert"
INSERT INTO test_insert_concurrency SELECT i, 2 * i FROM generate_series(1, 3) i;
}
step "s1-insert-10000-rows"
{
INSERT INTO test_insert_concurrency SELECT i, 2 * i FROM generate_series(1, 10000) i;
}
step "s1-copy" step "s1-copy"
{ {
COPY test_insert_concurrency(a) FROM PROGRAM 'seq 11 13'; COPY test_insert_concurrency(a) FROM PROGRAM 'seq 11 13';
@ -30,6 +39,29 @@ step "s1-select"
SELECT * FROM test_insert_concurrency ORDER BY a;
}
step "s1-truncate"
{
TRUNCATE test_insert_concurrency;
}
step "s1-verify-metadata"
{
WITH test_insert_concurrency_stripes AS (
SELECT first_row_number, stripe_num, row_count
FROM columnar.stripe a, pg_class b
WHERE columnar_relation_storageid(b.oid)=a.storage_id AND
relname = 'test_insert_concurrency'
)
SELECT
-- verify that table has two stripes ..
count(*) = 2 AND
-- .. and those stripes look like:
sum(case when stripe_num = 1 AND first_row_number = 150001 AND row_count = 3 then 1 end) = 1 AND
sum(case when stripe_num = 2 AND first_row_number = 1 AND row_count = 10000 then 1 end) = 1
AS stripe_metadata_for_test_insert_concurrency_ok
FROM test_insert_concurrency_stripes;
}
step "s1-commit" step "s1-commit"
{ {
COMMIT; COMMIT;
@ -65,3 +97,9 @@ permutation "s1-begin" "s2-begin" "s1-copy" "s2-insert" "s1-select" "s2-select"
// insert vs copy
permutation "s1-begin" "s2-begin" "s2-insert" "s1-copy" "s1-select" "s2-select" "s1-commit" "s2-commit" "s1-select"
# insert vs insert
# Session 1 starts inserting first, so it reserves first_row_number = 1, but session 2 commits before session 1.
# Then verify that while the stripe written by session 2 has the greater first_row_number, the stripe written by
# session 1 has the greater stripe_num. This is because stripe_num and first_row_number are reserved at different times.
permutation "s1-truncate" "s1-begin" "s1-insert-10000-rows" "s2-begin" "s2-insert" "s2-commit" "s1-commit" "s1-verify-metadata"

View File

@ -17,6 +17,10 @@ INSERT INTO test_alter_table SELECT * FROM sample_data;
-- drop a column
ALTER TABLE test_alter_table DROP COLUMN a;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_alter_table');
-- test analyze
ANALYZE test_alter_table;
@ -36,6 +40,10 @@ SELECT * FROM test_alter_table;
INSERT INTO test_alter_table (SELECT 3, 5, 8);
SELECT * FROM test_alter_table;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_alter_table');
-- add a fixed-length column with default value
ALTER TABLE test_alter_table ADD COLUMN e int default 3;
@ -43,6 +51,10 @@ SELECT * from test_alter_table;
INSERT INTO test_alter_table (SELECT 1, 2, 4, 8);
SELECT * from test_alter_table;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_alter_table');
-- add a variable-length column with default value
ALTER TABLE test_alter_table ADD COLUMN f text DEFAULT 'TEXT ME';
@ -203,21 +215,25 @@ ALTER TABLE products DROP CONSTRAINT dummy_constraint;
INSERT INTO products VALUES (3, 'pen', 2);
SELECT * FROM products ORDER BY 1;
-- Add a UNIQUE constraint
CREATE TABLE products_unique (
product_no integer UNIQUE,
name text,
price numeric
) USING columnar;
ALTER TABLE products ADD COLUMN store_id text UNIQUE;
-- Add a PRIMARY KEY constraint
CREATE TABLE products_primary (
product_no integer PRIMARY KEY,
name text,
price numeric
) USING columnar;
ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY;
BEGIN;
ALTER TABLE products DROP COLUMN store_id;
ALTER TABLE products ADD COLUMN store_id text PRIMARY KEY;
ROLLBACK;
-- Add an EXCLUSION constraint (should fail)
CREATE TABLE circles (

View File

@ -7,12 +7,11 @@
CREATE TABLE contestant (handle TEXT, birthdate DATE, rating INT,
percentile FLOAT, country CHAR(3), achievements TEXT[])
USING columnar;
SELECT alter_columnar_table_set('contestant', compression => 'none');
-- should fail
CREATE INDEX contestant_idx on contestant(handle);
-- Create zstd compressed table
-- COMPRESSED
CREATE TABLE contestant_compressed (handle TEXT, birthdate DATE, rating INT,
percentile FLOAT, country CHAR(3), achievements TEXT[])
USING columnar;

View File

@ -19,6 +19,14 @@ select count(*) from t_uncompressed;
select * from t_compressed; select * from t_compressed;
select count(*) from t_compressed; select count(*) from t_compressed;
-- check storage
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_compressed');
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_uncompressed');
-- explain -- explain
explain (costs off, summary off, timing off) select * from t_uncompressed; explain (costs off, summary off, timing off) select * from t_uncompressed;
explain (costs off, summary off, timing off) select * from t_compressed; explain (costs off, summary off, timing off) select * from t_compressed;
@ -31,6 +39,14 @@ vacuum verbose t_uncompressed;
vacuum full t_compressed; vacuum full t_compressed;
vacuum full t_uncompressed; vacuum full t_uncompressed;
-- check storage
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_compressed');
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_uncompressed');
-- analyze -- analyze
analyze t_uncompressed; analyze t_uncompressed;
analyze t_compressed; analyze t_compressed;
@ -43,6 +59,14 @@ truncate t_compressed;
alter table t_uncompressed alter column a type text; alter table t_uncompressed alter column a type text;
alter table t_compressed alter column a type text; alter table t_compressed alter column a type text;
-- check storage
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_compressed');
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t_uncompressed');
-- verify cost of scanning an empty table is zero, not NaN -- verify cost of scanning an empty table is zero, not NaN
explain table t_uncompressed; explain table t_uncompressed;
explain table t_compressed; explain table t_compressed;

View File

@ -0,0 +1,43 @@
CREATE SCHEMA columnar_first_row_number;
SET search_path TO columnar_first_row_number;
CREATE TABLE col_table_1 (a int) USING columnar;
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 10) i;
BEGIN;
-- we don't reuse the same first_row_number even if the xact is rolled back
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 11) i;
ROLLBACK;
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 12) i;
SELECT alter_columnar_table_set('col_table_1', stripe_row_limit => 1000);
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 2350) i;
SELECT row_count, first_row_number FROM columnar.stripe a
WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
ORDER BY stripe_num;
VACUUM FULL col_table_1;
-- show that we properly update first_row_number after VACUUM FULL
SELECT row_count, first_row_number FROM columnar.stripe a
WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
ORDER BY stripe_num;
TRUNCATE col_table_1;
BEGIN;
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 16) i;
INSERT INTO col_table_1 SELECT i FROM generate_series(1, 16) i;
COMMIT;
-- show that we start with first_row_number=1 after TRUNCATE
SELECT row_count, first_row_number FROM columnar.stripe a
WHERE a.storage_id = columnar_test_helpers.columnar_relation_storageid('col_table_1'::regclass)
ORDER BY stripe_num;
SET client_min_messages TO ERROR;
DROP SCHEMA columnar_first_row_number CASCADE;
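For intuition, a hedged sketch of the first_row_number arithmetic the test above exercises, assuming the default stripe_row_limit of 150000 and that each write reserves a full stripe's worth of row numbers:
-- INSERT of 10 rows             -> stripe 1, first_row_number = 1
-- rolled-back INSERT of 11 rows -> reserves first_row_number = 150001, which is never reused
-- INSERT of 12 rows             -> stripe 2, first_row_number = 300001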

View File

@ -17,13 +17,283 @@ explain insert into t values (1, 2);
insert into t values (1, 2);
SELECT * FROM t;
-- create index without the concurrent option. We should
-- error out during index creation.
create index t_idx on t(a, b);
\d t
explain insert into t values (1, 2);
insert into t values (3, 4);
SELECT * FROM t;
-- make sure that we test index scan
set columnar.enable_custom_scan to 'off';
set enable_seqscan to off;
CREATE table columnar_table (a INT, b int) USING columnar;
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(0, 16000) i;
-- unique --
BEGIN;
INSERT INTO columnar_table VALUES (100000000);
SAVEPOINT s1;
-- errors out due to unflushed data in upper transaction
CREATE UNIQUE INDEX ON columnar_table (a);
ROLLBACK;
CREATE UNIQUE INDEX ON columnar_table (a);
BEGIN;
INSERT INTO columnar_table VALUES (16050);
SAVEPOINT s1;
-- index scan errors out due to unflushed data in upper transaction
SELECT a FROM columnar_table WHERE a = 16050;
ROLLBACK;
EXPLAIN (COSTS OFF) SELECT * FROM columnar_table WHERE a=6456;
EXPLAIN (COSTS OFF) SELECT a FROM columnar_table WHERE a=6456;
SELECT (SELECT a FROM columnar_table WHERE a=6456 limit 1)=6456;
SELECT (SELECT b FROM columnar_table WHERE a=6456 limit 1)=6456*2;
-- even though a=16050 doesn't exist yet, we try to insert it twice, so this should error out
INSERT INTO columnar_table VALUES (16050), (16050);
-- should work
INSERT INTO columnar_table VALUES (16050);
-- check edge cases around stripe boundaries; both inserts should error out
INSERT INTO columnar_table VALUES (16050);
INSERT INTO columnar_table VALUES (15999);
DROP INDEX columnar_table_a_idx;
CREATE TABLE partial_unique_idx_test (a INT, b INT) USING columnar;
CREATE UNIQUE INDEX ON partial_unique_idx_test (a)
WHERE b > 500;
-- should work since b =< 500 and our partial index doesn't check this interval
INSERT INTO partial_unique_idx_test VALUES (1, 2), (1, 2);
-- should work since our partial index wouldn't cover the tuples that we inserted above
INSERT INTO partial_unique_idx_test VALUES (1, 800);
INSERT INTO partial_unique_idx_test VALUES (4, 600);
-- should error out due to (4, 600)
INSERT INTO partial_unique_idx_test VALUES (4, 700);
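To restate the partial-index semantics above: rows with b <= 500 are never entered into the index, so uniqueness on "a" is simply not enforced for them; for example, this illustrative insert would also be accepted:
-- b = 2 is below the index predicate, so a = 1 may repeat yet again
INSERT INTO partial_unique_idx_test VALUES (1, 2);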
-- btree --
CREATE INDEX ON columnar_table (a);
SELECT (SELECT SUM(b) FROM columnar_table WHERE a>700 and a<965)=439560;
CREATE INDEX ON columnar_table (b)
WHERE (b > 30000 AND b < 33000);
-- partial index should be way smaller than the non-partial index
SELECT pg_total_relation_size('columnar_table_b_idx') * 5 <
pg_total_relation_size('columnar_table_a_idx');
-- can't use index scan due to partial index boundaries
EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30000;
-- can use index scan
EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30001;
-- some more rows
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 17000) i;
DROP INDEX columnar_table_a_idx;
TRUNCATE columnar_table;
-- pkey --
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 16499) i;
ALTER TABLE columnar_table ADD PRIMARY KEY (a);
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16500, 17000) i;
BEGIN;
INSERT INTO columnar_table (a) SELECT 1;
ROLLBACK;
-- should work
INSERT INTO columnar_table (a) SELECT 1;
-- error out
INSERT INTO columnar_table VALUES (16100), (16101);
INSERT INTO columnar_table VALUES (16999);
BEGIN;
REINDEX INDEX columnar_table_pkey;
-- should error even after reindex
INSERT INTO columnar_table VALUES (16999);
ROLLBACK;
VACUUM FULL columnar_table;
-- show that we don't support clustering columnar tables using indexes
CLUSTER columnar_table USING columnar_table_pkey;
ALTER TABLE columnar_table CLUSTER ON columnar_table_pkey;
CLUSTER columnar_table;
-- should error even after vacuum
INSERT INTO columnar_table VALUES (16999);
TRUNCATE columnar_table;
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i;
SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
TRUNCATE columnar_table;
ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
-- hash --
INSERT INTO columnar_table (a, b) SELECT i*2,i FROM generate_series(1, 8000) i;
CREATE INDEX hash_idx ON columnar_table USING HASH (b);
BEGIN;
CREATE INDEX hash_idx_fill_factor ON columnar_table USING HASH (b) WITH (fillfactor=10);
-- same hash index with lower fillfactor should be way bigger
SELECT pg_total_relation_size ('hash_idx_fill_factor') >
pg_total_relation_size ('hash_idx') * 5;
ROLLBACK;
BEGIN;
INSERT INTO columnar_table (a, b) SELECT i*3,i FROM generate_series(1, 8000) i;
ROLLBACK;
INSERT INTO columnar_table (a, b) SELECT i*4,i FROM generate_series(1, 8000) i;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
BEGIN;
REINDEX TABLE columnar_table;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
ROLLBACK;
VACUUM FULL columnar_table;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
-- exclusion constraints --
CREATE TABLE exclusion_test (c1 INT,c2 INT, c3 INT, c4 BOX,
EXCLUDE USING btree (c1 WITH =) INCLUDE(c3,c4) WHERE (c1 < 10)) USING columnar;
-- error out since "c1" is "1" for all rows to be inserted
INSERT INTO exclusion_test SELECT 1, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
BEGIN;
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
ROLLBACK;
-- should work
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
BEGIN;
-- should work thanks to "where" clause in exclusion constraint
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
ROLLBACK;
REINDEX TABLE exclusion_test;
-- should still work after reindex
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
-- make sure that we respect INCLUDE syntax --
CREATE TABLE include_test (a INT, b BIGINT, c BIGINT, d BIGINT) USING columnar;
INSERT INTO include_test SELECT i, i, i, i FROM generate_series (1, 1000) i;
CREATE UNIQUE INDEX unique_a ON include_test (a);
-- cannot use index only scan
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
CREATE UNIQUE INDEX unique_a_include_b_c_d ON include_test (a) INCLUDE(b, c, d);
-- same unique index that includes other columns should be way bigger
SELECT pg_total_relation_size ('unique_a') * 1.5 <
pg_total_relation_size ('unique_a_include_b_c_d');
DROP INDEX unique_a;
-- should use index only scan since unique_a_include_b_c_d includes column "b" too
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
BEGIN;
SET enable_indexonlyscan = OFF;
-- show that we respect enable_indexonlyscan GUC
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
ROLLBACK;
-- make sure that we read the correct value for "b" when doing index only scan
SELECT b=980 FROM include_test WHERE a = 980;
-- some tests with distributed & partitioned tables --
CREATE TABLE dist_part_table(
dist_col INT,
part_col TIMESTAMPTZ,
col1 TEXT
) PARTITION BY RANGE (part_col);
-- create an index before creating a columnar partition
CREATE INDEX dist_part_table_btree ON dist_part_table (col1);
-- columnar partition
CREATE TABLE p0 PARTITION OF dist_part_table
FOR VALUES FROM ('2020-01-01') TO ('2020-02-01')
USING columnar;
SELECT create_distributed_table('dist_part_table', 'dist_col');
-- columnar partition
CREATE TABLE p1 PARTITION OF dist_part_table
FOR VALUES FROM ('2020-02-01') TO ('2020-03-01')
USING columnar;
-- row partition
CREATE TABLE p2 PARTITION OF dist_part_table
FOR VALUES FROM ('2020-03-01') TO ('2020-04-01');
INSERT INTO dist_part_table VALUES (1, '2020-03-15', 'str1', POINT(1, 1));
-- insert into columnar partitions
INSERT INTO dist_part_table VALUES (1, '2020-01-15', 'str2', POINT(2, 2));
INSERT INTO dist_part_table VALUES (1, '2020-02-15', 'str3', POINT(3, 3));
-- create another index after creating a columnar partition
CREATE UNIQUE INDEX dist_part_table_unique ON dist_part_table (dist_col, part_col);
-- verify that indexes are created on columnar partitions
SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p0';
SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p1';
-- unsupported index types --
-- gin --
CREATE TABLE testjsonb (j JSONB) USING columnar;
INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(1,10) i;
CREATE INDEX jidx ON testjsonb USING GIN (j);
INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSON) FROM generate_series(15,20) i;
-- gist --
CREATE TABLE gist_point_tbl(id INT4, p POINT) USING columnar;
INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(1, 10) g;
CREATE INDEX gist_pointidx ON gist_point_tbl USING gist(p);
INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(10, 20) g;
-- sp gist --
CREATE TABLE box_temp (f1 box) USING columnar;
INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
CREATE INDEX box_spgist ON box_temp USING spgist (f1);
INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
-- brin --
CREATE TABLE brin_summarize (value int) USING columnar;
CREATE INDEX brin_summarize_idx ON brin_summarize USING brin (value) WITH (pages_per_range=2);
-- Show that we safely fall back to serial index build.
CREATE TABLE parallel_scan_test(a int) USING columnar WITH ( parallel_workers = 2 );
INSERT INTO parallel_scan_test SELECT i FROM generate_series(1,10) i;
CREATE INDEX ON parallel_scan_test (a);
VACUUM FULL parallel_scan_test;
REINDEX TABLE parallel_scan_test;
SET client_min_messages TO WARNING;
DROP SCHEMA columnar_indexes CASCADE;

View File

@ -22,6 +22,10 @@ select count(*) from test_insert_command_data;
insert into test_insert_command select * from test_insert_command_data;
select count(*) from test_insert_command;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_insert_command');
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
drop table test_insert_command_data;
@ -99,6 +103,10 @@ SELECT
pg_column_size(external), pg_column_size(extended)
FROM test_toast_columnar;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('test_toast_columnar');
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
DROP TABLE test_toast_row;
@ -128,6 +136,10 @@ INSERT INTO zero_col_heap SELECT * FROM zero_col_heap;
INSERT INTO zero_col SELECT * FROM zero_col_heap;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('zero_col');
SELECT relname, stripe_num, chunk_group_count, row_count FROM columnar.stripe a, pg_class b
WHERE columnar_test_helpers.columnar_relation_storageid(b.oid)=a.storage_id AND relname = 'zero_col'
ORDER BY 1,2,3,4;

View File

@ -13,6 +13,10 @@ INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i;
ROLLBACK;
SELECT count(*) FROM t;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
-- check that stripe metadata has also been rolled back
SELECT count(*) FROM t_stripes;
@ -28,6 +32,11 @@ INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i;
SELECT count(*) FROM t; -- force flush
SAVEPOINT s1;
INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
SELECT count(*) FROM t;
ROLLBACK TO SAVEPOINT s1;
SELECT count(*) FROM t;
@ -36,6 +45,10 @@ SELECT count(*) FROM t;
INSERT INTO t SELECT i, i+1 FROM generate_series(1, 10) i;
COMMIT;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
SELECT count(*) FROM t;
SELECT count(*) FROM t_stripes;

View File

@ -5,6 +5,17 @@ CREATE FUNCTION columnar_relation_storageid(relid oid) RETURNS bigint
LANGUAGE C STABLE STRICT
AS 'citus', $$columnar_relation_storageid$$;
CREATE OR REPLACE FUNCTION columnar_storage_info(
rel regclass,
version_major OUT int4,
version_minor OUT int4,
storage_id OUT int8,
reserved_stripe_id OUT int8,
reserved_row_number OUT int8,
reserved_offset OUT int8)
STRICT
LANGUAGE c AS 'citus', $$columnar_storage_info$$;
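Since every result is an OUT parameter, the helper can be called either as a whole record or expanded into columns; both styles appear later in these tests (the table name below is just a placeholder):
-- whole record:      SELECT columnar_storage_info('t_compressed');
-- expanded columns:  SELECT (columnar_storage_info('t_compressed')).*;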
CREATE FUNCTION compression_type_supported(type text) RETURNS boolean
AS $$
BEGIN

View File

@ -27,8 +27,16 @@ SELECT * FROM columnar_truncate_test;
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('columnar_truncate_test');
TRUNCATE TABLE columnar_truncate_test;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('columnar_truncate_test');
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
SELECT * FROM columnar_truncate_test;

View File

@ -17,6 +17,10 @@ INSERT INTO t SELECT i, i * i FROM generate_series(21, 30) i;
SELECT sum(a), sum(b) FROM t;
SELECT count(*) FROM t_stripes;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
-- vacuum full should merge stripes together
VACUUM FULL t;
@ -25,6 +29,10 @@ SELECT * FROM columnar_test_helpers.chunk_group_consistency;
SELECT sum(a), sum(b) FROM t;
SELECT count(*) FROM t_stripes;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
-- test the case when all data cannot fit into a single stripe
SELECT alter_columnar_table_set('t', stripe_row_limit => 1000);
INSERT INTO t SELECT i, 2 * i FROM generate_series(1,2500) i;
@ -34,6 +42,10 @@ SELECT count(*) FROM t_stripes;
VACUUM FULL t;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
SELECT * FROM columnar_test_helpers.chunk_group_consistency;
SELECT sum(a), sum(b) FROM t;
@ -95,6 +107,9 @@ INSERT INTO t SELECT i / 5 FROM generate_series(1, 1500) i;
COMMIT;
VACUUM VERBOSE t;
select
version_major, version_minor, reserved_stripe_id, reserved_row_number, reserved_offset
from columnar_test_helpers.columnar_storage_info('t');
SELECT * FROM columnar_test_helpers.chunk_group_consistency;

View File

@ -0,0 +1,73 @@
\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"`
SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND
substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0
AS upgrade_test_old_citus_version_ge_10_0;
\gset
\if :upgrade_test_old_citus_version_ge_10_0
\else
\q
\endif
-- it's not best practice to define this here, but we don't want to include
-- columnar_test_helpers in the upgrade test schedule
CREATE OR REPLACE FUNCTION columnar_storage_info(
rel regclass,
version_major OUT int4,
version_minor OUT int4,
storage_id OUT int8,
reserved_stripe_id OUT int8,
reserved_row_number OUT int8,
reserved_offset OUT int8)
STRICT
LANGUAGE c AS 'citus', 'columnar_storage_info';
CREATE VIEW columnar_table_stripe_info AS
SELECT columnar_table_storageids.relname relname,
columnar.stripe.stripe_num stripe_num,
columnar.stripe.row_count row_count,
columnar.stripe.first_row_number first_row_number
FROM columnar.stripe,
(
SELECT c.oid relid, c.relname relname, (columnar_storage_info(c.oid)).storage_id relstorageid
FROM pg_class c, pg_am a
WHERE c.relam = a.oid AND amname = 'columnar'
) columnar_table_storageids
WHERE relstorageid = columnar.stripe.storage_id;
SET search_path TO upgrade_columnar_metapage, public;
-- show that first_row_number values are equal to MAX(row_count) * (stripe_num - 1) + COLUMNAR_FIRST_ROW_NUMBER
SELECT * FROM columnar_table_stripe_info ORDER BY relname, stripe_num;
-- should work since we upgrade metapages when upgrading schema version
INSERT INTO columnar_table_1 VALUES (3);
-- state of stripe metadata for columnar_table_1 after post-upgrade insert
SELECT * FROM columnar_table_stripe_info WHERE relname = 'columnar_table_1' ORDER BY stripe_num;
-- show that all columnar relations' metapages are upgraded to "2.0"
SELECT count(*)=0
FROM (SELECT (columnar_storage_info(c.oid)).*
FROM pg_class c, pg_am a
WHERE c.relam = a.oid AND amname = 'columnar') t
WHERE t.version_major != 2 and t.version_minor != 0;
-- print metapage for two of the tables
SELECT columnar_storage_info('columnar_table_1');
SELECT columnar_storage_info('columnar_table_2');
-- show that no_data_columnar_table also has a metapage after upgrade
SELECT columnar_storage_info('no_data_columnar_table');
-- table is already upgraded, make sure that upgrade_columnar_storage is a no-op
SELECT citus_internal.upgrade_columnar_storage(c.oid)
FROM pg_class c, pg_am a
WHERE c.relam = a.oid AND amname = 'columnar' and relname = 'columnar_table_2';
SELECT columnar_storage_info('columnar_table_2');
VACUUM FULL columnar_table_2;
-- print metapage and stripe metadata after post-upgrade vacuum full
SELECT columnar_storage_info('columnar_table_2');
SELECT * FROM columnar_table_stripe_info WHERE relname = 'columnar_table_2' ORDER BY stripe_num;

View File

@ -0,0 +1,26 @@
\set upgrade_test_old_citus_version `echo "$upgrade_test_old_citus_version"`
SELECT substring(:'upgrade_test_old_citus_version', 'v(\d+)\.\d+\.\d+')::int >= 10 AND
substring(:'upgrade_test_old_citus_version', 'v\d+\.(\d+)\.\d+')::int >= 0
AS upgrade_test_old_citus_version_ge_10_0;
\gset
\if :upgrade_test_old_citus_version_ge_10_0
\else
\q
\endif
CREATE SCHEMA upgrade_columnar_metapage;
SET search_path TO upgrade_columnar_metapage, public;
CREATE TABLE columnar_table_1(a INT, b INT) USING columnar;
INSERT INTO columnar_table_1 SELECT i FROM generate_series(160001, 320000) i;
CREATE TABLE columnar_table_2(b INT) USING columnar;
SELECT alter_columnar_table_set('columnar_table_2',
chunk_group_row_limit => 100,
stripe_row_limit => 1000);
INSERT INTO columnar_table_2 SELECT i FROM generate_series(1600, 3500) i;
CREATE TABLE columnar_table_3(b INT) USING columnar;
INSERT INTO columnar_table_3 VALUES (1), (2);
CREATE TABLE no_data_columnar_table(a INT, b INT, c TEXT) USING columnar;
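A hedged sketch of the stripe layout these inserts should produce, assuming the default stripe_row_limit of 150000 where none is set (columnar_table_2 matches the expected output shown earlier):
-- columnar_table_1: 160000 rows                          -> stripes of 150000 and 10000 rows
-- columnar_table_2: 1901 rows, stripe_row_limit => 1000  -> stripes of 1000 and 901 rows
-- columnar_table_3: 2 rows                               -> a single stripe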