From 83f5d42365cdd417aaad03e486c1c89f88a0384c Mon Sep 17 00:00:00 2001
From: Onur Tirtir <onurcantirtir@gmail.com>
Date: Mon, 5 Jul 2021 12:46:39 +0300
Subject: [PATCH] Use long-lasting mem cxt & optimize correlated index scan

---
 src/backend/columnar/columnar_reader.c        | 155 ++++++++++++++----
 src/backend/columnar/columnar_tableam.c       |  81 +++++++--
 src/include/columnar/columnar.h               |   5 +-
 .../regress/expected/columnar_indexes.out     |  98 +++++++++++
 src/test/regress/sql/columnar_indexes.sql     |  70 ++++++++
 5 files changed, 356 insertions(+), 53 deletions(-)

diff --git a/src/backend/columnar/columnar_reader.c b/src/backend/columnar/columnar_reader.c
index 685e3163d..53725ad45 100644
--- a/src/backend/columnar/columnar_reader.c
+++ b/src/backend/columnar/columnar_reader.c
@@ -85,10 +85,14 @@ struct ColumnarReadState
 
 /* static function declarations */
 static MemoryContext CreateStripeReadMemoryContext(void);
-static void ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
-									 StripeMetadata *stripeMetadata,
+static bool ColumnarReadIsCurrentStripe(ColumnarReadState *readState,
+										uint64 rowNumber);
+static StripeMetadata * ColumnarReadGetCurrentStripe(ColumnarReadState *readState);
+static void ReadStripeRowByRowNumber(ColumnarReadState *readState,
 									 uint64 rowNumber, Datum *columnValues,
 									 bool *columnNulls);
+static bool StripeReadIsCurrentChunkGroup(StripeReadState *stripeReadState,
+										  int chunkGroupIndex);
 static void ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState,
 										 StripeMetadata *stripeMetadata,
 										 uint64 stripeRowOffset, Datum *columnValues,
@@ -246,36 +250,76 @@ ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *col
  * exists, then returns false.
  */
 bool
-ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
-						   List *neededColumnList, Datum *columnValues,
+ColumnarReadRowByRowNumber(ColumnarReadState *readState,
+						   uint64 rowNumber, Datum *columnValues,
 						   bool *columnNulls, Snapshot snapshot)
 {
-	StripeMetadata *stripeMetadata = FindStripeByRowNumber(relation, rowNumber, snapshot);
-	if (stripeMetadata == NULL)
+	if (!ColumnarReadIsCurrentStripe(readState, rowNumber))
+	{
+		Relation columnarRelation = readState->relation;
+		StripeMetadata *stripeMetadata = FindStripeByRowNumber(columnarRelation,
+															   rowNumber, snapshot);
+		if (stripeMetadata == NULL)
+		{
+			/* no such row exists */
+			return false;
+		}
+
+		/* do the cleanup before reading a new stripe */
+		ColumnarResetRead(readState);
+
+		TupleDesc relationTupleDesc = RelationGetDescr(columnarRelation);
+		List *whereClauseList = NIL;
+		List *whereClauseVars = NIL;
+		MemoryContext stripeReadContext = readState->stripeReadContext;
+		readState->stripeReadState = BeginStripeRead(stripeMetadata,
+													 columnarRelation,
+													 relationTupleDesc,
+													 readState->projectedColumnList,
+													 whereClauseList,
+													 whereClauseVars,
+													 stripeReadContext);
+
+		readState->currentStripeMetadata = stripeMetadata;
+	}
+
+	ReadStripeRowByRowNumber(readState, rowNumber, columnValues, columnNulls);
+
+	return true;
+}
+
+
+/*
+ * ColumnarReadIsCurrentStripe returns true if stripe being read contains
+ * row with given rowNumber.
+ */
+static bool
+ColumnarReadIsCurrentStripe(ColumnarReadState *readState, uint64 rowNumber)
+{
+	if (!StripeReadInProgress(readState))
 	{
-		/* no such row exists */
 		return false;
 	}
 
-	TupleDesc relationTupleDesc = RelationGetDescr(relation);
-	List *whereClauseList = NIL;
-	List *whereClauseVars = NIL;
-	MemoryContext stripeReadContext = CreateStripeReadMemoryContext();
-	StripeReadState *stripeReadState = BeginStripeRead(stripeMetadata,
-													   relation,
-													   relationTupleDesc,
-													   neededColumnList,
-													   whereClauseList,
-													   whereClauseVars,
-													   stripeReadContext);
+	StripeMetadata *currentStripeMetadata = readState->currentStripeMetadata;
+	if (rowNumber >= currentStripeMetadata->firstRowNumber &&
+		rowNumber <= StripeGetHighestRowNumber(currentStripeMetadata))
+	{
+		return true;
+	}
 
-	ReadStripeRowByRowNumber(stripeReadState, stripeMetadata, rowNumber,
-							 columnValues, columnNulls);
+	return false;
+}
 
-	EndStripeRead(stripeReadState);
-	MemoryContextReset(stripeReadContext);
 
-	return true;
+/*
+ * ColumnarReadGetCurrentStripe returns StripeMetadata for the stripe that is
+ * being read.
+ */
+static StripeMetadata *
+ColumnarReadGetCurrentStripe(ColumnarReadState *readState)
+{
+	return readState->currentStripeMetadata;
 }
 
 
@@ -285,11 +329,13 @@ ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
  * Errors out if no such row exists in the stripe being read.
  */
 static void
-ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
-						 StripeMetadata *stripeMetadata,
+ReadStripeRowByRowNumber(ColumnarReadState *readState,
 						 uint64 rowNumber, Datum *columnValues,
 						 bool *columnNulls)
 {
+	StripeMetadata *stripeMetadata = ColumnarReadGetCurrentStripe(readState);
+	StripeReadState *stripeReadState = readState->stripeReadState;
+
 	if (rowNumber < stripeMetadata->firstRowNumber)
 	{
 		/* not expected but be on the safe side */
@@ -298,21 +344,42 @@ ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
 
 	/* find the exact chunk group to be read */
 	uint64 stripeRowOffset = rowNumber - stripeMetadata->firstRowNumber;
-	stripeReadState->chunkGroupIndex = stripeRowOffset /
-									   stripeMetadata->chunkGroupRowCount;
-	stripeReadState->chunkGroupReadState = BeginChunkGroupRead(
-		stripeReadState->stripeBuffers,
-		stripeReadState->chunkGroupIndex,
-		stripeReadState->tupleDescriptor,
-		stripeReadState->projectedColumnList,
-		stripeReadState->stripeReadContext);
+	int chunkGroupIndex = stripeRowOffset / stripeMetadata->chunkGroupRowCount;
+	if (!StripeReadIsCurrentChunkGroup(stripeReadState, chunkGroupIndex))
+	{
+		if (stripeReadState->chunkGroupReadState)
+		{
+			EndChunkGroupRead(stripeReadState->chunkGroupReadState);
+		}
+
+		stripeReadState->chunkGroupIndex = chunkGroupIndex;
+		stripeReadState->chunkGroupReadState = BeginChunkGroupRead(
+			stripeReadState->stripeBuffers,
+			stripeReadState->chunkGroupIndex,
+			stripeReadState->tupleDescriptor,
+			stripeReadState->projectedColumnList,
+			stripeReadState->stripeReadContext);
+	}
 
 	ReadChunkGroupRowByRowOffset(stripeReadState->chunkGroupReadState,
 								 stripeMetadata, stripeRowOffset,
 								 columnValues, columnNulls);
+}
 
-	EndChunkGroupRead(stripeReadState->chunkGroupReadState);
-	stripeReadState->chunkGroupReadState = NULL;
+
+/*
+ * StripeReadIsCurrentChunkGroup returns true if chunk group being read is
+ * the has given chunkGroupIndex in its stripe.
+ */
+static bool
+StripeReadIsCurrentChunkGroup(StripeReadState *stripeReadState, int chunkGroupIndex)
+{
+	if (!stripeReadState->chunkGroupReadState)
+	{
+		return false;
+	}
+
+	return (stripeReadState->chunkGroupIndex == chunkGroupIndex);
 }
 
 
@@ -390,6 +457,24 @@ ColumnarEndRead(ColumnarReadState *readState)
 }
 
 
+/*
+ * ColumnarResetRead resets the stripe and the chunk group that is
+ * being read currently (if any).
+ */
+void
+ColumnarResetRead(ColumnarReadState *readState)
+{
+	if (StripeReadInProgress(readState))
+	{
+		pfree(readState->currentStripeMetadata);
+		readState->currentStripeMetadata = NULL;
+
+		readState->stripeReadState = NULL;
+		MemoryContextReset(readState->stripeReadContext);
+	}
+}
+
+
 /*
  * BeginStripeRead allocates state for reading a stripe.
  */
diff --git a/src/backend/columnar/columnar_tableam.c b/src/backend/columnar/columnar_tableam.c
index 143467169..532613dd4 100644
--- a/src/backend/columnar/columnar_tableam.c
+++ b/src/backend/columnar/columnar_tableam.c
@@ -87,6 +87,25 @@ typedef struct ColumnarScanDescData
 
 typedef struct ColumnarScanDescData *ColumnarScanDesc;
 
+/*
+ * IndexFetchColumnarData is the scan state passed between index_fetch_begin,
+ * index_fetch_reset, index_fetch_end, index_fetch_tuple calls.
+ */
+typedef struct IndexFetchColumnarData
+{
+	IndexFetchTableData cs_base;
+	ColumnarReadState *cs_readState;
+
+	/*
+	 * We initialize cs_readState lazily in the first columnar_index_fetch_tuple
+	 * call. However, we want to do memory allocations in a sub MemoryContext of
+	 * columnar_index_fetch_begin. For this reason, we store scanContext in
+	 * columnar_index_fetch_begin.
+	 */
+	MemoryContext scanContext;
+} IndexFetchColumnarData;
+
+
 static object_access_hook_type PrevObjectAccessHook = NULL;
 static ProcessUtility_hook_type PrevProcessUtilityHook = NULL;
 
@@ -409,29 +428,43 @@ columnar_index_fetch_begin(Relation rel)
 
 	FlushWriteStateForRelfilenode(relfilenode, GetCurrentSubTransactionId());
 
-	IndexFetchTableData *scan = palloc0(sizeof(IndexFetchTableData));
-	scan->rel = rel;
-	return scan;
+	MemoryContext scanContext = CreateColumnarScanMemoryContext();
+	MemoryContext oldContext = MemoryContextSwitchTo(scanContext);
+
+	IndexFetchColumnarData *scan = palloc0(sizeof(IndexFetchColumnarData));
+	scan->cs_base.rel = rel;
+	scan->cs_readState = NULL;
+	scan->scanContext = scanContext;
+
+	MemoryContextSwitchTo(oldContext);
+
+	return &scan->cs_base;
 }
 
 
 static void
-columnar_index_fetch_reset(IndexFetchTableData *scan)
+columnar_index_fetch_reset(IndexFetchTableData *sscan)
 {
 	/* no-op */
 }
 
 
 static void
-columnar_index_fetch_end(IndexFetchTableData *scan)
+columnar_index_fetch_end(IndexFetchTableData *sscan)
 {
-	columnar_index_fetch_reset(scan);
-	pfree(scan);
+	columnar_index_fetch_reset(sscan);
+
+	IndexFetchColumnarData *scan = (IndexFetchColumnarData *) sscan;
+	if (scan->cs_readState)
+	{
+		ColumnarEndRead(scan->cs_readState);
+		scan->cs_readState = NULL;
+	}
 }
 
 
 static bool
-columnar_index_fetch_tuple(struct IndexFetchTableData *scan,
+columnar_index_fetch_tuple(struct IndexFetchTableData *sscan,
 						   ItemPointer tid,
 						   Snapshot snapshot,
 						   TupleTableSlot *slot,
@@ -451,19 +484,35 @@ columnar_index_fetch_tuple(struct IndexFetchTableData *scan,
 
 	ExecClearTuple(slot);
 
-	/* we need all columns */
-	int natts = scan->rel->rd_att->natts;
-	Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1);
-	TupleDesc relationTupleDesc = RelationGetDescr(scan->rel);
-	List *relationColumnList = NeededColumnsList(relationTupleDesc, attr_needed);
+	IndexFetchColumnarData *scan = (IndexFetchColumnarData *) sscan;
+	Relation columnarRelation = scan->cs_base.rel;
+
+	/* initialize read state for the first row */
+	if (scan->cs_readState == NULL)
+	{
+		MemoryContext oldContext = MemoryContextSwitchTo(scan->scanContext);
+
+		/* we need all columns */
+		int natts = columnarRelation->rd_att->natts;
+		Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1);
+
+		/* no quals for index scan */
+		List *scanQual = NIL;
+
+		scan->cs_readState = init_columnar_read_state(columnarRelation,
+													  slot->tts_tupleDescriptor,
+													  attr_needed, scanQual);
+		MemoryContextSwitchTo(oldContext);
+	}
+
 	uint64 rowNumber = tid_to_row_number(*tid);
-	if (!ColumnarReadRowByRowNumber(scan->rel, rowNumber, relationColumnList,
-									slot->tts_values, slot->tts_isnull, snapshot))
+	if (!ColumnarReadRowByRowNumber(scan->cs_readState, rowNumber, slot->tts_values,
+									slot->tts_isnull, snapshot))
 	{
 		return false;
 	}
 
-	slot->tts_tableOid = RelationGetRelid(scan->rel);
+	slot->tts_tableOid = RelationGetRelid(columnarRelation);
 	slot->tts_tid = *tid;
 	ExecStoreVirtualTuple(slot);
 
diff --git a/src/include/columnar/columnar.h b/src/include/columnar/columnar.h
index 24c96e419..eaeb9fa21 100644
--- a/src/include/columnar/columnar.h
+++ b/src/include/columnar/columnar.h
@@ -216,10 +216,11 @@ extern ColumnarReadState * ColumnarBeginRead(Relation relation,
 extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues,
 								bool *columnNulls, uint64 *rowNumber);
 extern void ColumnarRescan(ColumnarReadState *readState);
-extern bool ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
-									   List *neededColumnList, Datum *columnValues,
+extern bool ColumnarReadRowByRowNumber(ColumnarReadState *readState,
+									   uint64 rowNumber, Datum *columnValues,
 									   bool *columnNulls, Snapshot snapshot);
 extern void ColumnarEndRead(ColumnarReadState *state);
+extern void ColumnarResetRead(ColumnarReadState *readState);
 extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state);
 
 /* Function declarations for common functions */
diff --git a/src/test/regress/expected/columnar_indexes.out b/src/test/regress/expected/columnar_indexes.out
index 7b28bce21..a61133a68 100644
--- a/src/test/regress/expected/columnar_indexes.out
+++ b/src/test/regress/expected/columnar_indexes.out
@@ -52,6 +52,7 @@ SELECT * FROM t;
 -- make sure that we test index scan
 set columnar.enable_custom_scan to 'off';
 set enable_seqscan to off;
+set seq_page_cost TO 10000000;
 CREATE table columnar_table (a INT, b int) USING columnar;
 INSERT INTO columnar_table (a) VALUES (1), (1);
 CREATE UNIQUE INDEX CONCURRENTLY ON columnar_table (a);
@@ -234,6 +235,28 @@ SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
  t
 (1 row)
 
+-- Since our index is highly correlated with the relation itself, we should
+-- de-serialize each chunk group only once. For this reason, if this test
+-- file hangs on below queries, then you should think that we are not properly
+-- caching the last-read chunk group during index reads.
+SELECT SUM(a)=312487500 FROM columnar_table WHERE a < 25000;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+SELECT SUM(a)=167000 FROM columnar_table WHERE a = 16000 OR a = 151000;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+SELECT SUM(a)=48000 FROM columnar_table WHERE a = 16000 OR a = 32000;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
 TRUNCATE columnar_table;
 ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
 -- hash --
@@ -435,5 +458,80 @@ NOTICE:  falling back to serial index build since parallel scan on columnar tabl
 REINDEX TABLE CONCURRENTLY parallel_scan_test;
 NOTICE:  falling back to serial index build since parallel scan on columnar tables is not supported
 NOTICE:  falling back to serial index build since parallel scan on columnar tables is not supported
+-- test with different data types & indexAM's --
+CREATE TABLE hash_text(a INT, b TEXT) USING columnar;
+INSERT INTO hash_text SELECT i, (i*2)::TEXT FROM generate_series(1, 10) i;
+CREATE INDEX ON hash_text USING hash (b);
+SELECT b FROM hash_text WHERE b='10';
+ b
+---------------------------------------------------------------------
+ 10
+(1 row)
+
+CREATE TABLE hash_int(a INT, b TEXT) USING columnar;
+INSERT INTO hash_int SELECT i, (i*3)::TEXT FROM generate_series(1, 10) i;
+CREATE INDEX ON hash_int USING hash (a);
+SELECT b='15' FROM hash_int WHERE a=5;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+CREATE TABLE mixed_data_types (
+  timestamp_col timestamp,
+  box_col box,
+  circle_col circle,
+  float_col float,
+  uuid_col uuid,
+  text_col text,
+  numeric_col numeric,
+  PRIMARY KEY(timestamp_col, text_col)
+) USING columnar;
+INSERT INTO mixed_data_types
+SELECT
+  to_timestamp(i+36000),
+  box(point(i, i+90)),
+  circle(point(i*2, i*3), i*100),
+  (i*1.2)::float,
+  uuid_in(md5((i*10)::text || (i*15)::text)::cstring),
+  (i*8)::text,
+  (i*42)::numeric
+FROM generate_series(1, 10) i;
+SELECT text_col='64'
+FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:08';
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+SELECT uuid_col='298923c8-1900-45e9-1288-b430794814c4'
+FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:01';
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+CREATE INDEX hash_uuid ON mixed_data_types USING hash(uuid_col);
+SELECT box_col=box(point(1, 91)) AND timestamp_col='1970-01-01 02:00:01'
+FROM mixed_data_types WHERE uuid_col='298923c8-1900-45e9-1288-b430794814c4';
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
+DROP INDEX hash_uuid;
+CREATE INDEX btree_multi_numeric_text_timestamp
+ON mixed_data_types (numeric_col, text_col, timestamp_col);
+SELECT uuid_col='ab2481c9-f93d-0ed3-033a-3281d865ccb2'
+FROM mixed_data_types
+WHERE
+  numeric_col >= 120 AND numeric_col <= 220 AND
+  circle_col >= circle(point(7, 7), 350) AND
+  float_col <= 5.0;
+ ?column?
+---------------------------------------------------------------------
+ t
+(1 row)
+
 SET client_min_messages TO WARNING;
 DROP SCHEMA columnar_indexes CASCADE;
diff --git a/src/test/regress/sql/columnar_indexes.sql b/src/test/regress/sql/columnar_indexes.sql
index 1ec529872..d2caa120e 100644
--- a/src/test/regress/sql/columnar_indexes.sql
+++ b/src/test/regress/sql/columnar_indexes.sql
@@ -25,6 +25,7 @@ SELECT * FROM t;
 -- make sure that we test index scan
 set columnar.enable_custom_scan to 'off';
 set enable_seqscan to off;
+set seq_page_cost TO 10000000;
 
 CREATE table columnar_table (a INT, b int) USING columnar;
 
@@ -158,6 +159,14 @@ TRUNCATE columnar_table;
 INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i;
 SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
 
+-- Since our index is highly correlated with the relation itself, we should
+-- de-serialize each chunk group only once. For this reason, if this test
+-- file hangs on below queries, then you should think that we are not properly
+-- caching the last-read chunk group during index reads.
+SELECT SUM(a)=312487500 FROM columnar_table WHERE a < 25000;
+SELECT SUM(a)=167000 FROM columnar_table WHERE a = 16000 OR a = 151000;
+SELECT SUM(a)=48000 FROM columnar_table WHERE a = 16000 OR a = 32000;
+
 TRUNCATE columnar_table;
 ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
 
@@ -321,5 +330,66 @@ REINDEX TABLE parallel_scan_test;
 CREATE INDEX CONCURRENTLY ON parallel_scan_test (a);
 REINDEX TABLE CONCURRENTLY parallel_scan_test;
 
+-- test with different data types & indexAM's --
+
+CREATE TABLE hash_text(a INT, b TEXT) USING columnar;
+INSERT INTO hash_text SELECT i, (i*2)::TEXT FROM generate_series(1, 10) i;
+
+CREATE INDEX ON hash_text USING hash (b);
+
+SELECT b FROM hash_text WHERE b='10';
+
+CREATE TABLE hash_int(a INT, b TEXT) USING columnar;
+INSERT INTO hash_int SELECT i, (i*3)::TEXT FROM generate_series(1, 10) i;
+
+CREATE INDEX ON hash_int USING hash (a);
+
+SELECT b='15' FROM hash_int WHERE a=5;
+
+CREATE TABLE mixed_data_types (
+  timestamp_col timestamp,
+  box_col box,
+  circle_col circle,
+  float_col float,
+  uuid_col uuid,
+  text_col text,
+  numeric_col numeric,
+  PRIMARY KEY(timestamp_col, text_col)
+) USING columnar;
+
+INSERT INTO mixed_data_types
+SELECT
+  to_timestamp(i+36000),
+  box(point(i, i+90)),
+  circle(point(i*2, i*3), i*100),
+  (i*1.2)::float,
+  uuid_in(md5((i*10)::text || (i*15)::text)::cstring),
+  (i*8)::text,
+  (i*42)::numeric
+FROM generate_series(1, 10) i;
+
+SELECT text_col='64'
+FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:08';
+
+SELECT uuid_col='298923c8-1900-45e9-1288-b430794814c4'
+FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:01';
+
+CREATE INDEX hash_uuid ON mixed_data_types USING hash(uuid_col);
+
+SELECT box_col=box(point(1, 91)) AND timestamp_col='1970-01-01 02:00:01'
+FROM mixed_data_types WHERE uuid_col='298923c8-1900-45e9-1288-b430794814c4';
+
+DROP INDEX hash_uuid;
+
+CREATE INDEX btree_multi_numeric_text_timestamp
+ON mixed_data_types (numeric_col, text_col, timestamp_col);
+
+SELECT uuid_col='ab2481c9-f93d-0ed3-033a-3281d865ccb2'
+FROM mixed_data_types
+WHERE
+  numeric_col >= 120 AND numeric_col <= 220 AND
+  circle_col >= circle(point(7, 7), 350) AND
+  float_col <= 5.0;
+
 SET client_min_messages TO WARNING;
 DROP SCHEMA columnar_indexes CASCADE;