Use long-lasting mem cxt & optimize correlated index scan

2021-07-05 12:46:39 +03:00 · 2021-07-05 12:46:39 +03:00 · 83f5d42365
parent c021b82a43
commit 83f5d42365
5 changed files with 356 additions and 53 deletions
--- a/src/backend/columnar/columnar_reader.c
+++ b/src/backend/columnar/columnar_reader.c
@ -85,10 +85,14 @@ struct ColumnarReadState
 /* static function declarations */
 static MemoryContext CreateStripeReadMemoryContext(void);
-static void ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
+static bool ColumnarReadIsCurrentStripe(ColumnarReadState *readState,
-									 StripeMetadata *stripeMetadata,
+										uint64 rowNumber);
 static StripeMetadata * ColumnarReadGetCurrentStripe(ColumnarReadState *readState);
 static void ReadStripeRowByRowNumber(ColumnarReadState *readState,
 									 uint64 rowNumber, Datum *columnValues,
 									 bool *columnNulls);
 static bool StripeReadIsCurrentChunkGroup(StripeReadState *stripeReadState,
 										  int chunkGroupIndex);
 static void ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState,
 										 StripeMetadata *stripeMetadata,
 										 uint64 stripeRowOffset, Datum *columnValues,
@ -246,36 +250,76 @@ ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *col
 * exists, then returns false.
 */
 bool
-ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
+ColumnarReadRowByRowNumber(ColumnarReadState *readState,
-						   List *neededColumnList, Datum *columnValues,
+						   uint64 rowNumber, Datum *columnValues,
 						   bool *columnNulls, Snapshot snapshot)
 {
-	StripeMetadata *stripeMetadata = FindStripeByRowNumber(relation, rowNumber, snapshot);
+	if (!ColumnarReadIsCurrentStripe(readState, rowNumber))
-	if (stripeMetadata == NULL)
+	{
 		Relation columnarRelation = readState->relation;
 		StripeMetadata *stripeMetadata = FindStripeByRowNumber(columnarRelation,
 															   rowNumber, snapshot);
 		if (stripeMetadata == NULL)
 		{
 			/* no such row exists */
 			return false;
 		}
 		/* do the cleanup before reading a new stripe */
 		ColumnarResetRead(readState);
 		TupleDesc relationTupleDesc = RelationGetDescr(columnarRelation);
 		List *whereClauseList = NIL;
 		List *whereClauseVars = NIL;
 		MemoryContext stripeReadContext = readState->stripeReadContext;
 		readState->stripeReadState = BeginStripeRead(stripeMetadata,
 													 columnarRelation,
 													 relationTupleDesc,
 													 readState->projectedColumnList,
 													 whereClauseList,
 													 whereClauseVars,
 													 stripeReadContext);
 		readState->currentStripeMetadata = stripeMetadata;
 	}
 	ReadStripeRowByRowNumber(readState, rowNumber, columnValues, columnNulls);
 	return true;
 }
 /*
 * ColumnarReadIsCurrentStripe returns true if stripe being read contains
 * row with given rowNumber.
 */
 static bool
 ColumnarReadIsCurrentStripe(ColumnarReadState *readState, uint64 rowNumber)
 {
 	if (!StripeReadInProgress(readState))
 	{
 		/* no stripe read is in progress, so there is no current stripe */
 		return false;
 	}
-	TupleDesc relationTupleDesc = RelationGetDescr(relation);
+	StripeMetadata *currentStripeMetadata = readState->currentStripeMetadata;
-	List *whereClauseList = NIL;
+	if (rowNumber >= currentStripeMetadata->firstRowNumber &&
-	List *whereClauseVars = NIL;
+		rowNumber <= StripeGetHighestRowNumber(currentStripeMetadata))
-	MemoryContext stripeReadContext = CreateStripeReadMemoryContext();
+	{
-	StripeReadState *stripeReadState = BeginStripeRead(stripeMetadata,
+		return true;
-													   relation,
+	}
 													   relationTupleDesc,
 													   neededColumnList,
 													   whereClauseList,
 													   whereClauseVars,
 													   stripeReadContext);
-	ReadStripeRowByRowNumber(stripeReadState, stripeMetadata, rowNumber,
+	return false;
-							 columnValues, columnNulls);
+}
 	EndStripeRead(stripeReadState);
 	MemoryContextReset(stripeReadContext);
-	return true;
+/*
 * ColumnarReadGetCurrentStripe returns StripeMetadata for the stripe that is
 * being read.
 */
 static StripeMetadata *
 ColumnarReadGetCurrentStripe(ColumnarReadState *readState)
 {
 	/*
 	 * Plain accessor for the cached stripe metadata. The field is set to
 	 * NULL by ColumnarResetRead() when it clears an in-progress stripe read,
 	 * so the returned pointer can be NULL.
 	 */
 	return readState->currentStripeMetadata;
 }
@ -285,11 +329,13 @@ ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
 * Errors out if no such row exists in the stripe being read.
 */
 static void
-ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
+ReadStripeRowByRowNumber(ColumnarReadState *readState,
 						 StripeMetadata *stripeMetadata,
 						 uint64 rowNumber, Datum *columnValues,
 						 bool *columnNulls)
 {
 	StripeMetadata *stripeMetadata = ColumnarReadGetCurrentStripe(readState);
 	StripeReadState *stripeReadState = readState->stripeReadState;
 	if (rowNumber < stripeMetadata->firstRowNumber)
 	{
 		/* not expected but be on the safe side */
@ -298,21 +344,42 @@ ReadStripeRowByRowNumber(StripeReadState *stripeReadState,
 	/* find the exact chunk group to be read */
 	uint64 stripeRowOffset = rowNumber - stripeMetadata->firstRowNumber;
-	stripeReadState->chunkGroupIndex = stripeRowOffset /
+	int chunkGroupIndex = stripeRowOffset / stripeMetadata->chunkGroupRowCount;
-									   stripeMetadata->chunkGroupRowCount;
+	if (!StripeReadIsCurrentChunkGroup(stripeReadState, chunkGroupIndex))
-	stripeReadState->chunkGroupReadState = BeginChunkGroupRead(
+	{
-		stripeReadState->stripeBuffers,
+		if (stripeReadState->chunkGroupReadState)
-		stripeReadState->chunkGroupIndex,
+		{
-		stripeReadState->tupleDescriptor,
+			EndChunkGroupRead(stripeReadState->chunkGroupReadState);
-		stripeReadState->projectedColumnList,
+		}
-		stripeReadState->stripeReadContext);
+
 		stripeReadState->chunkGroupIndex = chunkGroupIndex;
 		stripeReadState->chunkGroupReadState = BeginChunkGroupRead(
 			stripeReadState->stripeBuffers,
 			stripeReadState->chunkGroupIndex,
 			stripeReadState->tupleDescriptor,
 			stripeReadState->projectedColumnList,
 			stripeReadState->stripeReadContext);
 	}
 	ReadChunkGroupRowByRowOffset(stripeReadState->chunkGroupReadState,
 								 stripeMetadata, stripeRowOffset,
 								 columnValues, columnNulls);
 }
-	EndChunkGroupRead(stripeReadState->chunkGroupReadState);
+
-	stripeReadState->chunkGroupReadState = NULL;
+/*
 * StripeReadIsCurrentChunkGroup returns true if the chunk group being read
 * has the given chunkGroupIndex in its stripe.
 */
 static bool
 StripeReadIsCurrentChunkGroup(StripeReadState *stripeReadState, int chunkGroupIndex)
 {
 	/* no chunk group is being read at the moment, so none can match */
 	if (!stripeReadState->chunkGroupReadState)
 	{
 		return false;
 	}
 	/* a chunk group is being read; it is the requested one iff the indexes match */
 	return (stripeReadState->chunkGroupIndex == chunkGroupIndex);
 }
@ -390,6 +457,24 @@ ColumnarEndRead(ColumnarReadState *readState)
 }
 /*
 * ColumnarResetRead resets the stripe and the chunk group that is
 * being read currently (if any).
 */
 void
 ColumnarResetRead(ColumnarReadState *readState)
 {
 	/* nothing to clean up unless a stripe read is in progress */
 	if (StripeReadInProgress(readState))
 	{
 		/* release the current stripe's metadata and forget the pointer */
 		pfree(readState->currentStripeMetadata);
 		readState->currentStripeMetadata = NULL;
 		/*
 		 * Forget the stripe read state and reset the stripe read context.
 		 * NOTE(review): the stripe read state is presumably allocated in
 		 * stripeReadContext (that context is what gets passed to
 		 * BeginStripeRead), so the reset is what reclaims it -- confirm.
 		 */
 		readState->stripeReadState = NULL;
 		MemoryContextReset(readState->stripeReadContext);
 	}
 }
 /*
 * BeginStripeRead allocates state for reading a stripe.
 */
--- a/src/backend/columnar/columnar_tableam.c
+++ b/src/backend/columnar/columnar_tableam.c
@ -87,6 +87,25 @@ typedef struct ColumnarScanDescData
 typedef struct ColumnarScanDescData *ColumnarScanDesc;
 /*
 * IndexFetchColumnarData is the scan state passed between index_fetch_begin,
 * index_fetch_reset, index_fetch_end, index_fetch_tuple calls.
 */
 typedef struct IndexFetchColumnarData
 {
 	/*
 	 * Base index-fetch state. columnar_index_fetch_begin returns a pointer to
 	 * this member, and the other index_fetch callbacks cast that pointer back
 	 * to IndexFetchColumnarData, so it has to stay the first member.
 	 */
 	IndexFetchTableData cs_base;
 	/* columnar read state; NULL until the first columnar_index_fetch_tuple call */
 	ColumnarReadState *cs_readState;
 	/*
 	 * We initialize cs_readState lazily in the first columnar_index_fetch_tuple
 	 * call. However, we want to do memory allocations in a sub MemoryContext of
 	 * columnar_index_fetch_begin. For this reason, we store scanContext in
 	 * columnar_index_fetch_begin.
 	 */
 	MemoryContext scanContext;
 } IndexFetchColumnarData;
 static object_access_hook_type PrevObjectAccessHook = NULL;
 static ProcessUtility_hook_type PrevProcessUtilityHook = NULL;
@ -409,29 +428,43 @@ columnar_index_fetch_begin(Relation rel)
 	FlushWriteStateForRelfilenode(relfilenode, GetCurrentSubTransactionId());
-	IndexFetchTableData *scan = palloc0(sizeof(IndexFetchTableData));
+	MemoryContext scanContext = CreateColumnarScanMemoryContext();
-	scan->rel = rel;
+	MemoryContext oldContext = MemoryContextSwitchTo(scanContext);
-	return scan;
+
 	IndexFetchColumnarData *scan = palloc0(sizeof(IndexFetchColumnarData));
 	scan->cs_base.rel = rel;
 	scan->cs_readState = NULL;
 	scan->scanContext = scanContext;
 	MemoryContextSwitchTo(oldContext);
 	return &scan->cs_base;
 }
 static void
-columnar_index_fetch_reset(IndexFetchTableData *scan)
+columnar_index_fetch_reset(IndexFetchTableData *sscan)
 {
 	/* no-op */
 }
 static void
-columnar_index_fetch_end(IndexFetchTableData *scan)
+columnar_index_fetch_end(IndexFetchTableData *sscan)
 {
-	columnar_index_fetch_reset(scan);
+	columnar_index_fetch_reset(sscan);
-	pfree(scan);
+
 	IndexFetchColumnarData *scan = (IndexFetchColumnarData *) sscan;
 	if (scan->cs_readState)
 	{
 		ColumnarEndRead(scan->cs_readState);
 		scan->cs_readState = NULL;
 	}
 }
 static bool
-columnar_index_fetch_tuple(struct IndexFetchTableData *scan,
+columnar_index_fetch_tuple(struct IndexFetchTableData *sscan,
 						   ItemPointer tid,
 						   Snapshot snapshot,
 						   TupleTableSlot *slot,
@ -451,19 +484,35 @@ columnar_index_fetch_tuple(struct IndexFetchTableData *scan,
 	ExecClearTuple(slot);
-	/* we need all columns */
+	IndexFetchColumnarData *scan = (IndexFetchColumnarData *) sscan;
-	int natts = scan->rel->rd_att->natts;
+	Relation columnarRelation = scan->cs_base.rel;
-	Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1);
+
-	TupleDesc relationTupleDesc = RelationGetDescr(scan->rel);
+	/* initialize read state for the first row */
-	List *relationColumnList = NeededColumnsList(relationTupleDesc, attr_needed);
+	if (scan->cs_readState == NULL)
 	{
 		MemoryContext oldContext = MemoryContextSwitchTo(scan->scanContext);
 		/* we need all columns */
 		int natts = columnarRelation->rd_att->natts;
 		Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1);
 		/* no quals for index scan */
 		List *scanQual = NIL;
 		scan->cs_readState = init_columnar_read_state(columnarRelation,
 													  slot->tts_tupleDescriptor,
 													  attr_needed, scanQual);
 		MemoryContextSwitchTo(oldContext);
 	}
 	uint64 rowNumber = tid_to_row_number(*tid);
-	if (!ColumnarReadRowByRowNumber(scan->rel, rowNumber, relationColumnList,
+	if (!ColumnarReadRowByRowNumber(scan->cs_readState, rowNumber, slot->tts_values,
-									slot->tts_values, slot->tts_isnull, snapshot))
+									slot->tts_isnull, snapshot))
 	{
 		return false;
 	}
-	slot->tts_tableOid = RelationGetRelid(scan->rel);
+	slot->tts_tableOid = RelationGetRelid(columnarRelation);
 	slot->tts_tid = *tid;
 	ExecStoreVirtualTuple(slot);
--- a/src/include/columnar/columnar.h
+++ b/src/include/columnar/columnar.h
@ -216,10 +216,11 @@ extern ColumnarReadState * ColumnarBeginRead(Relation relation,
 extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues,
 								bool *columnNulls, uint64 *rowNumber);
 extern void ColumnarRescan(ColumnarReadState *readState);
-extern bool ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber,
+extern bool ColumnarReadRowByRowNumber(ColumnarReadState *readState,
-									   List *neededColumnList, Datum *columnValues,
+									   uint64 rowNumber, Datum *columnValues,
 									   bool *columnNulls, Snapshot snapshot);
 extern void ColumnarEndRead(ColumnarReadState *state);
 extern void ColumnarResetRead(ColumnarReadState *readState);
 extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state);
 /* Function declarations for common functions */
--- a/src/test/regress/expected/columnar_indexes.out
+++ b/src/test/regress/expected/columnar_indexes.out
@ -52,6 +52,7 @@ SELECT * FROM t;
 -- make sure that we test index scan
 set columnar.enable_custom_scan to 'off';
 set enable_seqscan to off;
 set seq_page_cost TO 10000000;
 CREATE table columnar_table (a INT, b int) USING columnar;
 INSERT INTO columnar_table (a) VALUES (1), (1);
 CREATE UNIQUE INDEX CONCURRENTLY ON columnar_table (a);
@ -234,6 +235,28 @@ SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
 t
 (1 row)
 -- Since our index is highly correlated with the relation itself, we should
 -- de-serialize each chunk group only once. For this reason, if this test
 -- file hangs on below queries, then you should think that we are not properly
 -- caching the last-read chunk group during index reads.
 SELECT SUM(a)=312487500 FROM columnar_table WHERE a < 25000;
 ?column?
 ---------------------------------------------------------------------
 t
 (1 row)
 SELECT SUM(a)=167000 FROM columnar_table WHERE a = 16000 OR a = 151000;
 ?column?
 ---------------------------------------------------------------------
 t
 (1 row)
 SELECT SUM(a)=48000 FROM columnar_table WHERE a = 16000 OR a = 32000;
 ?column?
 ---------------------------------------------------------------------
 t
 (1 row)
 TRUNCATE columnar_table;
 ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
 -- hash --
@ -435,5 +458,80 @@ NOTICE:  falling back to serial index build since parallel scan on columnar tabl
 REINDEX TABLE CONCURRENTLY parallel_scan_test;
 NOTICE:  falling back to serial index build since parallel scan on columnar tables is not supported
 NOTICE:  falling back to serial index build since parallel scan on columnar tables is not supported
 -- test with different data types & indexAM's --
 CREATE TABLE hash_text(a INT, b TEXT) USING columnar;
 INSERT INTO hash_text SELECT i, (i*2)::TEXT FROM generate_series(1, 10) i;
 CREATE INDEX ON hash_text USING hash (b);
 SELECT b FROM hash_text WHERE b='10';
 b
 ---------------------------------------------------------------------
 10
 (1 row)
 CREATE TABLE hash_int(a INT, b TEXT) USING columnar;
 INSERT INTO hash_int SELECT i, (i*3)::TEXT FROM generate_series(1, 10) i;
 CREATE INDEX ON hash_int USING hash (a);
 SELECT b='15' FROM hash_int WHERE a=5;
 ?column?
 ---------------------------------------------------------------------
 t
 (1 row)
 CREATE TABLE mixed_data_types (
  timestamp_col timestamp,
  box_col box,
  circle_col circle,
  float_col float,
  uuid_col uuid,
  text_col text,
  numeric_col numeric,
  PRIMARY KEY(timestamp_col, text_col)
 ) USING columnar;
 INSERT INTO mixed_data_types
 SELECT
  to_timestamp(i+36000),
  box(point(i, i+90)),
  circle(point(i*2, i*3), i*100),
  (i*1.2)::float,
  uuid_in(md5((i*10)::text || (i*15)::text)::cstring),
  (i*8)::text,
  (i*42)::numeric
 FROM generate_series(1, 10) i;
 SELECT text_col='64'
 FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:08';
 ?column?
 ---------------------------------------------------------------------
 t
 (1 row)
 SELECT uuid_col='298923c8-1900-45e9-1288-b430794814c4'
 FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:01';
 ?column?
 ---------------------------------------------------------------------
 t
 (1 row)
 CREATE INDEX hash_uuid ON mixed_data_types USING hash(uuid_col);
 SELECT box_col=box(point(1, 91)) AND timestamp_col='1970-01-01 02:00:01'
 FROM mixed_data_types WHERE uuid_col='298923c8-1900-45e9-1288-b430794814c4';
 ?column?
 ---------------------------------------------------------------------
 t
 (1 row)
 DROP INDEX hash_uuid;
 CREATE INDEX btree_multi_numeric_text_timestamp
 ON mixed_data_types (numeric_col, text_col, timestamp_col);
 SELECT uuid_col='ab2481c9-f93d-0ed3-033a-3281d865ccb2'
 FROM mixed_data_types
 WHERE
  numeric_col >= 120 AND numeric_col <= 220 AND
  circle_col >= circle(point(7, 7), 350) AND
  float_col <= 5.0;
 ?column?
 ---------------------------------------------------------------------
 t
 (1 row)
 SET client_min_messages TO WARNING;
 DROP SCHEMA columnar_indexes CASCADE;
--- a/src/test/regress/sql/columnar_indexes.sql
+++ b/src/test/regress/sql/columnar_indexes.sql
@ -25,6 +25,7 @@ SELECT * FROM t;
 -- make sure that we test index scan
 set columnar.enable_custom_scan to 'off';
 set enable_seqscan to off;
 set seq_page_cost TO 10000000;
 CREATE table columnar_table (a INT, b int) USING columnar;
@ -158,6 +159,14 @@ TRUNCATE columnar_table;
 INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i;
 SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
 -- Since our index is highly correlated with the relation itself, we should
 -- de-serialize each chunk group only once. For this reason, if this test
 -- file hangs on below queries, then you should think that we are not properly
 -- caching the last-read chunk group during index reads.
 SELECT SUM(a)=312487500 FROM columnar_table WHERE a < 25000;
 SELECT SUM(a)=167000 FROM columnar_table WHERE a = 16000 OR a = 151000;
 SELECT SUM(a)=48000 FROM columnar_table WHERE a = 16000 OR a = 32000;
 TRUNCATE columnar_table;
 ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
@ -321,5 +330,66 @@ REINDEX TABLE parallel_scan_test;
 CREATE INDEX CONCURRENTLY ON parallel_scan_test (a);
 REINDEX TABLE CONCURRENTLY parallel_scan_test;
 -- test with different data types & indexAM's --
 CREATE TABLE hash_text(a INT, b TEXT) USING columnar;
 INSERT INTO hash_text SELECT i, (i*2)::TEXT FROM generate_series(1, 10) i;
 CREATE INDEX ON hash_text USING hash (b);
 SELECT b FROM hash_text WHERE b='10';
 CREATE TABLE hash_int(a INT, b TEXT) USING columnar;
 INSERT INTO hash_int SELECT i, (i*3)::TEXT FROM generate_series(1, 10) i;
 CREATE INDEX ON hash_int USING hash (a);
 SELECT b='15' FROM hash_int WHERE a=5;
 CREATE TABLE mixed_data_types (
  timestamp_col timestamp,
  box_col box,
  circle_col circle,
  float_col float,
  uuid_col uuid,
  text_col text,
  numeric_col numeric,
  PRIMARY KEY(timestamp_col, text_col)
 ) USING columnar;
 INSERT INTO mixed_data_types
 SELECT
  to_timestamp(i+36000),
  box(point(i, i+90)),
  circle(point(i*2, i*3), i*100),
  (i*1.2)::float,
  uuid_in(md5((i*10)::text || (i*15)::text)::cstring),
  (i*8)::text,
  (i*42)::numeric
 FROM generate_series(1, 10) i;
 SELECT text_col='64'
 FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:08';
 SELECT uuid_col='298923c8-1900-45e9-1288-b430794814c4'
 FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:01';
 CREATE INDEX hash_uuid ON mixed_data_types USING hash(uuid_col);
 SELECT box_col=box(point(1, 91)) AND timestamp_col='1970-01-01 02:00:01'
 FROM mixed_data_types WHERE uuid_col='298923c8-1900-45e9-1288-b430794814c4';
 DROP INDEX hash_uuid;
 CREATE INDEX btree_multi_numeric_text_timestamp
 ON mixed_data_types (numeric_col, text_col, timestamp_col);
 SELECT uuid_col='ab2481c9-f93d-0ed3-033a-3281d865ccb2'
 FROM mixed_data_types
 WHERE
  numeric_col >= 120 AND numeric_col <= 220 AND
  circle_col >= circle(point(7, 7), 350) AND
  float_col <= 5.0;
 SET client_min_messages TO WARNING;
 DROP SCHEMA columnar_indexes CASCADE;