From 83f5d42365cdd417aaad03e486c1c89f88a0384c Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Mon, 5 Jul 2021 12:46:39 +0300 Subject: [PATCH] Use long-lasting mem cxt & optimize correlated index scan --- src/backend/columnar/columnar_reader.c | 155 ++++++++++++++---- src/backend/columnar/columnar_tableam.c | 81 +++++++-- src/include/columnar/columnar.h | 5 +- .../regress/expected/columnar_indexes.out | 98 +++++++++++ src/test/regress/sql/columnar_indexes.sql | 70 ++++++++ 5 files changed, 356 insertions(+), 53 deletions(-) diff --git a/src/backend/columnar/columnar_reader.c b/src/backend/columnar/columnar_reader.c index 685e3163d..53725ad45 100644 --- a/src/backend/columnar/columnar_reader.c +++ b/src/backend/columnar/columnar_reader.c @@ -85,10 +85,14 @@ struct ColumnarReadState /* static function declarations */ static MemoryContext CreateStripeReadMemoryContext(void); -static void ReadStripeRowByRowNumber(StripeReadState *stripeReadState, - StripeMetadata *stripeMetadata, +static bool ColumnarReadIsCurrentStripe(ColumnarReadState *readState, + uint64 rowNumber); +static StripeMetadata * ColumnarReadGetCurrentStripe(ColumnarReadState *readState); +static void ReadStripeRowByRowNumber(ColumnarReadState *readState, uint64 rowNumber, Datum *columnValues, bool *columnNulls); +static bool StripeReadIsCurrentChunkGroup(StripeReadState *stripeReadState, + int chunkGroupIndex); static void ReadChunkGroupRowByRowOffset(ChunkGroupReadState *chunkGroupReadState, StripeMetadata *stripeMetadata, uint64 stripeRowOffset, Datum *columnValues, @@ -246,36 +250,76 @@ ColumnarReadNextRow(ColumnarReadState *readState, Datum *columnValues, bool *col * exists, then returns false. */ bool -ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber, - List *neededColumnList, Datum *columnValues, +ColumnarReadRowByRowNumber(ColumnarReadState *readState, + uint64 rowNumber, Datum *columnValues, bool *columnNulls, Snapshot snapshot) { - StripeMetadata *stripeMetadata = FindStripeByRowNumber(relation, rowNumber, snapshot); - if (stripeMetadata == NULL) + if (!ColumnarReadIsCurrentStripe(readState, rowNumber)) + { + Relation columnarRelation = readState->relation; + StripeMetadata *stripeMetadata = FindStripeByRowNumber(columnarRelation, + rowNumber, snapshot); + if (stripeMetadata == NULL) + { + /* no such row exists */ + return false; + } + + /* do the cleanup before reading a new stripe */ + ColumnarResetRead(readState); + + TupleDesc relationTupleDesc = RelationGetDescr(columnarRelation); + List *whereClauseList = NIL; + List *whereClauseVars = NIL; + MemoryContext stripeReadContext = readState->stripeReadContext; + readState->stripeReadState = BeginStripeRead(stripeMetadata, + columnarRelation, + relationTupleDesc, + readState->projectedColumnList, + whereClauseList, + whereClauseVars, + stripeReadContext); + + readState->currentStripeMetadata = stripeMetadata; + } + + ReadStripeRowByRowNumber(readState, rowNumber, columnValues, columnNulls); + + return true; +} + + +/* + * ColumnarReadIsCurrentStripe returns true if stripe being read contains + * row with given rowNumber. + */ +static bool +ColumnarReadIsCurrentStripe(ColumnarReadState *readState, uint64 rowNumber) +{ + if (!StripeReadInProgress(readState)) { - /* no such row exists */ return false; } - TupleDesc relationTupleDesc = RelationGetDescr(relation); - List *whereClauseList = NIL; - List *whereClauseVars = NIL; - MemoryContext stripeReadContext = CreateStripeReadMemoryContext(); - StripeReadState *stripeReadState = BeginStripeRead(stripeMetadata, - relation, - relationTupleDesc, - neededColumnList, - whereClauseList, - whereClauseVars, - stripeReadContext); + StripeMetadata *currentStripeMetadata = readState->currentStripeMetadata; + if (rowNumber >= currentStripeMetadata->firstRowNumber && + rowNumber <= StripeGetHighestRowNumber(currentStripeMetadata)) + { + return true; + } - ReadStripeRowByRowNumber(stripeReadState, stripeMetadata, rowNumber, - columnValues, columnNulls); + return false; +} - EndStripeRead(stripeReadState); - MemoryContextReset(stripeReadContext); - return true; +/* + * ColumnarReadGetCurrentStripe returns StripeMetadata for the stripe that is + * being read. + */ +static StripeMetadata * +ColumnarReadGetCurrentStripe(ColumnarReadState *readState) +{ + return readState->currentStripeMetadata; } @@ -285,11 +329,13 @@ ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber, * Errors out if no such row exists in the stripe being read. */ static void -ReadStripeRowByRowNumber(StripeReadState *stripeReadState, - StripeMetadata *stripeMetadata, +ReadStripeRowByRowNumber(ColumnarReadState *readState, uint64 rowNumber, Datum *columnValues, bool *columnNulls) { + StripeMetadata *stripeMetadata = ColumnarReadGetCurrentStripe(readState); + StripeReadState *stripeReadState = readState->stripeReadState; + if (rowNumber < stripeMetadata->firstRowNumber) { /* not expected but be on the safe side */ @@ -298,21 +344,42 @@ ReadStripeRowByRowNumber(StripeReadState *stripeReadState, /* find the exact chunk group to be read */ uint64 stripeRowOffset = rowNumber - stripeMetadata->firstRowNumber; - stripeReadState->chunkGroupIndex = stripeRowOffset / - stripeMetadata->chunkGroupRowCount; - stripeReadState->chunkGroupReadState = BeginChunkGroupRead( - stripeReadState->stripeBuffers, - stripeReadState->chunkGroupIndex, - stripeReadState->tupleDescriptor, - stripeReadState->projectedColumnList, - stripeReadState->stripeReadContext); + int chunkGroupIndex = stripeRowOffset / stripeMetadata->chunkGroupRowCount; + if (!StripeReadIsCurrentChunkGroup(stripeReadState, chunkGroupIndex)) + { + if (stripeReadState->chunkGroupReadState) + { + EndChunkGroupRead(stripeReadState->chunkGroupReadState); + } + + stripeReadState->chunkGroupIndex = chunkGroupIndex; + stripeReadState->chunkGroupReadState = BeginChunkGroupRead( + stripeReadState->stripeBuffers, + stripeReadState->chunkGroupIndex, + stripeReadState->tupleDescriptor, + stripeReadState->projectedColumnList, + stripeReadState->stripeReadContext); + } ReadChunkGroupRowByRowOffset(stripeReadState->chunkGroupReadState, stripeMetadata, stripeRowOffset, columnValues, columnNulls); +} - EndChunkGroupRead(stripeReadState->chunkGroupReadState); - stripeReadState->chunkGroupReadState = NULL; + +/* + * StripeReadIsCurrentChunkGroup returns true if chunk group being read is + * the has given chunkGroupIndex in its stripe. + */ +static bool +StripeReadIsCurrentChunkGroup(StripeReadState *stripeReadState, int chunkGroupIndex) +{ + if (!stripeReadState->chunkGroupReadState) + { + return false; + } + + return (stripeReadState->chunkGroupIndex == chunkGroupIndex); } @@ -390,6 +457,24 @@ ColumnarEndRead(ColumnarReadState *readState) } +/* + * ColumnarResetRead resets the stripe and the chunk group that is + * being read currently (if any). + */ +void +ColumnarResetRead(ColumnarReadState *readState) +{ + if (StripeReadInProgress(readState)) + { + pfree(readState->currentStripeMetadata); + readState->currentStripeMetadata = NULL; + + readState->stripeReadState = NULL; + MemoryContextReset(readState->stripeReadContext); + } +} + + /* * BeginStripeRead allocates state for reading a stripe. */ diff --git a/src/backend/columnar/columnar_tableam.c b/src/backend/columnar/columnar_tableam.c index 143467169..532613dd4 100644 --- a/src/backend/columnar/columnar_tableam.c +++ b/src/backend/columnar/columnar_tableam.c @@ -87,6 +87,25 @@ typedef struct ColumnarScanDescData typedef struct ColumnarScanDescData *ColumnarScanDesc; +/* + * IndexFetchColumnarData is the scan state passed between index_fetch_begin, + * index_fetch_reset, index_fetch_end, index_fetch_tuple calls. + */ +typedef struct IndexFetchColumnarData +{ + IndexFetchTableData cs_base; + ColumnarReadState *cs_readState; + + /* + * We initialize cs_readState lazily in the first columnar_index_fetch_tuple + * call. However, we want to do memory allocations in a sub MemoryContext of + * columnar_index_fetch_begin. For this reason, we store scanContext in + * columnar_index_fetch_begin. + */ + MemoryContext scanContext; +} IndexFetchColumnarData; + + static object_access_hook_type PrevObjectAccessHook = NULL; static ProcessUtility_hook_type PrevProcessUtilityHook = NULL; @@ -409,29 +428,43 @@ columnar_index_fetch_begin(Relation rel) FlushWriteStateForRelfilenode(relfilenode, GetCurrentSubTransactionId()); - IndexFetchTableData *scan = palloc0(sizeof(IndexFetchTableData)); - scan->rel = rel; - return scan; + MemoryContext scanContext = CreateColumnarScanMemoryContext(); + MemoryContext oldContext = MemoryContextSwitchTo(scanContext); + + IndexFetchColumnarData *scan = palloc0(sizeof(IndexFetchColumnarData)); + scan->cs_base.rel = rel; + scan->cs_readState = NULL; + scan->scanContext = scanContext; + + MemoryContextSwitchTo(oldContext); + + return &scan->cs_base; } static void -columnar_index_fetch_reset(IndexFetchTableData *scan) +columnar_index_fetch_reset(IndexFetchTableData *sscan) { /* no-op */ } static void -columnar_index_fetch_end(IndexFetchTableData *scan) +columnar_index_fetch_end(IndexFetchTableData *sscan) { - columnar_index_fetch_reset(scan); - pfree(scan); + columnar_index_fetch_reset(sscan); + + IndexFetchColumnarData *scan = (IndexFetchColumnarData *) sscan; + if (scan->cs_readState) + { + ColumnarEndRead(scan->cs_readState); + scan->cs_readState = NULL; + } } static bool -columnar_index_fetch_tuple(struct IndexFetchTableData *scan, +columnar_index_fetch_tuple(struct IndexFetchTableData *sscan, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot, @@ -451,19 +484,35 @@ columnar_index_fetch_tuple(struct IndexFetchTableData *scan, ExecClearTuple(slot); - /* we need all columns */ - int natts = scan->rel->rd_att->natts; - Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1); - TupleDesc relationTupleDesc = RelationGetDescr(scan->rel); - List *relationColumnList = NeededColumnsList(relationTupleDesc, attr_needed); + IndexFetchColumnarData *scan = (IndexFetchColumnarData *) sscan; + Relation columnarRelation = scan->cs_base.rel; + + /* initialize read state for the first row */ + if (scan->cs_readState == NULL) + { + MemoryContext oldContext = MemoryContextSwitchTo(scan->scanContext); + + /* we need all columns */ + int natts = columnarRelation->rd_att->natts; + Bitmapset *attr_needed = bms_add_range(NULL, 0, natts - 1); + + /* no quals for index scan */ + List *scanQual = NIL; + + scan->cs_readState = init_columnar_read_state(columnarRelation, + slot->tts_tupleDescriptor, + attr_needed, scanQual); + MemoryContextSwitchTo(oldContext); + } + uint64 rowNumber = tid_to_row_number(*tid); - if (!ColumnarReadRowByRowNumber(scan->rel, rowNumber, relationColumnList, - slot->tts_values, slot->tts_isnull, snapshot)) + if (!ColumnarReadRowByRowNumber(scan->cs_readState, rowNumber, slot->tts_values, + slot->tts_isnull, snapshot)) { return false; } - slot->tts_tableOid = RelationGetRelid(scan->rel); + slot->tts_tableOid = RelationGetRelid(columnarRelation); slot->tts_tid = *tid; ExecStoreVirtualTuple(slot); diff --git a/src/include/columnar/columnar.h b/src/include/columnar/columnar.h index 24c96e419..eaeb9fa21 100644 --- a/src/include/columnar/columnar.h +++ b/src/include/columnar/columnar.h @@ -216,10 +216,11 @@ extern ColumnarReadState * ColumnarBeginRead(Relation relation, extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues, bool *columnNulls, uint64 *rowNumber); extern void ColumnarRescan(ColumnarReadState *readState); -extern bool ColumnarReadRowByRowNumber(Relation relation, uint64 rowNumber, - List *neededColumnList, Datum *columnValues, +extern bool ColumnarReadRowByRowNumber(ColumnarReadState *readState, + uint64 rowNumber, Datum *columnValues, bool *columnNulls, Snapshot snapshot); extern void ColumnarEndRead(ColumnarReadState *state); +extern void ColumnarResetRead(ColumnarReadState *readState); extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state); /* Function declarations for common functions */ diff --git a/src/test/regress/expected/columnar_indexes.out b/src/test/regress/expected/columnar_indexes.out index 7b28bce21..a61133a68 100644 --- a/src/test/regress/expected/columnar_indexes.out +++ b/src/test/regress/expected/columnar_indexes.out @@ -52,6 +52,7 @@ SELECT * FROM t; -- make sure that we test index scan set columnar.enable_custom_scan to 'off'; set enable_seqscan to off; +set seq_page_cost TO 10000000; CREATE table columnar_table (a INT, b int) USING columnar; INSERT INTO columnar_table (a) VALUES (1), (1); CREATE UNIQUE INDEX CONCURRENTLY ON columnar_table (a); @@ -234,6 +235,28 @@ SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000; t (1 row) +-- Since our index is highly correlated with the relation itself, we should +-- de-serialize each chunk group only once. For this reason, if this test +-- file hangs on below queries, then you should think that we are not properly +-- caching the last-read chunk group during index reads. +SELECT SUM(a)=312487500 FROM columnar_table WHERE a < 25000; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT SUM(a)=167000 FROM columnar_table WHERE a = 16000 OR a = 151000; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT SUM(a)=48000 FROM columnar_table WHERE a = 16000 OR a = 32000; + ?column? +--------------------------------------------------------------------- + t +(1 row) + TRUNCATE columnar_table; ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey; -- hash -- @@ -435,5 +458,80 @@ NOTICE: falling back to serial index build since parallel scan on columnar tabl REINDEX TABLE CONCURRENTLY parallel_scan_test; NOTICE: falling back to serial index build since parallel scan on columnar tables is not supported NOTICE: falling back to serial index build since parallel scan on columnar tables is not supported +-- test with different data types & indexAM's -- +CREATE TABLE hash_text(a INT, b TEXT) USING columnar; +INSERT INTO hash_text SELECT i, (i*2)::TEXT FROM generate_series(1, 10) i; +CREATE INDEX ON hash_text USING hash (b); +SELECT b FROM hash_text WHERE b='10'; + b +--------------------------------------------------------------------- + 10 +(1 row) + +CREATE TABLE hash_int(a INT, b TEXT) USING columnar; +INSERT INTO hash_int SELECT i, (i*3)::TEXT FROM generate_series(1, 10) i; +CREATE INDEX ON hash_int USING hash (a); +SELECT b='15' FROM hash_int WHERE a=5; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +CREATE TABLE mixed_data_types ( + timestamp_col timestamp, + box_col box, + circle_col circle, + float_col float, + uuid_col uuid, + text_col text, + numeric_col numeric, + PRIMARY KEY(timestamp_col, text_col) +) USING columnar; +INSERT INTO mixed_data_types +SELECT + to_timestamp(i+36000), + box(point(i, i+90)), + circle(point(i*2, i*3), i*100), + (i*1.2)::float, + uuid_in(md5((i*10)::text || (i*15)::text)::cstring), + (i*8)::text, + (i*42)::numeric +FROM generate_series(1, 10) i; +SELECT text_col='64' +FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:08'; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT uuid_col='298923c8-1900-45e9-1288-b430794814c4' +FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:01'; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +CREATE INDEX hash_uuid ON mixed_data_types USING hash(uuid_col); +SELECT box_col=box(point(1, 91)) AND timestamp_col='1970-01-01 02:00:01' +FROM mixed_data_types WHERE uuid_col='298923c8-1900-45e9-1288-b430794814c4'; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +DROP INDEX hash_uuid; +CREATE INDEX btree_multi_numeric_text_timestamp +ON mixed_data_types (numeric_col, text_col, timestamp_col); +SELECT uuid_col='ab2481c9-f93d-0ed3-033a-3281d865ccb2' +FROM mixed_data_types +WHERE + numeric_col >= 120 AND numeric_col <= 220 AND + circle_col >= circle(point(7, 7), 350) AND + float_col <= 5.0; + ?column? +--------------------------------------------------------------------- + t +(1 row) + SET client_min_messages TO WARNING; DROP SCHEMA columnar_indexes CASCADE; diff --git a/src/test/regress/sql/columnar_indexes.sql b/src/test/regress/sql/columnar_indexes.sql index 1ec529872..d2caa120e 100644 --- a/src/test/regress/sql/columnar_indexes.sql +++ b/src/test/regress/sql/columnar_indexes.sql @@ -25,6 +25,7 @@ SELECT * FROM t; -- make sure that we test index scan set columnar.enable_custom_scan to 'off'; set enable_seqscan to off; +set seq_page_cost TO 10000000; CREATE table columnar_table (a INT, b int) USING columnar; @@ -158,6 +159,14 @@ TRUNCATE columnar_table; INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i; SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000; +-- Since our index is highly correlated with the relation itself, we should +-- de-serialize each chunk group only once. For this reason, if this test +-- file hangs on below queries, then you should think that we are not properly +-- caching the last-read chunk group during index reads. +SELECT SUM(a)=312487500 FROM columnar_table WHERE a < 25000; +SELECT SUM(a)=167000 FROM columnar_table WHERE a = 16000 OR a = 151000; +SELECT SUM(a)=48000 FROM columnar_table WHERE a = 16000 OR a = 32000; + TRUNCATE columnar_table; ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey; @@ -321,5 +330,66 @@ REINDEX TABLE parallel_scan_test; CREATE INDEX CONCURRENTLY ON parallel_scan_test (a); REINDEX TABLE CONCURRENTLY parallel_scan_test; +-- test with different data types & indexAM's -- + +CREATE TABLE hash_text(a INT, b TEXT) USING columnar; +INSERT INTO hash_text SELECT i, (i*2)::TEXT FROM generate_series(1, 10) i; + +CREATE INDEX ON hash_text USING hash (b); + +SELECT b FROM hash_text WHERE b='10'; + +CREATE TABLE hash_int(a INT, b TEXT) USING columnar; +INSERT INTO hash_int SELECT i, (i*3)::TEXT FROM generate_series(1, 10) i; + +CREATE INDEX ON hash_int USING hash (a); + +SELECT b='15' FROM hash_int WHERE a=5; + +CREATE TABLE mixed_data_types ( + timestamp_col timestamp, + box_col box, + circle_col circle, + float_col float, + uuid_col uuid, + text_col text, + numeric_col numeric, + PRIMARY KEY(timestamp_col, text_col) +) USING columnar; + +INSERT INTO mixed_data_types +SELECT + to_timestamp(i+36000), + box(point(i, i+90)), + circle(point(i*2, i*3), i*100), + (i*1.2)::float, + uuid_in(md5((i*10)::text || (i*15)::text)::cstring), + (i*8)::text, + (i*42)::numeric +FROM generate_series(1, 10) i; + +SELECT text_col='64' +FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:08'; + +SELECT uuid_col='298923c8-1900-45e9-1288-b430794814c4' +FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:01'; + +CREATE INDEX hash_uuid ON mixed_data_types USING hash(uuid_col); + +SELECT box_col=box(point(1, 91)) AND timestamp_col='1970-01-01 02:00:01' +FROM mixed_data_types WHERE uuid_col='298923c8-1900-45e9-1288-b430794814c4'; + +DROP INDEX hash_uuid; + +CREATE INDEX btree_multi_numeric_text_timestamp +ON mixed_data_types (numeric_col, text_col, timestamp_col); + +SELECT uuid_col='ab2481c9-f93d-0ed3-033a-3281d865ccb2' +FROM mixed_data_types +WHERE + numeric_col >= 120 AND numeric_col <= 220 AND + circle_col >= circle(point(7, 7), 350) AND + float_col <= 5.0; + SET client_min_messages TO WARNING; DROP SCHEMA columnar_indexes CASCADE;