From 4ee0fb2758d41517ca577fd5d7e9b58242c2b105 Mon Sep 17 00:00:00 2001
From: Onur Tirtir
Date: Mon, 13 Sep 2021 11:50:53 +0300
Subject: [PATCH] Make sure to skip aborted writes when reading the first
 tuple (#5274)

With 5825c44d5f45ae78cd51484dfe1828f691eb82ea, we made changes to skip
aborted writes when scanning a columnar table. However, it looks like we
forgot to handle such cases for the very first call made to
columnar_getnextslot: that commit only considered intermediate stripe
reads, while the functions that columnar_getnextslot calls to find the
first stripe to read (ColumnarBeginRead & ColumnarRescan) did not account
for those aborted writes.

To fix that, we teach AdvanceStripeRead to find the very first stripe to
read as well, and use it where we were blindly calling
FindNextStripeByRowNumber.
---
 src/backend/columnar/columnar_reader.c         | 27 ++++++++++++-------
 .../regress/expected/columnar_indexes.out      | 23 ++++++++++++++++
 src/test/regress/sql/columnar_indexes.sql      | 26 ++++++++++++++++++
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/src/backend/columnar/columnar_reader.c b/src/backend/columnar/columnar_reader.c
index cc72fd77b..f90068c76 100644
--- a/src/backend/columnar/columnar_reader.c
+++ b/src/backend/columnar/columnar_reader.c
@@ -196,13 +196,13 @@ ColumnarBeginRead(Relation relation, TupleDesc tupleDescriptor,
 	readState->tupleDescriptor = tupleDescriptor;
 	readState->stripeReadContext = stripeReadContext;
 	readState->stripeReadState = NULL;
-	readState->currentStripeMetadata = FindNextStripeByRowNumber(relation,
-																  COLUMNAR_INVALID_ROW_NUMBER,
-																  snapshot);
 	readState->scanContext = scanContext;
 	readState->snapshot = snapshot;
 	readState->snapshotRegisteredByUs = snapshotRegisteredByUs;
 
+	/* set currentStripeMetadata for the first stripe to read */
+	AdvanceStripeRead(readState);
+
 	return readState;
 }
 
@@ -472,9 +472,10 @@ ColumnarRescan(ColumnarReadState *readState, List *scanQual)
 	MemoryContext oldContext = MemoryContextSwitchTo(readState->scanContext);
 
 	ColumnarResetRead(readState);
-	readState->currentStripeMetadata = FindNextStripeByRowNumber(readState->relation,
-																  COLUMNAR_INVALID_ROW_NUMBER,
-																  readState->snapshot);
+
+	/* set currentStripeMetadata for the first stripe to read */
+	AdvanceStripeRead(readState);
+
 	readState->chunkGroupsFiltered = 0;
 
 	readState->whereClauseList = copyObject(scanQual);
@@ -572,11 +573,17 @@ AdvanceStripeRead(ColumnarReadState *readState)
 {
 	MemoryContext oldContext = MemoryContextSwitchTo(readState->scanContext);
 
-	readState->chunkGroupsFiltered +=
-		readState->stripeReadState->chunkGroupsFiltered;
+	/* if not read any stripes yet, start from the first one .. */
+	uint64 lastReadRowNumber = COLUMNAR_INVALID_ROW_NUMBER;
+	if (StripeReadInProgress(readState))
+	{
+		/* .. otherwise, continue with the next stripe */
+		lastReadRowNumber = StripeGetHighestRowNumber(readState->currentStripeMetadata);
+
+		readState->chunkGroupsFiltered +=
+			readState->stripeReadState->chunkGroupsFiltered;
+	}
 
-	uint64 lastReadRowNumber =
-		StripeGetHighestRowNumber(readState->currentStripeMetadata);
 	readState->currentStripeMetadata = FindNextStripeByRowNumber(readState->relation,
 																  lastReadRowNumber,
 																  readState->snapshot);
diff --git a/src/test/regress/expected/columnar_indexes.out b/src/test/regress/expected/columnar_indexes.out
index f3cd5d2f3..1a39c781b 100644
--- a/src/test/regress/expected/columnar_indexes.out
+++ b/src/test/regress/expected/columnar_indexes.out
@@ -544,6 +544,29 @@ ERROR:  duplicate key value violates unique constraint "aborted_write_test_pkey"
 DETAIL:  Key (a)=(16999) already exists.
 -- since second INSERT already failed, should not throw a "duplicate key" error
 REINDEX TABLE aborted_write_test;
+BEGIN;
+  ALTER TABLE columnar.stripe SET (autovacuum_enabled = false);
+  ALTER TABLE columnar.chunk SET (autovacuum_enabled = false);
+  ALTER TABLE columnar.chunk_group SET (autovacuum_enabled = false);
+  DROP TABLE aborted_write_test;
+  TRUNCATE columnar.stripe, columnar.chunk, columnar.chunk_group;
+  CREATE TABLE aborted_write_test (a INT) USING columnar;
+  SAVEPOINT svpt;
+  INSERT INTO aborted_write_test SELECT i FROM generate_series(1, 2) i;
+  -- force flush write state
+  SELECT FROM aborted_write_test;
+--
+(2 rows)
+
+  ROLLBACK TO SAVEPOINT svpt;
+  -- Already disabled autovacuum for all three metadata tables.
+  -- Here we truncate columnar.chunk and columnar.chunk_group but not
+  -- columnar.stripe to make sure that we properly handle dead tuples
+  -- in columnar.stripe, i.e. stripe metadata entries for aborted
+  -- transactions.
+  TRUNCATE columnar.chunk, columnar.chunk_group;
+  CREATE INDEX ON aborted_write_test (a);
+ROLLBACK;
 create table events (event_id bigserial, event_time timestamptz default now(), payload text) using columnar;
 BEGIN;
 -- this wouldn't flush any data
diff --git a/src/test/regress/sql/columnar_indexes.sql b/src/test/regress/sql/columnar_indexes.sql
index f3a84c13a..67382f201 100644
--- a/src/test/regress/sql/columnar_indexes.sql
+++ b/src/test/regress/sql/columnar_indexes.sql
@@ -406,6 +406,32 @@ INSERT INTO aborted_write_test VALUES (16999);
 
 -- since second INSERT already failed, should not throw a "duplicate key" error
 REINDEX TABLE aborted_write_test;
+BEGIN;
+  ALTER TABLE columnar.stripe SET (autovacuum_enabled = false);
+  ALTER TABLE columnar.chunk SET (autovacuum_enabled = false);
+  ALTER TABLE columnar.chunk_group SET (autovacuum_enabled = false);
+
+  DROP TABLE aborted_write_test;
+  TRUNCATE columnar.stripe, columnar.chunk, columnar.chunk_group;
+
+  CREATE TABLE aborted_write_test (a INT) USING columnar;
+
+  SAVEPOINT svpt;
+  INSERT INTO aborted_write_test SELECT i FROM generate_series(1, 2) i;
+  -- force flush write state
+  SELECT FROM aborted_write_test;
+  ROLLBACK TO SAVEPOINT svpt;
+
+  -- Already disabled autovacuum for all three metadata tables.
+  -- Here we truncate columnar.chunk and columnar.chunk_group but not
+  -- columnar.stripe to make sure that we properly handle dead tuples
+  -- in columnar.stripe, i.e. stripe metadata entries for aborted
+  -- transactions.
+  TRUNCATE columnar.chunk, columnar.chunk_group;
+
+  CREATE INDEX ON aborted_write_test (a);
+ROLLBACK;
+
 create table events (event_id bigserial, event_time timestamptz default now(), payload text) using columnar;
 BEGIN;
 -- this wouldn't flush any data
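
For reviewers: once the columnar_reader.c hunks above are applied, all of the
affected logic lives in AdvanceStripeRead. Below is a consolidated sketch of
the resulting function, assembled from the hunks (the signature comes from
the hunk header with the storage class assumed; the snapshot-visibility
comment and the elision marker are ours for illustration, not part of the
patch):

static void
AdvanceStripeRead(ColumnarReadState *readState)
{
	MemoryContext oldContext = MemoryContextSwitchTo(readState->scanContext);

	/* if not read any stripes yet, start from the first one .. */
	uint64 lastReadRowNumber = COLUMNAR_INVALID_ROW_NUMBER;
	if (StripeReadInProgress(readState))
	{
		/* .. otherwise, continue with the next stripe */
		lastReadRowNumber = StripeGetHighestRowNumber(readState->currentStripeMetadata);

		readState->chunkGroupsFiltered +=
			readState->stripeReadState->chunkGroupsFiltered;
	}

	/*
	 * FindNextStripeByRowNumber consults the snapshot, so stripe metadata
	 * entries left behind by aborted writes are now skipped when picking
	 * the first stripe, exactly as they already were for intermediate ones.
	 */
	readState->currentStripeMetadata = FindNextStripeByRowNumber(readState->relation,
																  lastReadRowNumber,
																  readState->snapshot);

	/* .. rest of the function is unchanged and elided .. */
}

Since ColumnarBeginRead and ColumnarRescan now both go through this single
entry point instead of calling FindNextStripeByRowNumber themselves, a future
change to stripe visibility rules only needs to be made in one place.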