Allow overwriting columnar storage pages written by aborted xacts (#5484)

When refactoring storage layer in #4907, we deleted the code that allows
overwriting a disk page previously written but not known by metadata.

Readers can see the change that introduced the code allows doing so in
commit a8da9acc63.

The reasoning was that; as of 10.2, we started aligning page
reservations (`AlignReservation`) for subsequent writes right after
allocating pages from disk. That means, even if writer transaction
fails, subsequent writes are guaranteed to allocate a new page and write
to there. For this reason, attempting to write to a page allocated
before is not possible for a columnar table that user created when using
v10.2.x.

However, since the older versions of columnar doesn't do that, following
example scenario can still result in writing to such disk page, even if
user now upgraded to v10.2.x. This is because, when upgrading storage to
2.0 (`ColumnarStorageUpdateIfNeeded`), we calculate `reservedOffset` of
the metapage based on the highest used address known by stripe
metadata (`GetHighestUsedAddressAndId`). However, stripe metadata
doesn't have entries for aborted writes. As a result, highest used
address would be computed by ignoring pages that are allocated but not
used.

- User attempts writing to columnar table on Citus v10.0x/v10.1x.
- Write operation fails for some reason.
- User upgrades Citus to v10.2.x.
- When attempting to write to same columnar table, they hit to "attempt
  to write columnar data .." error since write operation done in the
  older version of columnar already allocated that page, and now we are
  overwriting it.

For this reason, with this commit, we re-do the change done in
a8da9acc63.

And for the reasons given above, it wasn't possible to add a test for
this commit via usual code-paths. For this reason, added a UDF only for
testing purposes so that we can reproduce the exact scenario in our
regression test suite.

(cherry picked from commit 76b8006a9e)
pull/5568/head
Onur Tirtir 2021-11-26 09:51:13 +03:00
parent b66abbcba8
commit f5b297e149
3 changed files with 81 additions and 1 deletions

View File

@ -104,6 +104,10 @@ typedef struct PhysicalAddr
"version or run \"ALTER EXTENSION citus UPDATE\"."
/* only for testing purposes */
PG_FUNCTION_INFO_V1(test_columnar_storage_write_new_page);
/*
* Map logical offsets to a physical page and offset where the data is kept.
*/
@ -704,13 +708,32 @@ WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf,
PageInit(page, BLCKSZ, 0);
}
if (phdr->pd_lower != offset || phdr->pd_upper - offset < len)
if (phdr->pd_lower < offset || phdr->pd_upper - offset < len)
{
elog(ERROR,
"attempt to write columnar data of length %d to offset %d of block %d of relation %d",
len, offset, blockno, rel->rd_id);
}
/*
* After a transaction has been rolled-back, we might be
* over-writing the rolledback write, so phdr->pd_lower can be
* different from addr.offset.
*
* We reset pd_lower to reset the rolledback write.
*
* Given that we always align page reservation to the next page as of
* 10.2, having such a disk page is only possible if write operaion
* failed in an older version of columnar, but now user attempts writing
* to that table in version >= 10.2.
*/
if (phdr->pd_lower > offset)
{
ereport(DEBUG4, (errmsg("overwriting page %u", blockno),
errdetail("This can happen after a roll-back.")));
phdr->pd_lower = offset;
}
START_CRIT_SECTION();
memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, buf, len);
@ -820,3 +843,36 @@ ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage)
errhint(OLD_METAPAGE_VERSION_HINT)));
}
}
/*
* test_columnar_storage_write_new_page is a UDF only used for testing
* purposes. It could make more sense to define this in columnar_debug.c,
* but the storage layer doesn't expose ColumnarMetapage to any other files,
* so we define it here.
*/
Datum
test_columnar_storage_write_new_page(PG_FUNCTION_ARGS)
{
Oid relationId = PG_GETARG_OID(0);
Relation relation = relation_open(relationId, AccessShareLock);
/*
* Allocate a new page, write some data to there, and set reserved offset
* to the start of that page. That way, for a subsequent write operation,
* storage layer would try to overwrite the page that we allocated here.
*/
uint64 newPageOffset = ColumnarStorageGetReservedOffset(relation, false);
ColumnarStorageReserveData(relation, 100);
ColumnarStorageWrite(relation, newPageOffset, "foo_bar", 8);
ColumnarMetapage metapage = ColumnarMetapageRead(relation, false);
metapage.reservedOffset = newPageOffset;
ColumnarOverwriteMetapage(relation, metapage);
relation_close(relation, AccessShareLock);
PG_RETURN_VOID();
}

View File

@ -291,6 +291,20 @@ BEGIN;
(1 row)
ROLLBACK;
CREATE OR REPLACE FUNCTION test_columnar_storage_write_new_page(relation regclass) RETURNS void
STRICT LANGUAGE c AS 'citus', 'test_columnar_storage_write_new_page';
CREATE TABLE aborted_write (a int, b int) USING columnar;
SELECT test_columnar_storage_write_new_page('aborted_write');
test_columnar_storage_write_new_page
---------------------------------------------------------------------
(1 row)
SET client_min_messages TO DEBUG4;
INSERT INTO aborted_write VALUES (5);
DEBUG: Flushing Stripe of size 1
DEBUG: overwriting page 2
DETAIL: This can happen after a roll-back.
RESET search_path;
SET client_min_messages TO WARNING;
DROP SCHEMA columnar_insert CASCADE;

View File

@ -199,6 +199,16 @@ BEGIN;
SELECT a FROM flush_create_index WHERE a=5;
ROLLBACK;
CREATE OR REPLACE FUNCTION test_columnar_storage_write_new_page(relation regclass) RETURNS void
STRICT LANGUAGE c AS 'citus', 'test_columnar_storage_write_new_page';
CREATE TABLE aborted_write (a int, b int) USING columnar;
SELECT test_columnar_storage_write_new_page('aborted_write');
SET client_min_messages TO DEBUG4;
INSERT INTO aborted_write VALUES (5);
RESET search_path;
SET client_min_messages TO WARNING;
DROP SCHEMA columnar_insert CASCADE;