mirror of https://github.com/citusdata/citus.git
Allow overwriting columnar storage pages written by aborted xacts (#5484)
When refactoring storage layer in #4907, we deleted the code that allows overwriting a disk page previously written but not known by metadata. Readers can see the change that introduced the code allows doing so in commitpull/5568/heada8da9acc63
. The reasoning was that; as of 10.2, we started aligning page reservations (`AlignReservation`) for subsequent writes right after allocating pages from disk. That means, even if writer transaction fails, subsequent writes are guaranteed to allocate a new page and write to there. For this reason, attempting to write to a page allocated before is not possible for a columnar table that user created when using v10.2.x. However, since the older versions of columnar doesn't do that, following example scenario can still result in writing to such disk page, even if user now upgraded to v10.2.x. This is because, when upgrading storage to 2.0 (`ColumnarStorageUpdateIfNeeded`), we calculate `reservedOffset` of the metapage based on the highest used address known by stripe metadata (`GetHighestUsedAddressAndId`). However, stripe metadata doesn't have entries for aborted writes. As a result, highest used address would be computed by ignoring pages that are allocated but not used. - User attempts writing to columnar table on Citus v10.0x/v10.1x. - Write operation fails for some reason. - User upgrades Citus to v10.2.x. - When attempting to write to same columnar table, they hit to "attempt to write columnar data .." error since write operation done in the older version of columnar already allocated that page, and now we are overwriting it. For this reason, with this commit, we re-do the change done ina8da9acc63
. And for the reasons given above, it wasn't possible to add a test for this commit via usual code-paths. For this reason, added a UDF only for testing purposes so that we can reproduce the exact scenario in our regression test suite. (cherry picked from commit76b8006a9e
)
parent
b66abbcba8
commit
f5b297e149
|
@ -104,6 +104,10 @@ typedef struct PhysicalAddr
|
|||
"version or run \"ALTER EXTENSION citus UPDATE\"."
|
||||
|
||||
|
||||
/* only for testing purposes */
|
||||
PG_FUNCTION_INFO_V1(test_columnar_storage_write_new_page);
|
||||
|
||||
|
||||
/*
|
||||
* Map logical offsets to a physical page and offset where the data is kept.
|
||||
*/
|
||||
|
@ -704,13 +708,32 @@ WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf,
|
|||
PageInit(page, BLCKSZ, 0);
|
||||
}
|
||||
|
||||
if (phdr->pd_lower != offset || phdr->pd_upper - offset < len)
|
||||
if (phdr->pd_lower < offset || phdr->pd_upper - offset < len)
|
||||
{
|
||||
elog(ERROR,
|
||||
"attempt to write columnar data of length %d to offset %d of block %d of relation %d",
|
||||
len, offset, blockno, rel->rd_id);
|
||||
}
|
||||
|
||||
/*
|
||||
* After a transaction has been rolled-back, we might be
|
||||
* over-writing the rolledback write, so phdr->pd_lower can be
|
||||
* different from addr.offset.
|
||||
*
|
||||
* We reset pd_lower to reset the rolledback write.
|
||||
*
|
||||
* Given that we always align page reservation to the next page as of
|
||||
* 10.2, having such a disk page is only possible if write operaion
|
||||
* failed in an older version of columnar, but now user attempts writing
|
||||
* to that table in version >= 10.2.
|
||||
*/
|
||||
if (phdr->pd_lower > offset)
|
||||
{
|
||||
ereport(DEBUG4, (errmsg("overwriting page %u", blockno),
|
||||
errdetail("This can happen after a roll-back.")));
|
||||
phdr->pd_lower = offset;
|
||||
}
|
||||
|
||||
START_CRIT_SECTION();
|
||||
|
||||
memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, buf, len);
|
||||
|
@ -820,3 +843,36 @@ ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage)
|
|||
errhint(OLD_METAPAGE_VERSION_HINT)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* test_columnar_storage_write_new_page is a UDF only used for testing
|
||||
* purposes. It could make more sense to define this in columnar_debug.c,
|
||||
* but the storage layer doesn't expose ColumnarMetapage to any other files,
|
||||
* so we define it here.
|
||||
*/
|
||||
Datum
|
||||
test_columnar_storage_write_new_page(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Oid relationId = PG_GETARG_OID(0);
|
||||
|
||||
Relation relation = relation_open(relationId, AccessShareLock);
|
||||
|
||||
/*
|
||||
* Allocate a new page, write some data to there, and set reserved offset
|
||||
* to the start of that page. That way, for a subsequent write operation,
|
||||
* storage layer would try to overwrite the page that we allocated here.
|
||||
*/
|
||||
uint64 newPageOffset = ColumnarStorageGetReservedOffset(relation, false);
|
||||
|
||||
ColumnarStorageReserveData(relation, 100);
|
||||
ColumnarStorageWrite(relation, newPageOffset, "foo_bar", 8);
|
||||
|
||||
ColumnarMetapage metapage = ColumnarMetapageRead(relation, false);
|
||||
metapage.reservedOffset = newPageOffset;
|
||||
ColumnarOverwriteMetapage(relation, metapage);
|
||||
|
||||
relation_close(relation, AccessShareLock);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
|
|
@ -291,6 +291,20 @@ BEGIN;
|
|||
(1 row)
|
||||
|
||||
ROLLBACK;
|
||||
CREATE OR REPLACE FUNCTION test_columnar_storage_write_new_page(relation regclass) RETURNS void
|
||||
STRICT LANGUAGE c AS 'citus', 'test_columnar_storage_write_new_page';
|
||||
CREATE TABLE aborted_write (a int, b int) USING columnar;
|
||||
SELECT test_columnar_storage_write_new_page('aborted_write');
|
||||
test_columnar_storage_write_new_page
|
||||
---------------------------------------------------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SET client_min_messages TO DEBUG4;
|
||||
INSERT INTO aborted_write VALUES (5);
|
||||
DEBUG: Flushing Stripe of size 1
|
||||
DEBUG: overwriting page 2
|
||||
DETAIL: This can happen after a roll-back.
|
||||
RESET search_path;
|
||||
SET client_min_messages TO WARNING;
|
||||
DROP SCHEMA columnar_insert CASCADE;
|
||||
|
|
|
@ -199,6 +199,16 @@ BEGIN;
|
|||
SELECT a FROM flush_create_index WHERE a=5;
|
||||
ROLLBACK;
|
||||
|
||||
CREATE OR REPLACE FUNCTION test_columnar_storage_write_new_page(relation regclass) RETURNS void
|
||||
STRICT LANGUAGE c AS 'citus', 'test_columnar_storage_write_new_page';
|
||||
|
||||
CREATE TABLE aborted_write (a int, b int) USING columnar;
|
||||
|
||||
SELECT test_columnar_storage_write_new_page('aborted_write');
|
||||
|
||||
SET client_min_messages TO DEBUG4;
|
||||
INSERT INTO aborted_write VALUES (5);
|
||||
|
||||
RESET search_path;
|
||||
SET client_min_messages TO WARNING;
|
||||
DROP SCHEMA columnar_insert CASCADE;
|
||||
|
|
Loading…
Reference in New Issue