diff --git a/src/backend/columnar/columnar_storage.c b/src/backend/columnar/columnar_storage.c index 509cfd95b..f5a35c1cf 100644 --- a/src/backend/columnar/columnar_storage.c +++ b/src/backend/columnar/columnar_storage.c @@ -104,6 +104,10 @@ typedef struct PhysicalAddr "version or run \"ALTER EXTENSION citus UPDATE\"." +/* only for testing purposes */ +PG_FUNCTION_INFO_V1(test_columnar_storage_write_new_page); + + /* * Map logical offsets to a physical page and offset where the data is kept. */ @@ -704,13 +708,32 @@ WriteToBlock(Relation rel, BlockNumber blockno, uint32 offset, char *buf, PageInit(page, BLCKSZ, 0); } - if (phdr->pd_lower != offset || phdr->pd_upper - offset < len) + if (phdr->pd_lower < offset || phdr->pd_upper - offset < len) { elog(ERROR, "attempt to write columnar data of length %d to offset %d of block %d of relation %d", len, offset, blockno, rel->rd_id); } + /* + * After a transaction has been rolled-back, we might be + * over-writing the rolledback write, so phdr->pd_lower can be + * different from addr.offset. + * + * We reset pd_lower to reset the rolledback write. + * + * Given that we always align page reservation to the next page as of + * 10.2, having such a disk page is only possible if write operaion + * failed in an older version of columnar, but now user attempts writing + * to that table in version >= 10.2. + */ + if (phdr->pd_lower > offset) + { + ereport(DEBUG4, (errmsg("overwriting page %u", blockno), + errdetail("This can happen after a roll-back."))); + phdr->pd_lower = offset; + } + START_CRIT_SECTION(); memcpy_s(page + phdr->pd_lower, phdr->pd_upper - phdr->pd_lower, buf, len); @@ -820,3 +843,36 @@ ColumnarMetapageCheckVersion(Relation rel, ColumnarMetapage *metapage) errhint(OLD_METAPAGE_VERSION_HINT))); } } + + +/* + * test_columnar_storage_write_new_page is a UDF only used for testing + * purposes. It could make more sense to define this in columnar_debug.c, + * but the storage layer doesn't expose ColumnarMetapage to any other files, + * so we define it here. + */ +Datum +test_columnar_storage_write_new_page(PG_FUNCTION_ARGS) +{ + Oid relationId = PG_GETARG_OID(0); + + Relation relation = relation_open(relationId, AccessShareLock); + + /* + * Allocate a new page, write some data to there, and set reserved offset + * to the start of that page. That way, for a subsequent write operation, + * storage layer would try to overwrite the page that we allocated here. + */ + uint64 newPageOffset = ColumnarStorageGetReservedOffset(relation, false); + + ColumnarStorageReserveData(relation, 100); + ColumnarStorageWrite(relation, newPageOffset, "foo_bar", 8); + + ColumnarMetapage metapage = ColumnarMetapageRead(relation, false); + metapage.reservedOffset = newPageOffset; + ColumnarOverwriteMetapage(relation, metapage); + + relation_close(relation, AccessShareLock); + + PG_RETURN_VOID(); +} diff --git a/src/test/regress/expected/columnar_insert.out b/src/test/regress/expected/columnar_insert.out index edbd21c53..2227bd839 100644 --- a/src/test/regress/expected/columnar_insert.out +++ b/src/test/regress/expected/columnar_insert.out @@ -291,6 +291,20 @@ BEGIN; (1 row) ROLLBACK; +CREATE OR REPLACE FUNCTION test_columnar_storage_write_new_page(relation regclass) RETURNS void +STRICT LANGUAGE c AS 'citus', 'test_columnar_storage_write_new_page'; +CREATE TABLE aborted_write (a int, b int) USING columnar; +SELECT test_columnar_storage_write_new_page('aborted_write'); + test_columnar_storage_write_new_page +--------------------------------------------------------------------- + +(1 row) + +SET client_min_messages TO DEBUG4; +INSERT INTO aborted_write VALUES (5); +DEBUG: Flushing Stripe of size 1 +DEBUG: overwriting page 2 +DETAIL: This can happen after a roll-back. RESET search_path; SET client_min_messages TO WARNING; DROP SCHEMA columnar_insert CASCADE; diff --git a/src/test/regress/sql/columnar_insert.sql b/src/test/regress/sql/columnar_insert.sql index 96237ce02..9874109ca 100644 --- a/src/test/regress/sql/columnar_insert.sql +++ b/src/test/regress/sql/columnar_insert.sql @@ -199,6 +199,16 @@ BEGIN; SELECT a FROM flush_create_index WHERE a=5; ROLLBACK; +CREATE OR REPLACE FUNCTION test_columnar_storage_write_new_page(relation regclass) RETURNS void +STRICT LANGUAGE c AS 'citus', 'test_columnar_storage_write_new_page'; + +CREATE TABLE aborted_write (a int, b int) USING columnar; + +SELECT test_columnar_storage_write_new_page('aborted_write'); + +SET client_min_messages TO DEBUG4; +INSERT INTO aborted_write VALUES (5); + RESET search_path; SET client_min_messages TO WARNING; DROP SCHEMA columnar_insert CASCADE;