From 9f52f6a9a1d17646c9d79dc0838ad51d041237df Mon Sep 17 00:00:00 2001 From: Imran Zaheer Date: Thu, 25 Sep 2025 10:28:06 +0500 Subject: [PATCH] fix: Keep track of chunk indices instead recomputing & adjustments There were still some issues in adjujsting chunk index and chunk row index after every compute. This was causing failures for some edge cases i.e. reaching chunk row limit. A better way would be keep track of these indices intead of recomputing and adjujsting them on every row iteration. --- src/backend/columnar/columnar_writer.c | 57 ++++----- .../regress/expected/columnar_chunk_sizes.out | 112 ++++++++---------- src/test/regress/sql/columnar_chunk_sizes.sql | 15 ++- 3 files changed, 79 insertions(+), 105 deletions(-) diff --git a/src/backend/columnar/columnar_writer.c b/src/backend/columnar/columnar_writer.c index fbdbc1d5e..3df3c20f0 100644 --- a/src/backend/columnar/columnar_writer.c +++ b/src/backend/columnar/columnar_writer.c @@ -55,14 +55,14 @@ struct ColumnarWriteState EmptyStripeReservation *emptyStripeReservation; ColumnarOptions options; ChunkData *chunkData; + uint32 currentChunkRowIndex; + uint32 currentChunkIndex; /* * accounting for creating new chunks groups when * size limit reaches */ Size currentChunkBytes; - uint32 earlySerializedRowCount; - uint32 earlySerializedChunkCount; List *chunkGroupRowCounts; @@ -216,23 +216,16 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu } writeState->currentChunkBytes = 0; - writeState->earlySerializedRowCount = 0; - writeState->earlySerializedChunkCount = 0; + writeState->currentChunkIndex = 0; + writeState->currentChunkRowIndex = 0; /* Ensure maxChunkSize is set with a reasonable default */ Assert(options->maxChunkSize >= CHUNK_GROUP_SIZE_MINIMUM && options->maxChunkSize <= CHUNK_GROUP_SIZE_MAXIMUM); } - uint32 chunkIndex = stripeBuffers->rowCount / chunkRowCount; - uint32 chunkRowIndex = stripeBuffers->rowCount % chunkRowCount; - - /* Adjust the indices if some chunks were serialized early */ - if (writeState->earlySerializedRowCount) - { - chunkIndex = chunkIndex + writeState->earlySerializedChunkCount; - chunkRowIndex = chunkRowIndex - writeState->earlySerializedRowCount; - } + uint32 chunkIndex = writeState->currentChunkIndex; + uint32 chunkRowIndex = writeState->currentChunkRowIndex; /* * Calculate total serialized current row size without actually serializing. @@ -293,19 +286,14 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu * Size limit reached, now serialize upto the last row. * We make sure not to serialize the current row data and only upto * the last row, so we use `chunkRowIndex` instead of `chunkRowIndex + 1` - * in order to skip current row. + * in order to skip current row. Current row will go in the next chunk. */ SerializeChunkData(writeState, chunkIndex, chunkRowIndex); - writeState->earlySerializedChunkCount++; writeState->currentChunkBytes = 0; - writeState->earlySerializedRowCount = writeState->earlySerializedRowCount + chunkRowIndex; - /* Recreate and adjust the indices after deciding to start a new chunk */ - chunkIndex = stripeBuffers->rowCount / chunkRowCount; - chunkRowIndex = stripeBuffers->rowCount % chunkRowCount; - - chunkIndex = chunkIndex + writeState->earlySerializedChunkCount; - chunkRowIndex = chunkRowIndex - writeState->earlySerializedRowCount; + /* Adjust the indices after deciding to start a new chunk */ + chunkIndex = ++writeState->currentChunkIndex; + chunkRowIndex = writeState->currentChunkRowIndex = 0; } for (columnIndex = 0; columnIndex < columnCount; columnIndex++) @@ -351,11 +339,21 @@ ColumnarWriteRow(ColumnarWriteState *writeState, Datum *columnValues, bool *colu { SerializeChunkData(writeState, chunkIndex, chunkRowCount); writeState->currentChunkBytes = 0; + writeState->currentChunkIndex++; + writeState->currentChunkRowIndex = 0; } uint64 writtenRowNumber = writeState->emptyStripeReservation->stripeFirstRowNumber + stripeBuffers->rowCount; stripeBuffers->rowCount++; + + /* + * don't increment when chunk row limit was reached and new chunk was + * created, writeState->currentChunkRowIndex should suppose to be remain `0` + * in this case. + */ + if (chunkRowIndex != chunkRowCount - 1) writeState->currentChunkRowIndex++; + if (stripeBuffers->rowCount >= options->stripeRowCount) { ColumnarFlushPendingWrites(writeState); @@ -580,22 +578,11 @@ FlushStripe(ColumnarWriteState *writeState) TupleDesc tupleDescriptor = writeState->tupleDescriptor; uint32 columnCount = tupleDescriptor->natts; uint32 chunkCount = stripeSkipList->chunkCount; - uint32 chunkRowCount = writeState->options.chunkRowCount; - uint32 lastChunkIndex = stripeBuffers->rowCount / chunkRowCount; - uint32 lastChunkRowCount = stripeBuffers->rowCount % chunkRowCount; + uint32 lastChunkIndex = writeState->currentChunkIndex; + uint32 lastChunkRowCount = writeState->currentChunkRowIndex; uint64 stripeSize = 0; uint64 stripeRowCount = stripeBuffers->rowCount; - if (writeState->earlySerializedRowCount) - { - /* - * Increment indices as a chunk has beed serialized early - * because of reaching its size limit - */ - lastChunkIndex = lastChunkIndex + writeState->earlySerializedChunkCount; - lastChunkRowCount = lastChunkRowCount - writeState->earlySerializedRowCount; - } - elog(DEBUG1, "Flushing Stripe of size %d", stripeBuffers->rowCount); Oid relationId = RelidByRelfilenumber(RelationTablespace_compat( diff --git a/src/test/regress/expected/columnar_chunk_sizes.out b/src/test/regress/expected/columnar_chunk_sizes.out index 7a18b898c..3d6e5a5f4 100644 --- a/src/test/regress/expected/columnar_chunk_sizes.out +++ b/src/test/regress/expected/columnar_chunk_sizes.out @@ -20,89 +20,79 @@ CREATE TABLE test_oversized_row ( huge_text TEXT ) USING columnar WITH ( columnar.chunk_group_row_limit = 1000, - columnar.stripe_row_limit = 5000, + columnar.stripe_row_limit = 1500, columnar.chunk_group_size_limit = 128 ); +-- test with chunk & stripe row limit reached INSERT INTO test_oversized_row -SELECT gs, repeat('Y', 2*1024*1024) -- 2 MB text -FROM generate_series(1, 600) AS gs; -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Flushing Stripe of size 600 --- test VACUUM FULL -VACUUM FULL test_oversized_row; -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group -DEBUG: Flushing Stripe of size 600 +SELECT gs, repeat('Y', 1*1024*1024) -- 1 MB text +FROM generate_series(1, 1600) AS gs; +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Row size (1048584 bytes) exceeds chunk group size limit (134217728 bytes), storing in a separate chunk group +DEBUG: Flushing Stripe of size 1500 +DEBUG: Flushing Stripe of size 100 SET client_min_messages TO warning; -- try verifying the data integrity SELECT * FROM columnar.chunk_group WHERE relation = 'test_oversized_row'::regclass; relation | storage_id | stripe_num | chunk_group_num | row_count --------------------------------------------------------------------- - test_oversized_row | 10000000262 | 1 | 0 | 63 - test_oversized_row | 10000000262 | 1 | 1 | 63 - test_oversized_row | 10000000262 | 1 | 2 | 63 - test_oversized_row | 10000000262 | 1 | 3 | 63 - test_oversized_row | 10000000262 | 1 | 4 | 63 - test_oversized_row | 10000000262 | 1 | 5 | 63 - test_oversized_row | 10000000262 | 1 | 6 | 63 - test_oversized_row | 10000000262 | 1 | 7 | 63 - test_oversized_row | 10000000262 | 1 | 8 | 63 - test_oversized_row | 10000000262 | 1 | 9 | 33 -(10 rows) + test_oversized_row | 10000000261 | 1 | 0 | 127 + test_oversized_row | 10000000261 | 1 | 1 | 127 + test_oversized_row | 10000000261 | 1 | 2 | 127 + test_oversized_row | 10000000261 | 1 | 3 | 127 + test_oversized_row | 10000000261 | 1 | 4 | 127 + test_oversized_row | 10000000261 | 1 | 5 | 127 + test_oversized_row | 10000000261 | 1 | 6 | 127 + test_oversized_row | 10000000261 | 1 | 7 | 127 + test_oversized_row | 10000000261 | 1 | 8 | 127 + test_oversized_row | 10000000261 | 1 | 9 | 127 + test_oversized_row | 10000000261 | 1 | 10 | 127 + test_oversized_row | 10000000261 | 1 | 11 | 103 + test_oversized_row | 10000000261 | 2 | 0 | 100 +(13 rows) SELECT * FROM columnar.stripe WHERE relation = 'test_oversized_row'::regclass; relation | storage_id | stripe_num | file_offset | data_length | column_count | chunk_row_count | row_count | chunk_group_count | first_row_number --------------------------------------------------------------------- - test_oversized_row | 10000000262 | 1 | 16336 | 1258296154 | 2 | 1000 | 600 | 10 | 1 -(1 row) + test_oversized_row | 10000000261 | 1 | 16336 | 1572876378 | 2 | 1000 | 1500 | 12 | 1 + test_oversized_row | 10000000261 | 2 | 1572895424 | 104858426 | 2 | 1000 | 100 | 1 | 1501 +(2 rows) SELECT COUNT(*) FROM test_oversized_row; count --------------------------------------------------------------------- - 600 + 1600 (1 row) SELECT ID, LENGTH(huge_text) FROM test_oversized_row ORDER BY id LIMIT 10; id | length --------------------------------------------------------------------- - 1 | 2097152 - 2 | 2097152 - 3 | 2097152 - 4 | 2097152 - 5 | 2097152 - 6 | 2097152 - 7 | 2097152 - 8 | 2097152 - 9 | 2097152 - 10 | 2097152 + 1 | 1048576 + 2 | 1048576 + 3 | 1048576 + 4 | 1048576 + 5 | 1048576 + 6 | 1048576 + 7 | 1048576 + 8 | 1048576 + 9 | 1048576 + 10 | 1048576 (10 rows) -SELECT SUM(LENGTH(huge_text)) = 1258291200 AS is_equal FROM test_oversized_row; - is_equal ---------------------------------------------------------------------- - t -(1 row) - \dt+ test_oversized_row List of relations Schema | Name | Type | Owner | Persistence | Size | Description --------------------------------------------------------------------- - columnar_chunk_test | test_oversized_row | table | postgres | permanent | 1204 MB | + columnar_chunk_test | test_oversized_row | table | postgres | permanent | 1605 MB | (1 row) -- test edge case setting chunk_group_size_limit = 1024 @@ -122,6 +112,10 @@ SELECT gs, repeat('Y', 2*1024*1024) -- 2 MB text FROM generate_series(1, 600) AS gs; DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (1073741824 bytes), storing in a separate chunk group DEBUG: Flushing Stripe of size 600 +-- test VACUUM FULL +VACUUM FULL test_oversized_row; +DEBUG: Row size (2097160 bytes) exceeds chunk group size limit (1073741824 bytes), storing in a separate chunk group +DEBUG: Flushing Stripe of size 600 SET client_min_messages TO warning; -- try verifying the data integrity SELECT * FROM columnar.chunk_group WHERE relation = 'test_oversized_row'::regclass; @@ -158,12 +152,6 @@ SELECT ID, LENGTH(huge_text) FROM test_oversized_row ORDER BY id LIMIT 10; 10 | 2097152 (10 rows) -SELECT SUM(LENGTH(huge_text)) = 1258291200 AS is_equal FROM test_oversized_row; - is_equal ---------------------------------------------------------------------- - t -(1 row) - \dt+ test_oversized_row List of relations Schema | Name | Type | Owner | Persistence | Size | Description diff --git a/src/test/regress/sql/columnar_chunk_sizes.sql b/src/test/regress/sql/columnar_chunk_sizes.sql index cb3821450..65b784e04 100644 --- a/src/test/regress/sql/columnar_chunk_sizes.sql +++ b/src/test/regress/sql/columnar_chunk_sizes.sql @@ -25,16 +25,14 @@ CREATE TABLE test_oversized_row ( huge_text TEXT ) USING columnar WITH ( columnar.chunk_group_row_limit = 1000, - columnar.stripe_row_limit = 5000, + columnar.stripe_row_limit = 1500, columnar.chunk_group_size_limit = 128 ); +-- test with chunk & stripe row limit reached INSERT INTO test_oversized_row -SELECT gs, repeat('Y', 2*1024*1024) -- 2 MB text -FROM generate_series(1, 600) AS gs; - --- test VACUUM FULL -VACUUM FULL test_oversized_row; +SELECT gs, repeat('Y', 1*1024*1024) -- 1 MB text +FROM generate_series(1, 1600) AS gs; SET client_min_messages TO warning; @@ -43,7 +41,6 @@ SELECT * FROM columnar.chunk_group WHERE relation = 'test_oversized_row'::regcla SELECT * FROM columnar.stripe WHERE relation = 'test_oversized_row'::regclass; SELECT COUNT(*) FROM test_oversized_row; SELECT ID, LENGTH(huge_text) FROM test_oversized_row ORDER BY id LIMIT 10; -SELECT SUM(LENGTH(huge_text)) = 1258291200 AS is_equal FROM test_oversized_row; \dt+ test_oversized_row @@ -65,6 +62,9 @@ INSERT INTO test_oversized_row SELECT gs, repeat('Y', 2*1024*1024) -- 2 MB text FROM generate_series(1, 600) AS gs; +-- test VACUUM FULL +VACUUM FULL test_oversized_row; + SET client_min_messages TO warning; -- try verifying the data integrity @@ -72,7 +72,6 @@ SELECT * FROM columnar.chunk_group WHERE relation = 'test_oversized_row'::regcla SELECT * FROM columnar.stripe WHERE relation = 'test_oversized_row'::regclass; SELECT COUNT(*) FROM test_oversized_row; SELECT ID, LENGTH(huge_text) FROM test_oversized_row ORDER BY id LIMIT 10; -SELECT SUM(LENGTH(huge_text)) = 1258291200 AS is_equal FROM test_oversized_row; \dt+ test_oversized_row DROP TABLE test_oversized_row;