Columnar: track decompressed length in metadata

pull/4386/head
Hadi Moshayedi 2020-12-03 19:30:07 -08:00
parent d4f5d4a27b
commit 01da2a1c73
10 changed files with 69 additions and 6 deletions

View File

@@ -137,7 +137,7 @@ typedef FormData_cstore_options *Form_cstore_options;
#define Anum_cstore_stripes_row_count 8
/* constants for cstore_skipnodes */
#define Natts_cstore_skipnodes 12
#define Natts_cstore_skipnodes 13
#define Anum_cstore_skipnodes_storageid 1
#define Anum_cstore_skipnodes_stripe 2
#define Anum_cstore_skipnodes_attr 3
@@ -150,6 +150,7 @@ typedef FormData_cstore_options *Form_cstore_options;
#define Anum_cstore_skipnodes_exists_stream_offset 10
#define Anum_cstore_skipnodes_exists_stream_length 11
#define Anum_cstore_skipnodes_value_compression_type 12
#define Anum_cstore_skipnodes_value_decompressed_size 13
/*
@@ -416,7 +417,8 @@ SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe, StripeSkipList *strip
Int64GetDatum(skipNode->valueLength),
Int64GetDatum(skipNode->existsChunkOffset),
Int64GetDatum(skipNode->existsLength),
Int32GetDatum(skipNode->valueCompressionType)
Int32GetDatum(skipNode->valueCompressionType),
Int64GetDatum(skipNode->decompressedValueSize)
};
bool nulls[Natts_cstore_skipnodes] = { false };
@@ -522,6 +524,8 @@ ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, TupleDesc tupleDescri
DatumGetInt64(datumArray[Anum_cstore_skipnodes_exists_stream_length - 1]);
skipNode->valueCompressionType =
DatumGetInt32(datumArray[Anum_cstore_skipnodes_value_compression_type - 1]);
skipNode->decompressedValueSize =
DatumGetInt64(datumArray[Anum_cstore_skipnodes_value_decompressed_size - 1]);
if (isNullArray[Anum_cstore_skipnodes_minimum_value - 1] ||
isNullArray[Anum_cstore_skipnodes_maximum_value - 1])

View File

@@ -694,6 +694,7 @@ LogRelationStats(Relation rel, int elevel)
uint64 chunkCount = 0;
TupleDesc tupdesc = RelationGetDescr(rel);
uint64 droppedChunksWithData = 0;
uint64 totalDecompressedLength = 0;
List *stripeList = StripesForRelfilenode(relfilenode);
int stripeCount = list_length(stripeList);
@@ -723,6 +724,13 @@ LogRelationStats(Relation rel, int elevel)
droppedChunksWithData++;
}
}
/*
* We don't compress exists buffer, so its compressed & decompressed
* lengths are the same.
*/
totalDecompressedLength += skipnode->existsLength;
totalDecompressedLength += skipnode->decompressedValueSize;
}
}
@@ -737,9 +745,14 @@ LogRelationStats(Relation rel, int elevel)
Datum storageId = DirectFunctionCall1(columnar_relation_storageid,
ObjectIdGetDatum(RelationGetRelid(rel)));
double compressionRate = totalStripeLength ?
(double) totalDecompressedLength / totalStripeLength :
1.0;
appendStringInfo(infoBuf, "storage id: %ld\n", DatumGetInt64(storageId));
appendStringInfo(infoBuf, "total file size: %ld, total data size: %ld\n",
relPages * BLCKSZ, totalStripeLength);
appendStringInfo(infoBuf, "compression rate: %.2fx\n", compressionRate);
appendStringInfo(infoBuf,
"total row count: %ld, stripe count: %d, "
"average rows per stripe: %ld\n",

View File

@@ -463,6 +463,7 @@ FlushStripe(TableWriteState *writeState)
chunkSkipNode->valueChunkOffset = stripeSize;
chunkSkipNode->valueLength = valueBufferSize;
chunkSkipNode->valueCompressionType = valueCompressionType;
chunkSkipNode->decompressedValueSize = chunkBuffers->decompressedValueSize;
stripeSize += valueBufferSize;
}
@@ -631,9 +632,11 @@ SerializeChunkData(TableWriteState *writeState, uint32 chunkIndex, uint32 rowCou
StringInfo serializedValueBuffer = chunkData->valueBufferArray[columnIndex];
/* the only other supported compression type is pg_lz for now */
Assert(requestedCompressionType == COMPRESSION_NONE ||
requestedCompressionType == COMPRESSION_PG_LZ);
Assert(requestedCompressionType >= 0 &&
requestedCompressionType < COMPRESSION_COUNT);
chunkBuffers->decompressedValueSize =
chunkData->valueBufferArray[columnIndex]->len;
/*
* if serializedValueBuffer is compressed, update serializedValueBuffer
@@ -644,7 +647,7 @@ SerializeChunkData(TableWriteState *writeState, uint32 chunkIndex, uint32 rowCou
if (compressed)
{
serializedValueBuffer = compressionBuffer;
actualCompressionType = COMPRESSION_PG_LZ;
actualCompressionType = requestedCompressionType;
}
/* store (compressed) value buffer */

View File

@@ -41,6 +41,7 @@ CREATE TABLE cstore_skipnodes (
exists_stream_offset bigint NOT NULL,
exists_stream_length bigint NOT NULL,
value_compression_type int NOT NULL,
value_decompressed_length bigint NOT NULL,
PRIMARY KEY (storageid, stripe, attr, chunk),
FOREIGN KEY (storageid, stripe) REFERENCES cstore_stripes(storageid, stripe) ON DELETE CASCADE
) WITH (user_catalog_table = true);

View File

@@ -117,6 +117,12 @@ typedef struct ColumnChunkSkipNode
uint64 existsChunkOffset;
uint64 existsLength;
/*
* This is used for (1) determining destination size when decompressing,
* (2) calculating compression rates when logging stats.
*/
uint64 decompressedValueSize;
CompressionType valueCompressionType;
} ColumnChunkSkipNode;
@@ -170,6 +176,7 @@ typedef struct ColumnChunkBuffers
StringInfo existsBuffer;
StringInfo valueBuffer;
CompressionType valueCompressionType;
uint64 decompressedValueSize;
} ColumnChunkBuffers;

View File

@@ -70,6 +70,7 @@ vacuum verbose t_compressed;
INFO: statistics for "t_compressed":
storage id: -1
total file size: 0, total data size: 0
compression rate: 1.00x
total row count: 0, stripe count: 0, average rows per stripe: 0
chunk count: 0, containing data for dropped columns: 0, none compressed: 0, pglz compressed: 0
@@ -77,6 +78,7 @@ vacuum verbose t_uncompressed;
INFO: statistics for "t_uncompressed":
storage id: -1
total file size: 0, total data size: 0
compression rate: 1.00x
total row count: 0, stripe count: 0, average rows per stripe: 0
chunk count: 0, containing data for dropped columns: 0, none compressed: 0, pglz compressed: 0

View File

@@ -150,6 +150,7 @@ VACUUM VERBOSE t;
INFO: statistics for "t":
storage id: xxxxx
total file size: 122880, total data size: 10754
compression rate: 1.00x
total row count: 2530, stripe count: 3, average rows per stripe: 843
chunk count: 3, containing data for dropped columns: 0, none compressed: 3, pglz compressed: 0
@@ -197,6 +198,7 @@ VACUUM VERBOSE t;
INFO: statistics for "t":
storage id: xxxxx
total file size: 57344, total data size: 18808
compression rate: 1.25x
total row count: 5530, stripe count: 5, average rows per stripe: 1106
chunk count: 7, containing data for dropped columns: 0, none compressed: 5, pglz compressed: 2
@@ -214,6 +216,7 @@ VACUUM VERBOSE t;
INFO: statistics for "t":
storage id: xxxxx
total file size: 73728, total data size: 31372
compression rate: 1.15x
total row count: 7030, stripe count: 6, average rows per stripe: 1171
chunk count: 11, containing data for dropped columns: 2, none compressed: 9, pglz compressed: 2
@@ -231,6 +234,7 @@ VACUUM VERBOSE t;
INFO: statistics for "t":
storage id: xxxxx
total file size: 57344, total data size: 15728
compression rate: 1.96x
total row count: 7030, stripe count: 4, average rows per stripe: 1757
chunk count: 8, containing data for dropped columns: 0, none compressed: 2, pglz compressed: 6
@@ -243,3 +247,18 @@ SELECT count(distinct storageid) - :columnar_table_count FROM cstore.cstore_stri
0
(1 row)
-- A table with high compression ratio
SET cstore.compression TO 'pglz';
SET cstore.stripe_row_count TO 1000000;
SET cstore.chunk_row_count TO 100000;
CREATE TABLE t(a int, b char, c text) USING columnar;
INSERT INTO t SELECT 1, 'a', 'xyz' FROM generate_series(1, 1000000) i;
VACUUM VERBOSE t;
INFO: statistics for "t":
storage id: xxxxx
total file size: 630784, total data size: 604480
compression rate: 33.71x
total row count: 1000000, stripe count: 1, average rows per stripe: 1000000
chunk count: 30, containing data for dropped columns: 0, none compressed: 0, pglz compressed: 30
DROP TABLE t;

View File

@@ -13,6 +13,7 @@ step s1-insert:
s2: INFO: statistics for "test_vacuum_vs_insert":
storage id: xxxxx
total file size: 24576, total data size: 26
compression rate: 1.00x
total row count: 3, stripe count: 1, average rows per stripe: 3
chunk count: 2, containing data for dropped columns: 0, none compressed: 2, pglz compressed: 0

View File

@@ -111,6 +111,7 @@ VACUUM VERBOSE test_options_1;
INFO: statistics for "test_options_1":
storage id: xxxxx
total file size: 65536, total data size: 43136
compression rate: 1.91x
total row count: 10000, stripe count: 2, average rows per stripe: 5000
chunk count: 20, containing data for dropped columns: 0, none compressed: 10, pglz compressed: 10
@@ -130,6 +131,7 @@ VACUUM VERBOSE test_options_2;
INFO: statistics for "test_options_2":
storage id: xxxxx
total file size: 163840, total data size: 125636
compression rate: 1.31x
total row count: 20000, stripe count: 4, average rows per stripe: 5000
chunk count: 30, containing data for dropped columns: 0, none compressed: 20, pglz compressed: 10

View File

@@ -111,3 +111,14 @@ DROP VIEW t_stripes;
-- Make sure we cleaned the metadata for t too
SELECT count(distinct storageid) - :columnar_table_count FROM cstore.cstore_stripes;
-- A table with high compression ratio
SET cstore.compression TO 'pglz';
SET cstore.stripe_row_count TO 1000000;
SET cstore.chunk_row_count TO 100000;
CREATE TABLE t(a int, b char, c text) USING columnar;
INSERT INTO t SELECT 1, 'a', 'xyz' FROM generate_series(1, 1000000) i;
VACUUM VERBOSE t;
DROP TABLE t;