From dba84214536f8de820219856a19d2e3f8f06ead6 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Fri, 25 Jun 2021 16:22:14 +0300 Subject: [PATCH 1/3] Refactor ColumnarScanCost into ColumnarPerChunkGroupScanCost --- src/backend/columnar/columnar_customscan.c | 82 +++++++++++++++------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/src/backend/columnar/columnar_customscan.c b/src/backend/columnar/columnar_customscan.c index 88760c654..e2914e509 100644 --- a/src/backend/columnar/columnar_customscan.c +++ b/src/backend/columnar/columnar_customscan.c @@ -60,7 +60,10 @@ static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPred static bool IsNotIndexPath(Path *path); static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); -static Cost ColumnarScanCost(RangeTblEntry *rte); +static Cost ColumnarScanCost(Oid relationId, int numberOfColumnsRead); +static Cost ColumnarPerStripeScanCost(Oid relationId, + int numberOfColumnsRead); +static uint64 ColumnarTableStripeCount(Oid relationId); static Plan * ColumnarScanPath_PlanCustomPath(PlannerInfo *root, RelOptInfo *rel, struct CustomPath *best_path, @@ -265,7 +268,9 @@ CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) */ path->rows = rel->rows; path->startup_cost = 0; - path->total_cost = path->startup_cost + ColumnarScanCost(rte); + int numberOfColumnsRead = bms_num_members(rte->selectedCols); + path->total_cost = path->startup_cost + + ColumnarScanCost(rte->relid, numberOfColumnsRead); return (Path *) cspath; } @@ -277,42 +282,65 @@ CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) * need to be read. */ static Cost -ColumnarScanCost(RangeTblEntry *rte) +ColumnarScanCost(Oid relationId, int numberOfColumnsRead) { - Relation rel = RelationIdGetRelation(rte->relid); - List *stripeList = StripesForRelfilenode(rel->rd_node); - RelationClose(rel); + return ColumnarTableStripeCount(relationId) * + ColumnarPerStripeScanCost(relationId, numberOfColumnsRead); +} + + +/* + * ColumnarPerStripeScanCost calculates the cost to scan a single stripe + * of given columnar table based on number of columns that needs to be + * read during scan operation. + */ +static Cost +ColumnarPerStripeScanCost(Oid relationId, int numberOfColumnsRead) +{ + Relation relation = RelationIdGetRelation(relationId); + List *stripeList = StripesForRelfilenode(relation->rd_node); + RelationClose(relation); uint32 maxColumnCount = 0; uint64 totalStripeSize = 0; - ListCell *stripeMetadataCell = NULL; - rel = NULL; - - foreach(stripeMetadataCell, stripeList) + StripeMetadata *stripeMetadata = NULL; + foreach_ptr(stripeMetadata, stripeList) { - StripeMetadata *stripeMetadata = (StripeMetadata *) lfirst(stripeMetadataCell); totalStripeSize += stripeMetadata->dataLength; maxColumnCount = Max(maxColumnCount, stripeMetadata->columnCount); } + /* + * When no stripes are in the table we don't have a count in maxColumnCount. To + * prevent a division by zero turning into a NaN we keep the ratio on zero. + * This will result in a cost of 0 for scanning the table which is a reasonable + * cost on an empty table. + */ + if (maxColumnCount == 0) { - Bitmapset *attr_needed = rte->selectedCols; - double numberOfColumnsRead = bms_num_members(attr_needed); - double selectionRatio = 0; - - /* - * When no stripes are in the table we don't have a count in maxColumnCount. To - * prevent a division by zero turning into a NaN we keep the ratio on zero. - * This will result in a cost of 0 for scanning the table which is a reasonable - * cost on an empty table. - */ - if (maxColumnCount != 0) - { - selectionRatio = numberOfColumnsRead / (double) maxColumnCount; - } - Cost scanCost = (double) totalStripeSize / BLCKSZ * selectionRatio; - return scanCost; + return 0; } + + double columnSelectionRatio = numberOfColumnsRead / (double) maxColumnCount; + Cost tableScanCost = (double) totalStripeSize / BLCKSZ * columnSelectionRatio; + Cost perStripeScanCost = tableScanCost / list_length(stripeList); + return perStripeScanCost; +} + + +/* + * ColumnarTableStripeCount returns the number of stripes that columnar + * table with relationId has by using stripe metadata. + */ +static uint64 +ColumnarTableStripeCount(Oid relationId) +{ + Relation relation = RelationIdGetRelation(relationId); + List *stripeList = StripesForRelfilenode(relation->rd_node); + int stripeCount = list_length(stripeList); + RelationClose(relation); + + return stripeCount; } From 8adcf2096b28a4a0eb5402ab22d325e104d468e8 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Mon, 5 Jul 2021 17:52:56 +0300 Subject: [PATCH 2/3] Multiply ColumnarCustomScan cost by tblspace.seqpage cost --- src/backend/columnar/columnar_customscan.c | 27 +++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/backend/columnar/columnar_customscan.c b/src/backend/columnar/columnar_customscan.c index e2914e509..748049740 100644 --- a/src/backend/columnar/columnar_customscan.c +++ b/src/backend/columnar/columnar_customscan.c @@ -23,6 +23,7 @@ #include "optimizer/paths.h" #include "optimizer/restrictinfo.h" #include "utils/relcache.h" +#include "utils/spccache.h" #include "columnar/columnar_customscan.h" #include "columnar/columnar_metadata.h" @@ -60,8 +61,8 @@ static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPred static bool IsNotIndexPath(Path *path); static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); -static Cost ColumnarScanCost(Oid relationId, int numberOfColumnsRead); -static Cost ColumnarPerStripeScanCost(Oid relationId, +static Cost ColumnarScanCost(RelOptInfo *rel, Oid relationId, int numberOfColumnsRead); +static Cost ColumnarPerStripeScanCost(RelOptInfo *rel, Oid relationId, int numberOfColumnsRead); static uint64 ColumnarTableStripeCount(Oid relationId); static Plan * ColumnarScanPath_PlanCustomPath(PlannerInfo *root, @@ -270,7 +271,7 @@ CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) path->startup_cost = 0; int numberOfColumnsRead = bms_num_members(rte->selectedCols); path->total_cost = path->startup_cost + - ColumnarScanCost(rte->relid, numberOfColumnsRead); + ColumnarScanCost(rel, rte->relid, numberOfColumnsRead); return (Path *) cspath; } @@ -282,10 +283,10 @@ CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) * need to be read. */ static Cost -ColumnarScanCost(Oid relationId, int numberOfColumnsRead) +ColumnarScanCost(RelOptInfo *rel, Oid relationId, int numberOfColumnsRead) { return ColumnarTableStripeCount(relationId) * - ColumnarPerStripeScanCost(relationId, numberOfColumnsRead); + ColumnarPerStripeScanCost(rel, relationId, numberOfColumnsRead); } @@ -295,7 +296,7 @@ ColumnarScanCost(Oid relationId, int numberOfColumnsRead) * read during scan operation. */ static Cost -ColumnarPerStripeScanCost(Oid relationId, int numberOfColumnsRead) +ColumnarPerStripeScanCost(RelOptInfo *rel, Oid relationId, int numberOfColumnsRead) { Relation relation = RelationIdGetRelation(relationId); List *stripeList = StripesForRelfilenode(relation->rd_node); @@ -324,6 +325,20 @@ ColumnarPerStripeScanCost(Oid relationId, int numberOfColumnsRead) double columnSelectionRatio = numberOfColumnsRead / (double) maxColumnCount; Cost tableScanCost = (double) totalStripeSize / BLCKSZ * columnSelectionRatio; Cost perStripeScanCost = tableScanCost / list_length(stripeList); + + /* + * Finally, multiply the cost of reading a single stripe by seq page read + * cost to make our estimation scale compatible with postgres. + * Since we are calculating the cost for a single stripe here, we use seq + * page cost instead of random page cost. This is because, random page + * access only happens when switching between columns, which is pretty + * much neglactable. + */ + double relSpaceSeqPageCost; + get_tablespace_page_costs(rel->reltablespace, + NULL, &relSpaceSeqPageCost); + perStripeScanCost = perStripeScanCost * relSpaceSeqPageCost; + return perStripeScanCost; } From 297f59a70eefc32131c67ecfe3c38d3cb3519ec5 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Thu, 24 Jun 2021 20:40:54 +0300 Subject: [PATCH 3/3] Re-cost columnar table index paths --- src/backend/columnar/columnar_customscan.c | 197 +++++++++++- src/test/regress/columnar_schedule | 2 +- src/test/regress/expected/columnar_paths.out | 302 ++++++++++++++++++ .../expected/columnar_test_helpers.out | 27 ++ src/test/regress/sql/columnar_paths.sql | 203 ++++++++++++ .../regress/sql/columnar_test_helpers.sql | 29 ++ 6 files changed, 755 insertions(+), 5 deletions(-) create mode 100644 src/test/regress/expected/columnar_paths.out create mode 100644 src/test/regress/sql/columnar_paths.sql diff --git a/src/backend/columnar/columnar_customscan.c b/src/backend/columnar/columnar_customscan.c index 748049740..50354be46 100644 --- a/src/backend/columnar/columnar_customscan.c +++ b/src/backend/columnar/columnar_customscan.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/amapi.h" #include "access/skey.h" #include "nodes/extensible.h" #include "nodes/pg_list.h" @@ -25,6 +26,7 @@ #include "utils/relcache.h" #include "utils/spccache.h" +#include "columnar/columnar.h" #include "columnar/columnar_customscan.h" #include "columnar/columnar_metadata.h" #include "columnar/columnar_tableam.h" @@ -59,6 +61,14 @@ static void ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index RangeTblEntry *rte); static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate); static bool IsNotIndexPath(Path *path); +static void RecostColumnarIndexPaths(PlannerInfo *root, RelOptInfo *rel, Oid relationId); +static void RecostColumnarIndexPath(PlannerInfo *root, RelOptInfo *rel, Oid relationId, + IndexPath *indexPath); +static Cost ColumnarIndexScanAddStartupCost(RelOptInfo *rel, Oid relationId, + IndexPath *indexPath); +static Cost ColumnarIndexScanAddTotalCost(PlannerInfo *root, RelOptInfo *rel, + Oid relationId, IndexPath *indexPath); +static int RelationIdGetNumberOfAttributes(Oid relationId); static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); static Cost ColumnarScanCost(RelOptInfo *rel, Oid relationId, int numberOfColumnsRead); @@ -187,12 +197,13 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti, ereport(DEBUG1, (errmsg("pathlist hook for columnar table am"))); /* - * TODO: Since we don't have a proper costing model for - * ColumnarCustomScan, we remove other paths to force postgres - * using ColumnarCustomScan. Note that we still keep index paths - * since they still might be useful. + * When columnar custom scan is enabled (columnar.enable_custom_scan), + * we only consider ColumnarScanPath's & IndexPath's. For this reason, + * we remove other paths and re-estimate IndexPath costs to make accurate + * comparisons between them. */ RemovePathsByPredicate(rel, IsNotIndexPath); + RecostColumnarIndexPaths(root, rel, rte->relid); add_path(rel, customPath); } } @@ -232,6 +243,184 @@ IsNotIndexPath(Path *path) } +/* + * RecostColumnarIndexPaths re-costs index paths of given RelOptInfo for + * columnar table with relationId. + */ +static void +RecostColumnarIndexPaths(PlannerInfo *root, RelOptInfo *rel, Oid relationId) +{ + Path *path = NULL; + foreach_ptr(path, rel->pathlist) + { + /* + * Since we don't provide implementations for scan_bitmap_next_block + * & scan_bitmap_next_tuple, postgres doesn't generate bitmap index + * scan paths for columnar tables already (see related comments in + * TableAmRoutine). For this reason, we only consider IndexPath's + * here. + */ + if (IsA(path, IndexPath)) + { + RecostColumnarIndexPath(root, rel, relationId, (IndexPath *) path); + } + } +} + + +/* + * RecostColumnarIndexPath re-costs given index path for columnar table with + * relationId. + */ +static void +RecostColumnarIndexPath(PlannerInfo *root, RelOptInfo *rel, Oid relationId, + IndexPath *indexPath) +{ + ereport(DEBUG4, (errmsg("columnar table index scan costs estimated by " + "indexAM: startup cost = %.10f, total cost = " + "%.10f", indexPath->path.startup_cost, + indexPath->path.total_cost))); + + /* + * We estimate the cost for columnar table read during index scan. Also, + * instead of overwriting startup & total costs, we "add" ours to the + * costs estimated by indexAM since we should consider index traversal + * related costs too. + */ + Cost indexAMStartupCost = indexPath->path.startup_cost; + Cost indexAMScanCost = indexPath->path.total_cost - indexAMStartupCost; + + Cost columnarIndexScanStartupCost = ColumnarIndexScanAddStartupCost(rel, relationId, + indexPath); + Cost columnarIndexScanCost = ColumnarIndexScanAddTotalCost(root, rel, relationId, + indexPath); + + indexPath->path.startup_cost = indexAMStartupCost + columnarIndexScanStartupCost; + indexPath->path.total_cost = indexPath->path.startup_cost + + indexAMScanCost + columnarIndexScanCost; + + ereport(DEBUG4, (errmsg("columnar table index scan costs re-estimated " + "by columnarAM (including indexAM costs): " + "startup cost = %.10f, total cost = %.10f", + indexPath->path.startup_cost, + indexPath->path.total_cost))); +} + + +/* + * ColumnarIndexScanAddStartupCost returns additional startup cost estimated + * for index scan described by IndexPath for columnar table with relationId. + */ +static Cost +ColumnarIndexScanAddStartupCost(RelOptInfo *rel, Oid relationId, IndexPath *indexPath) +{ + int numberOfColumnsRead = RelationIdGetNumberOfAttributes(relationId); + + /* we would at least read one stripe */ + return ColumnarPerStripeScanCost(rel, relationId, numberOfColumnsRead); +} + + +/* + * ColumnarIndexScanAddTotalCost returns additional cost estimated for + * index scan described by IndexPath for columnar table with relationId. + */ +static Cost +ColumnarIndexScanAddTotalCost(PlannerInfo *root, RelOptInfo *rel, + Oid relationId, IndexPath *indexPath) +{ + int numberOfColumnsRead = RelationIdGetNumberOfAttributes(relationId); + Cost perStripeCost = ColumnarPerStripeScanCost(rel, relationId, numberOfColumnsRead); + + /* + * We don't need to pass correct loop count to amcostestimate since we + * will only use index correlation & index selectivity, and loop count + * doesn't have any effect on those two. + */ + double fakeLoopCount = 1; + Cost fakeIndexStartupCost; + Cost fakeIndexTotalCost; + double fakeIndexPages; + Selectivity indexSelectivity; + double indexCorrelation; + amcostestimate_function amcostestimate = indexPath->indexinfo->amcostestimate; + amcostestimate(root, indexPath, fakeLoopCount, &fakeIndexStartupCost, + &fakeIndexTotalCost, &indexSelectivity, + &indexCorrelation, &fakeIndexPages); + + Relation relation = RelationIdGetRelation(relationId); + uint64 rowCount = ColumnarTableRowCount(relation); + RelationClose(relation); + double estimatedRows = rowCount * indexSelectivity; + + /* + * In the worst case (i.e no correlation between the column & the index), + * we need to read a different stripe for each row. + */ + double maxStripeReadCount = estimatedRows; + + /* + * In the best case (i.e the column is fully correlated with the index), + * we wouldn't read the same stripe again and again thanks + * to locality. + */ + double avgStripeRowCount = + rowCount / (double) ColumnarTableStripeCount(relationId); + double minStripeReadCount = estimatedRows / avgStripeRowCount; + + /* + * While being close to 0 means low correlation, being close to -1 or +1 + * means high correlation. For index scans on columnar tables, it doesn't + * matter if the column and the index are "correlated" (+1) or + * "anti-correlated" (-1) since both help us avoiding from reading the + * same stripe again and again. + */ + double absIndexCorrelation = Abs(indexCorrelation); + + /* + * To estimate the number of stripes that we need to read, we do linear + * interpolation between minStripeReadCount & maxStripeReadCount. To do + * that, we use complement to 1 of absolute correlation, where being + * close to 0 means high correlation and being close to 1 means low + * correlation. + * In practice, we only want to do an index scan when absIndexCorrelation + * is 1 (or extremely close to it), or when the absolute number of tuples + * returned is very small. Other cases will have a prohibitive cost. + */ + double complementIndexCorrelation = 1 - absIndexCorrelation; + double estimatedStripeReadCount = + minStripeReadCount + complementIndexCorrelation * (maxStripeReadCount - + minStripeReadCount); + + Cost scanCost = perStripeCost * estimatedStripeReadCount; + + ereport(DEBUG4, (errmsg("re-costing index scan for columnar table: " + "selectivity = %.10f, complement abs " + "correlation = %.10f, per stripe cost = %.10f, " + "estimated stripe read count = %.10f, " + "total additional cost = %.10f", + indexSelectivity, complementIndexCorrelation, + perStripeCost, estimatedStripeReadCount, + scanCost))); + + return scanCost; +} + + +/* + * RelationIdGetNumberOfAttributes returns number of attributes that relation + * with relationId has. + */ +static int +RelationIdGetNumberOfAttributes(Oid relationId) +{ + Relation relation = RelationIdGetRelation(relationId); + int nattrs = relation->rd_att->natts; + RelationClose(relation); + return nattrs; +} + + static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { diff --git a/src/test/regress/columnar_schedule b/src/test/regress/columnar_schedule index 218b04c4a..84a106c81 100644 --- a/src/test/regress/columnar_schedule +++ b/src/test/regress/columnar_schedule @@ -9,7 +9,7 @@ test: columnar_analyze test: columnar_data_types test: columnar_drop test: columnar_indexes -test: columnar_fallback_scan +test: columnar_fallback_scan columnar_paths test: columnar_partitioning test: columnar_permissions test: columnar_empty diff --git a/src/test/regress/expected/columnar_paths.out b/src/test/regress/expected/columnar_paths.out new file mode 100644 index 000000000..cf5a86a64 --- /dev/null +++ b/src/test/regress/expected/columnar_paths.out @@ -0,0 +1,302 @@ +CREATE SCHEMA columnar_paths; +SET search_path TO columnar_paths; +CREATE TABLE full_correlated (a int, b text, c int, d int) USING columnar; +INSERT INTO full_correlated SELECT i, i::text FROM generate_series(1, 1000000) i; +CREATE INDEX full_correlated_btree ON full_correlated (a); +ANALYZE full_correlated; +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a=200; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a<0; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a>10 AND a<20; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a>1000000; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a>900000; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a<1000; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a,b FROM full_correlated WHERE a<3000; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a<9000; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +BEGIN; + TRUNCATE full_correlated; + INSERT INTO full_correlated SELECT i, i::text FROM generate_series(1, 1000) i; + -- Since we have much smaller number of rows, selectivity of below + -- query should be much higher. So we would choose columnar custom scan. + SELECT columnar_test_helpers.uses_custom_scan ( + $$ + SELECT a FROM full_correlated WHERE a=200; + $$ + ); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +ROLLBACK; +-- same filter used in above, but choosing multiple columns would increase +-- custom scan cost, so we would prefer index scan this time +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a,b,c,d FROM full_correlated WHERE a<9000; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +-- again same filter used in above, but we would choose custom scan this +-- time since it would read three less columns from disk +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT c FROM full_correlated WHERE a<10000; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a>200; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a=0 OR a=5; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +DROP INDEX full_correlated_btree; +CREATE INDEX full_correlated_hash ON full_correlated USING hash(a); +ANALYZE full_correlated; +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a<10; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a>1 AND a<10; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a=0 OR a=5; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a=1000; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a,c FROM full_correlated WHERE a=1000; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +CREATE TABLE full_anti_correlated (a int, b text) USING columnar; +INSERT INTO full_anti_correlated SELECT i, i::text FROM generate_series(1, 500000) i; +CREATE INDEX full_anti_correlated_hash ON full_anti_correlated USING hash(b); +ANALYZE full_anti_correlated; +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE b='600'; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a,b FROM full_anti_correlated WHERE b='600'; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a,b FROM full_anti_correlated WHERE b='600' OR b='10'; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +DROP INDEX full_anti_correlated_hash; +CREATE INDEX full_anti_correlated_btree ON full_anti_correlated (a,b); +ANALYZE full_anti_correlated; +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE a>6500 AND a<7000 AND b<'10000'; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE a>2000 AND a<7000; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE a>2000 AND a<7000 AND b='24'; +$$ +); + uses_index_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE a<7000 AND b<'10000'; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +CREATE TABLE no_correlation (a int, b text) USING columnar; +INSERT INTO no_correlation SELECT random()*5000, (random()*5000)::int::text FROM generate_series(1, 500000) i; +CREATE INDEX no_correlation_btree ON no_correlation (a); +ANALYZE no_correlation; +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM no_correlation WHERE a < 2; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM no_correlation WHERE a = 200; +$$ +); + uses_custom_scan +--------------------------------------------------------------------- + t +(1 row) + +SET client_min_messages TO WARNING; +DROP SCHEMA columnar_paths CASCADE; diff --git a/src/test/regress/expected/columnar_test_helpers.out b/src/test/regress/expected/columnar_test_helpers.out index 2f7cad23b..6b1421de8 100644 --- a/src/test/regress/expected/columnar_test_helpers.out +++ b/src/test/regress/expected/columnar_test_helpers.out @@ -77,3 +77,30 @@ CREATE FUNCTION top_memory_context_usage() RETURNS BIGINT AS $$ SELECT TopMemoryContext FROM columnar_test_helpers.columnar_store_memory_stats(); $$ LANGUAGE SQL VOLATILE; +CREATE OR REPLACE FUNCTION uses_index_scan(command text) +RETURNS BOOLEAN AS $$ +DECLARE + query_plan text; +BEGIN + FOR query_plan IN EXECUTE 'EXPLAIN' || command LOOP + IF query_plan ILIKE '%Index Only Scan using%' OR + query_plan ILIKE '%Index Scan using%' + THEN + RETURN true; + END IF; + END LOOP; + RETURN false; +END; $$ language plpgsql; +CREATE OR REPLACE FUNCTION uses_custom_scan(command text) +RETURNS BOOLEAN AS $$ +DECLARE + query_plan text; +BEGIN + FOR query_plan IN EXECUTE 'EXPLAIN' || command LOOP + IF query_plan ILIKE '%Custom Scan (ColumnarScan)%' + THEN + RETURN true; + END IF; + END LOOP; + RETURN false; +END; $$ language plpgsql; diff --git a/src/test/regress/sql/columnar_paths.sql b/src/test/regress/sql/columnar_paths.sql new file mode 100644 index 000000000..be128ba9b --- /dev/null +++ b/src/test/regress/sql/columnar_paths.sql @@ -0,0 +1,203 @@ +CREATE SCHEMA columnar_paths; +SET search_path TO columnar_paths; + +CREATE TABLE full_correlated (a int, b text, c int, d int) USING columnar; +INSERT INTO full_correlated SELECT i, i::text FROM generate_series(1, 1000000) i; +CREATE INDEX full_correlated_btree ON full_correlated (a); +ANALYZE full_correlated; + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a=200; +$$ +); + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a<0; +$$ +); + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a>10 AND a<20; +$$ +); + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a>1000000; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a>900000; +$$ +); + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_correlated WHERE a<1000; +$$ +); + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a,b FROM full_correlated WHERE a<3000; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a<9000; +$$ +); + +BEGIN; + TRUNCATE full_correlated; + INSERT INTO full_correlated SELECT i, i::text FROM generate_series(1, 1000) i; + + -- Since we have much smaller number of rows, selectivity of below + -- query should be much higher. So we would choose columnar custom scan. + SELECT columnar_test_helpers.uses_custom_scan ( + $$ + SELECT a FROM full_correlated WHERE a=200; + $$ + ); +ROLLBACK; + +-- same filter used in above, but choosing multiple columns would increase +-- custom scan cost, so we would prefer index scan this time +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a,b,c,d FROM full_correlated WHERE a<9000; +$$ +); + +-- again same filter used in above, but we would choose custom scan this +-- time since it would read three less columns from disk +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT c FROM full_correlated WHERE a<10000; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a>200; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a=0 OR a=5; +$$ +); + +DROP INDEX full_correlated_btree; + +CREATE INDEX full_correlated_hash ON full_correlated USING hash(a); +ANALYZE full_correlated; + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a<10; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a>1 AND a<10; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a=0 OR a=5; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_correlated WHERE a=1000; +$$ +); + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a,c FROM full_correlated WHERE a=1000; +$$ +); + +CREATE TABLE full_anti_correlated (a int, b text) USING columnar; +INSERT INTO full_anti_correlated SELECT i, i::text FROM generate_series(1, 500000) i; +CREATE INDEX full_anti_correlated_hash ON full_anti_correlated USING hash(b); +ANALYZE full_anti_correlated; + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE b='600'; +$$ +); + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a,b FROM full_anti_correlated WHERE b='600'; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a,b FROM full_anti_correlated WHERE b='600' OR b='10'; +$$ +); + +DROP INDEX full_anti_correlated_hash; + +CREATE INDEX full_anti_correlated_btree ON full_anti_correlated (a,b); +ANALYZE full_anti_correlated; + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE a>6500 AND a<7000 AND b<'10000'; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE a>2000 AND a<7000; +$$ +); + +SELECT columnar_test_helpers.uses_index_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE a>2000 AND a<7000 AND b='24'; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM full_anti_correlated WHERE a<7000 AND b<'10000'; +$$ +); + +CREATE TABLE no_correlation (a int, b text) USING columnar; +INSERT INTO no_correlation SELECT random()*5000, (random()*5000)::int::text FROM generate_series(1, 500000) i; +CREATE INDEX no_correlation_btree ON no_correlation (a); +ANALYZE no_correlation; + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM no_correlation WHERE a < 2; +$$ +); + +SELECT columnar_test_helpers.uses_custom_scan ( +$$ +SELECT a FROM no_correlation WHERE a = 200; +$$ +); + +SET client_min_messages TO WARNING; +DROP SCHEMA columnar_paths CASCADE; diff --git a/src/test/regress/sql/columnar_test_helpers.sql b/src/test/regress/sql/columnar_test_helpers.sql index 08438a75f..07b6230ab 100644 --- a/src/test/regress/sql/columnar_test_helpers.sql +++ b/src/test/regress/sql/columnar_test_helpers.sql @@ -84,3 +84,32 @@ CREATE FUNCTION top_memory_context_usage() RETURNS BIGINT AS $$ SELECT TopMemoryContext FROM columnar_test_helpers.columnar_store_memory_stats(); $$ LANGUAGE SQL VOLATILE; + +CREATE OR REPLACE FUNCTION uses_index_scan(command text) +RETURNS BOOLEAN AS $$ +DECLARE + query_plan text; +BEGIN + FOR query_plan IN EXECUTE 'EXPLAIN' || command LOOP + IF query_plan ILIKE '%Index Only Scan using%' OR + query_plan ILIKE '%Index Scan using%' + THEN + RETURN true; + END IF; + END LOOP; + RETURN false; +END; $$ language plpgsql; + +CREATE OR REPLACE FUNCTION uses_custom_scan(command text) +RETURNS BOOLEAN AS $$ +DECLARE + query_plan text; +BEGIN + FOR query_plan IN EXECUTE 'EXPLAIN' || command LOOP + IF query_plan ILIKE '%Custom Scan (ColumnarScan)%' + THEN + RETURN true; + END IF; + END LOOP; + RETURN false; +END; $$ language plpgsql;