Merge pull request #5140 from citusdata/col/seq-path-costing

Re-cost columnar table sequential scan paths 

With the changes in this pr, we adjust the cost estimates done by postgres for sequential scan paths for columnar tables.

We want to make better decisions when columnar custom scan is disabled too. That means, there are cases where index scan is more preferable over sequential scan for heapAM but not for columnarAM. 
For this reason, we want to make better decisions regarding whether to choose index scan or sequential scan when columnar custom scan is **disabled**.
So with this pr, we re-estimate costs for sequential scan paths in a way that is quite similar to what we do for columnar custom scan.

The idea is that columnar custom scan uses projection pushdown so the cost is directly proportional to column selectivity. However, for sequential scan, we re-estimate the cost considering **all** the columns since projection pushdown is not supported for plain sequential scan.
One thing to note here is that we still don't consider chunk group filtering when estimating the cost for columnar custom scan. For this reason, we calculate the same costs for sequential scan & columnar custom scan if query reads all columns, regardless of the filters in the `where` clause.
To avoid mistakenly choosing sequential scan in such cases, we still remove non `IndexPath`s if columnar custom scan is enabled.
That way, even when we calculate the same cost for sequential scan and columnar scan, we will anyway remove sequential one and guarantee that we would choose either columnar custom scan or index scan.
pull/5130/head
Onur Tirtir 2021-08-02 11:38:11 +03:00 committed by GitHub
commit ef6a8604ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 624 additions and 13 deletions

View File

@ -19,6 +19,7 @@
#include "nodes/extensible.h"
#include "nodes/pg_list.h"
#include "nodes/plannodes.h"
#include "optimizer/cost.h"
#include "optimizer/optimizer.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
@ -61,13 +62,16 @@ static void ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index
RangeTblEntry *rte);
static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate);
static bool IsNotIndexPath(Path *path);
static void RecostColumnarIndexPaths(PlannerInfo *root, RelOptInfo *rel, Oid relationId);
static Path * CreateColumnarSeqScanPath(PlannerInfo *root, RelOptInfo *rel,
Oid relationId);
static void RecostColumnarPaths(PlannerInfo *root, RelOptInfo *rel, Oid relationId);
static void RecostColumnarIndexPath(PlannerInfo *root, RelOptInfo *rel, Oid relationId,
IndexPath *indexPath);
static Cost ColumnarIndexScanAddStartupCost(RelOptInfo *rel, Oid relationId,
IndexPath *indexPath);
static Cost ColumnarIndexScanAddTotalCost(PlannerInfo *root, RelOptInfo *rel,
Oid relationId, IndexPath *indexPath);
static void RecostColumnarSeqPath(RelOptInfo *rel, Oid relationId, Path *path);
static int RelationIdGetNumberOfAttributes(Oid relationId);
static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel,
RangeTblEntry *rte);
@ -190,6 +194,22 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
/* columnar doesn't support parallel paths */
rel->partial_pathlist = NIL;
/*
* There are cases where IndexPath is normally more preferable over
* SeqPath for heapAM but not for columnarAM. In such cases, an
* IndexPath could wrongly dominate a SeqPath based on the costs
* estimated by postgres earlier. For this reason, here we manually
* create a SeqPath, estimate the cost based on columnarAM and append
* to pathlist.
*
* Before doing that, we first re-cost all the existing paths so that
* add_path makes correct cost comparisons when appending our SeqPath.
*/
RecostColumnarPaths(root, rel, rte->relid);
Path *seqPath = CreateColumnarSeqScanPath(root, rel, rte->relid);
add_path(rel, seqPath);
if (EnableColumnarCustomScan)
{
Path *customPath = CreateColumnarScanPath(root, rel, rte);
@ -201,9 +221,15 @@ ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
* we only consider ColumnarScanPath's & IndexPath's. For this reason,
* we remove other paths and re-estimate IndexPath costs to make accurate
* comparisons between them.
*
* Even more, we might calculate an equal cost for a
* ColumnarCustomScan and a SeqPath if we are reading all columns
* of given table since we don't consider chunk group filtering
* when costing ColumnarCustomScan.
* In that case, if we don't remove SeqPath's, we might wrongly choose
* SeqPath thinking that its cost would be equal to ColumnarCustomScan.
*/
RemovePathsByPredicate(rel, IsNotIndexPath);
RecostColumnarIndexPaths(root, rel, rte->relid);
add_path(rel, customPath);
}
}
@ -244,26 +270,47 @@ IsNotIndexPath(Path *path)
/*
 * CreateColumnarSeqScanPath returns Path for sequential scan on columnar
 * table with relationId.
 */
static Path *
CreateColumnarSeqScanPath(PlannerInfo *root, RelOptInfo *rel, Oid relationId)
{
	/* columnar doesn't support parallel scan */
	int parallelWorkers = 0;

	Relids requiredOuter = rel->lateral_relids;
	Path *path = create_seqscan_path(root, rel, requiredOuter, parallelWorkers);

	/* replace the heapAM cost estimate with one based on columnarAM */
	RecostColumnarSeqPath(rel, relationId, path);
	return path;
}
/*
 * RecostColumnarPaths re-costs paths of given RelOptInfo for
 * columnar table with relationId.
 */
static void
RecostColumnarPaths(PlannerInfo *root, RelOptInfo *rel, Oid relationId)
{
	Path *path = NULL;
	foreach_ptr(path, rel->pathlist)
	{
		if (IsA(path, IndexPath))
		{
			/*
			 * Since we don't provide implementations for scan_bitmap_next_block
			 * & scan_bitmap_next_tuple, postgres doesn't generate bitmap index
			 * scan paths for columnar tables already (see related comments in
			 * TableAmRoutine). For this reason, we only consider IndexPath's
			 * here.
			 */
			RecostColumnarIndexPath(root, rel, relationId, (IndexPath *) path);
		}
		else if (path->pathtype == T_SeqScan)
		{
			RecostColumnarSeqPath(rel, relationId, path);
		}
	}
}
@ -276,6 +323,12 @@ static void
RecostColumnarIndexPath(PlannerInfo *root, RelOptInfo *rel, Oid relationId,
IndexPath *indexPath)
{
if (!enable_indexscan)
{
/* costs are already set to disable_cost, don't adjust them */
return;
}
ereport(DEBUG4, (errmsg("columnar table index scan costs estimated by "
"indexAM: startup cost = %.10f, total cost = "
"%.10f", indexPath->path.startup_cost,
@ -407,6 +460,33 @@ ColumnarIndexScanAddTotalCost(PlannerInfo *root, RelOptInfo *rel,
}
/*
 * RecostColumnarSeqPath re-costs given seq path for columnar table with
 * relationId.
 */
static void
RecostColumnarSeqPath(RelOptInfo *rel, Oid relationId, Path *path)
{
	if (!enable_seqscan)
	{
		/* costs are already set to disable_cost, don't adjust them */
		return;
	}

	/*
	 * Seq scan doesn't support projection pushdown, so we will read all the
	 * columns.
	 * Also note that seq scan doesn't support chunk group filtering too but
	 * our costing model already doesn't consider chunk group filtering.
	 */
	int columnCount = RelationIdGetNumberOfAttributes(relationId);

	path->startup_cost = 0;
	path->total_cost = path->startup_cost +
					   ColumnarScanCost(rel, relationId, columnCount);
}
/*
* RelationIdGetNumberOfAttributes returns number of attributes that relation
* with relationId has.

View File

@ -105,9 +105,9 @@ EXPLAIN (costs off) SELECT count(*), sum(i), min(i), max(i) FROM parent;
Workers Planned: 4
-> Partial Aggregate
-> Parallel Append
-> Seq Scan on p3 parent_4
-> Seq Scan on p0 parent_1
-> Seq Scan on p1 parent_2
-> Seq Scan on p3 parent_4
-> Parallel Seq Scan on p2 parent_3
(9 rows)
@ -123,6 +123,45 @@ SET min_parallel_table_scan_size TO DEFAULT;
SET parallel_tuple_cost TO DEFAULT;
SET max_parallel_workers TO DEFAULT;
SET max_parallel_workers_per_gather TO DEFAULT;
CREATE INDEX parent_btree ON parent (n);
ANALYZE parent;
-- will use columnar custom scan on columnar partitions but index
-- scan on heap partition
EXPLAIN (costs off) SELECT count(*), sum(i), min(i), max(i) FROM parent
WHERE ts > '2020-02-20' AND n < 5;
QUERY PLAN
---------------------------------------------------------------------
Aggregate
-> Append
-> Custom Scan (ColumnarScan) on p1 parent_1
Filter: ((ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone) AND (n < '5'::numeric))
-> Index Scan using p2_n_idx on p2 parent_2
Index Cond: (n < '5'::numeric)
Filter: (ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone)
-> Custom Scan (ColumnarScan) on p3 parent_3
Filter: ((ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone) AND (n < '5'::numeric))
(9 rows)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
-- now that we disabled columnar custom scan, will use seq scan on columnar
-- partitions since index scan is more expensive than seq scan too
EXPLAIN (costs off) SELECT count(*), sum(i), min(i), max(i) FROM parent
WHERE ts > '2020-02-20' AND n < 5;
QUERY PLAN
---------------------------------------------------------------------
Aggregate
-> Append
-> Seq Scan on p1 parent_1
Filter: ((ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone) AND (n < '5'::numeric))
-> Index Scan using p2_n_idx on p2 parent_2
Index Cond: (n < '5'::numeric)
Filter: (ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone)
-> Seq Scan on p3 parent_3
Filter: ((ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone) AND (n < '5'::numeric))
(9 rows)
ROLLBACK;
DROP TABLE parent;
--
-- Test inheritance

View File

@ -105,9 +105,9 @@ EXPLAIN (costs off) SELECT count(*), sum(i), min(i), max(i) FROM parent;
Workers Planned: 4
-> Partial Aggregate
-> Parallel Append
-> Seq Scan on p3
-> Seq Scan on p0
-> Seq Scan on p1
-> Seq Scan on p3
-> Parallel Seq Scan on p2
(9 rows)
@ -123,6 +123,45 @@ SET min_parallel_table_scan_size TO DEFAULT;
SET parallel_tuple_cost TO DEFAULT;
SET max_parallel_workers TO DEFAULT;
SET max_parallel_workers_per_gather TO DEFAULT;
CREATE INDEX parent_btree ON parent (n);
ANALYZE parent;
-- will use columnar custom scan on columnar partitions but index
-- scan on heap partition
EXPLAIN (costs off) SELECT count(*), sum(i), min(i), max(i) FROM parent
WHERE ts > '2020-02-20' AND n < 5;
QUERY PLAN
---------------------------------------------------------------------
Aggregate
-> Append
-> Custom Scan (ColumnarScan) on p1
Filter: ((ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone) AND (n < '5'::numeric))
-> Index Scan using p2_n_idx on p2
Index Cond: (n < '5'::numeric)
Filter: (ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone)
-> Custom Scan (ColumnarScan) on p3
Filter: ((ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone) AND (n < '5'::numeric))
(9 rows)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
-- now that we disabled columnar custom scan, will use seq scan on columnar
-- partitions since index scan is more expensive than seq scan too
EXPLAIN (costs off) SELECT count(*), sum(i), min(i), max(i) FROM parent
WHERE ts > '2020-02-20' AND n < 5;
QUERY PLAN
---------------------------------------------------------------------
Aggregate
-> Append
-> Seq Scan on p1
Filter: ((ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone) AND (n < '5'::numeric))
-> Index Scan using p2_n_idx on p2
Index Cond: (n < '5'::numeric)
Filter: (ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone)
-> Seq Scan on p3
Filter: ((ts > 'Thu Feb 20 00:00:00 2020 PST'::timestamp with time zone) AND (n < '5'::numeric))
(9 rows)
ROLLBACK;
DROP TABLE parent;
--
-- Test inheritance

View File

@ -54,6 +54,19 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM full_correlated WHERE a>900000;
$$
);
uses_seq_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT a FROM full_correlated WHERE a<1000;
@ -84,6 +97,19 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT a FROM full_correlated WHERE a<9000;
$$
);
uses_index_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
BEGIN;
TRUNCATE full_correlated;
INSERT INTO full_correlated SELECT i, i::text FROM generate_series(1, 1000) i;
@ -99,6 +125,17 @@ BEGIN;
t
(1 row)
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM full_correlated WHERE a=200;
$$
);
uses_seq_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
-- same filter used in above, but choosing multiple columns would increase
-- custom scan cost, so we would prefer index scan this time
@ -112,6 +149,19 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT a,b,c,d FROM full_correlated WHERE a<9000;
$$
);
uses_index_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
-- again same filter used in above, but we would choose custom scan this
-- time since it would read three less columns from disk
SELECT columnar_test_helpers.uses_custom_scan (
@ -124,6 +174,19 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT c FROM full_correlated WHERE a<10000;
$$
);
uses_index_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
SELECT columnar_test_helpers.uses_custom_scan (
$$
SELECT a FROM full_correlated WHERE a>200;
@ -144,6 +207,162 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM full_correlated WHERE a=0 OR a=5;
$$
);
uses_seq_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
--
-- some tests with joins / subqueries etc.
--
CREATE TABLE heap_table (a int, b text, c int, d int);
INSERT INTO heap_table SELECT i, i::text, (i+1000)*7, (i+900)*5 FROM generate_series(1, 1000000) i;
CREATE INDEX heap_table_btree ON heap_table (a);
ANALYZE heap_table;
EXPLAIN (COSTS OFF)
WITH cte AS MATERIALIZED (SELECT d FROM full_correlated WHERE a > 1)
SELECT SUM(ht_1.a), MIN(ct_1.c)
FROM heap_table AS ht_1
LEFT JOIN full_correlated AS ct_1 ON ht_1.a=ct_1.d
LEFT JOIN heap_table AS ht_2 ON ht_2.a=ct_1.c
JOIN cte ON cte.d=ht_1.a
WHERE ct_1.a < 3000;
QUERY PLAN
---------------------------------------------------------------------
Aggregate
CTE cte
-> Custom Scan (ColumnarScan) on full_correlated
Filter: (a > 1)
-> Nested Loop Left Join
-> Hash Join
Hash Cond: (cte.d = ht_1.a)
-> CTE Scan on cte
-> Hash
-> Nested Loop
-> Index Scan using full_correlated_btree on full_correlated ct_1
Index Cond: (a < 3000)
-> Index Only Scan using heap_table_btree on heap_table ht_1
Index Cond: (a = ct_1.d)
-> Index Only Scan using heap_table_btree on heap_table ht_2
Index Cond: (a = ct_1.c)
(16 rows)
-- same query but columnar custom scan is disabled
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
EXPLAIN (COSTS OFF)
WITH cte AS MATERIALIZED (SELECT d FROM full_correlated WHERE a > 1)
SELECT SUM(ht_1.a), MIN(ct_1.c)
FROM heap_table AS ht_1
LEFT JOIN full_correlated AS ct_1 ON ht_1.a=ct_1.d
LEFT JOIN heap_table AS ht_2 ON ht_2.a=ct_1.c
JOIN cte ON cte.d=ht_1.a
WHERE ct_1.a < 3000;
QUERY PLAN
---------------------------------------------------------------------
Aggregate
CTE cte
-> Seq Scan on full_correlated
Filter: (a > 1)
-> Nested Loop Left Join
-> Hash Join
Hash Cond: (cte.d = ht_1.a)
-> CTE Scan on cte
-> Hash
-> Nested Loop
-> Index Scan using full_correlated_btree on full_correlated ct_1
Index Cond: (a < 3000)
-> Index Only Scan using heap_table_btree on heap_table ht_1
Index Cond: (a = ct_1.d)
-> Index Only Scan using heap_table_btree on heap_table ht_2
Index Cond: (a = ct_1.c)
(16 rows)
ROLLBACK;
-- use custom scan
EXPLAIN (COSTS OFF) WITH w AS (SELECT * FROM full_correlated)
SELECT * FROM w AS w1 JOIN w AS w2 ON w1.a = w2.d
WHERE w2.a = 123;
QUERY PLAN
---------------------------------------------------------------------
Merge Join
Merge Cond: (w2.d = w1.a)
CTE w
-> Custom Scan (ColumnarScan) on full_correlated
-> Sort
Sort Key: w2.d
-> CTE Scan on w w2
Filter: (a = 123)
-> Materialize
-> Sort
Sort Key: w1.a
-> CTE Scan on w w1
(12 rows)
-- use index
EXPLAIN (COSTS OFF) WITH w AS NOT MATERIALIZED (SELECT * FROM full_correlated)
SELECT * FROM w AS w1 JOIN w AS w2 ON w1.a = w2.d
WHERE w2.a = 123;
QUERY PLAN
---------------------------------------------------------------------
Nested Loop
-> Index Scan using full_correlated_btree on full_correlated full_correlated_1
Index Cond: (a = 123)
-> Index Scan using full_correlated_btree on full_correlated
Index Cond: (a = full_correlated_1.d)
(5 rows)
EXPLAIN (COSTS OFF) SELECT sub_1.b, sub_2.a, sub_3.avg
FROM
(SELECT b FROM full_correlated WHERE (a > 2) GROUP BY b HAVING count(DISTINCT a) > 0 ORDER BY 1 DESC LIMIT 5) AS sub_1,
(SELECT a FROM full_correlated WHERE (a > 10) GROUP BY a HAVING count(DISTINCT a) >= 1 ORDER BY 1 DESC LIMIT 3) AS sub_2,
(SELECT avg(a) AS AVG FROM full_correlated WHERE (a > 2) GROUP BY a HAVING sum(a) > 10 ORDER BY (sum(d) - avg(a) - COALESCE(array_upper(ARRAY[max(a)],1) * 5, 0)) DESC LIMIT 3) AS sub_3
WHERE sub_2.a < sub_1.b::integer
ORDER BY 3 DESC, 2 DESC, 1 DESC
LIMIT 100;
QUERY PLAN
---------------------------------------------------------------------
Limit
-> Sort
Sort Key: sub_3.avg DESC, full_correlated_1.a DESC, full_correlated.b DESC
-> Nested Loop
-> Nested Loop
Join Filter: (full_correlated_1.a < (full_correlated.b)::integer)
-> Limit
-> GroupAggregate
Group Key: full_correlated.b
Filter: (count(DISTINCT full_correlated.a) > 0)
-> Sort
Sort Key: full_correlated.b DESC
-> Custom Scan (ColumnarScan) on full_correlated
Filter: (a > 2)
-> Materialize
-> Limit
-> GroupAggregate
Group Key: full_correlated_1.a
Filter: (count(DISTINCT full_correlated_1.a) >= 1)
-> Index Only Scan Backward using full_correlated_btree on full_correlated full_correlated_1
Index Cond: (a > 10)
-> Materialize
-> Subquery Scan on sub_3
-> Limit
-> Sort
Sort Key: ((((sum(full_correlated_2.d))::numeric - avg(full_correlated_2.a)) - (COALESCE((array_upper(ARRAY[max(full_correlated_2.a)], 1) * 5), 0))::numeric)) DESC
-> GroupAggregate
Group Key: full_correlated_2.a
Filter: (sum(full_correlated_2.a) > 10)
-> Index Scan using full_correlated_btree on full_correlated full_correlated_2
Index Cond: (a > 2)
(31 rows)
DROP INDEX full_correlated_btree;
CREATE INDEX full_correlated_hash ON full_correlated USING hash(a);
ANALYZE full_correlated;
@ -197,6 +416,19 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT a,c FROM full_correlated WHERE a=1000;
$$
);
uses_index_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
CREATE TABLE full_anti_correlated (a int, b text) USING columnar;
INSERT INTO full_anti_correlated SELECT i, i::text FROM generate_series(1, 500000) i;
CREATE INDEX full_anti_correlated_hash ON full_anti_correlated USING hash(b);
@ -231,6 +463,19 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a,b FROM full_anti_correlated WHERE b='600' OR b='10';
$$
);
uses_seq_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
DROP INDEX full_anti_correlated_hash;
CREATE INDEX full_anti_correlated_btree ON full_anti_correlated (a,b);
ANALYZE full_anti_correlated;
@ -274,6 +519,19 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM full_anti_correlated WHERE a<7000 AND b<'10000';
$$
);
uses_seq_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
CREATE TABLE no_correlation (a int, b text) USING columnar;
INSERT INTO no_correlation SELECT random()*5000, (random()*5000)::int::text FROM generate_series(1, 500000) i;
CREATE INDEX no_correlation_btree ON no_correlation (a);
@ -298,5 +556,18 @@ $$
t
(1 row)
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM no_correlation WHERE a = 200;
$$
);
uses_seq_scan
---------------------------------------------------------------------
t
(1 row)
ROLLBACK;
SET client_min_messages TO WARNING;
DROP SCHEMA columnar_paths CASCADE;

View File

@ -104,3 +104,16 @@ BEGIN
END LOOP;
RETURN false;
END; $$ language plpgsql;
-- uses_seq_scan returns true if the plan chosen for given command
-- contains a sequential scan node, false otherwise.
CREATE OR REPLACE FUNCTION uses_seq_scan(command text)
RETURNS BOOLEAN AS $$
DECLARE
  query_plan text;
BEGIN
  -- note the trailing space after EXPLAIN: don't rely on "command"
  -- starting with a newline/whitespace character
  FOR query_plan IN EXECUTE 'EXPLAIN ' || command LOOP
    IF query_plan ILIKE '%Seq Scan on %'
    THEN
        RETURN true;
    END IF;
  END LOOP;
  RETURN false;
END; $$ language plpgsql;

View File

@ -52,6 +52,23 @@ SET parallel_tuple_cost TO DEFAULT;
SET max_parallel_workers TO DEFAULT;
SET max_parallel_workers_per_gather TO DEFAULT;
CREATE INDEX parent_btree ON parent (n);
ANALYZE parent;
-- will use columnar custom scan on columnar partitions but index
-- scan on heap partition
EXPLAIN (costs off) SELECT count(*), sum(i), min(i), max(i) FROM parent
WHERE ts > '2020-02-20' AND n < 5;
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
-- now that we disabled columnar custom scan, will use seq scan on columnar
-- partitions since index scan is more expensive than seq scan too
EXPLAIN (costs off) SELECT count(*), sum(i), min(i), max(i) FROM parent
WHERE ts > '2020-02-20' AND n < 5;
ROLLBACK;
DROP TABLE parent;
--

View File

@ -36,6 +36,15 @@ SELECT a FROM full_correlated WHERE a>900000;
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM full_correlated WHERE a>900000;
$$
);
ROLLBACK;
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT a FROM full_correlated WHERE a<1000;
@ -54,6 +63,15 @@ SELECT a FROM full_correlated WHERE a<9000;
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT a FROM full_correlated WHERE a<9000;
$$
);
ROLLBACK;
BEGIN;
TRUNCATE full_correlated;
INSERT INTO full_correlated SELECT i, i::text FROM generate_series(1, 1000) i;
@ -65,6 +83,13 @@ BEGIN;
SELECT a FROM full_correlated WHERE a=200;
$$
);
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM full_correlated WHERE a=200;
$$
);
ROLLBACK;
-- same filter used in above, but choosing multiple columns would increase
@ -75,6 +100,15 @@ SELECT a,b,c,d FROM full_correlated WHERE a<9000;
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT a,b,c,d FROM full_correlated WHERE a<9000;
$$
);
ROLLBACK;
-- again same filter used in above, but we would choose custom scan this
-- time since it would read three less columns from disk
SELECT columnar_test_helpers.uses_custom_scan (
@ -83,6 +117,15 @@ SELECT c FROM full_correlated WHERE a<10000;
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT c FROM full_correlated WHERE a<10000;
$$
);
ROLLBACK;
SELECT columnar_test_helpers.uses_custom_scan (
$$
SELECT a FROM full_correlated WHERE a>200;
@ -95,6 +138,65 @@ SELECT a FROM full_correlated WHERE a=0 OR a=5;
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM full_correlated WHERE a=0 OR a=5;
$$
);
ROLLBACK;
--
-- some tests with joins / subqueries etc.
--
CREATE TABLE heap_table (a int, b text, c int, d int);
INSERT INTO heap_table SELECT i, i::text, (i+1000)*7, (i+900)*5 FROM generate_series(1, 1000000) i;
CREATE INDEX heap_table_btree ON heap_table (a);
ANALYZE heap_table;
EXPLAIN (COSTS OFF)
WITH cte AS MATERIALIZED (SELECT d FROM full_correlated WHERE a > 1)
SELECT SUM(ht_1.a), MIN(ct_1.c)
FROM heap_table AS ht_1
LEFT JOIN full_correlated AS ct_1 ON ht_1.a=ct_1.d
LEFT JOIN heap_table AS ht_2 ON ht_2.a=ct_1.c
JOIN cte ON cte.d=ht_1.a
WHERE ct_1.a < 3000;
-- same query but columnar custom scan is disabled
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
EXPLAIN (COSTS OFF)
WITH cte AS MATERIALIZED (SELECT d FROM full_correlated WHERE a > 1)
SELECT SUM(ht_1.a), MIN(ct_1.c)
FROM heap_table AS ht_1
LEFT JOIN full_correlated AS ct_1 ON ht_1.a=ct_1.d
LEFT JOIN heap_table AS ht_2 ON ht_2.a=ct_1.c
JOIN cte ON cte.d=ht_1.a
WHERE ct_1.a < 3000;
ROLLBACK;
-- use custom scan
EXPLAIN (COSTS OFF) WITH w AS (SELECT * FROM full_correlated)
SELECT * FROM w AS w1 JOIN w AS w2 ON w1.a = w2.d
WHERE w2.a = 123;
-- use index
EXPLAIN (COSTS OFF) WITH w AS NOT MATERIALIZED (SELECT * FROM full_correlated)
SELECT * FROM w AS w1 JOIN w AS w2 ON w1.a = w2.d
WHERE w2.a = 123;
EXPLAIN (COSTS OFF) SELECT sub_1.b, sub_2.a, sub_3.avg
FROM
(SELECT b FROM full_correlated WHERE (a > 2) GROUP BY b HAVING count(DISTINCT a) > 0 ORDER BY 1 DESC LIMIT 5) AS sub_1,
(SELECT a FROM full_correlated WHERE (a > 10) GROUP BY a HAVING count(DISTINCT a) >= 1 ORDER BY 1 DESC LIMIT 3) AS sub_2,
(SELECT avg(a) AS AVG FROM full_correlated WHERE (a > 2) GROUP BY a HAVING sum(a) > 10 ORDER BY (sum(d) - avg(a) - COALESCE(array_upper(ARRAY[max(a)],1) * 5, 0)) DESC LIMIT 3) AS sub_3
WHERE sub_2.a < sub_1.b::integer
ORDER BY 3 DESC, 2 DESC, 1 DESC
LIMIT 100;
DROP INDEX full_correlated_btree;
CREATE INDEX full_correlated_hash ON full_correlated USING hash(a);
@ -130,6 +232,15 @@ SELECT a,c FROM full_correlated WHERE a=1000;
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_index_scan (
$$
SELECT a,c FROM full_correlated WHERE a=1000;
$$
);
ROLLBACK;
CREATE TABLE full_anti_correlated (a int, b text) USING columnar;
INSERT INTO full_anti_correlated SELECT i, i::text FROM generate_series(1, 500000) i;
CREATE INDEX full_anti_correlated_hash ON full_anti_correlated USING hash(b);
@ -153,6 +264,15 @@ SELECT a,b FROM full_anti_correlated WHERE b='600' OR b='10';
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a,b FROM full_anti_correlated WHERE b='600' OR b='10';
$$
);
ROLLBACK;
DROP INDEX full_anti_correlated_hash;
CREATE INDEX full_anti_correlated_btree ON full_anti_correlated (a,b);
@ -182,6 +302,15 @@ SELECT a FROM full_anti_correlated WHERE a<7000 AND b<'10000';
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM full_anti_correlated WHERE a<7000 AND b<'10000';
$$
);
ROLLBACK;
CREATE TABLE no_correlation (a int, b text) USING columnar;
INSERT INTO no_correlation SELECT random()*5000, (random()*5000)::int::text FROM generate_series(1, 500000) i;
CREATE INDEX no_correlation_btree ON no_correlation (a);
@ -199,5 +328,14 @@ SELECT a FROM no_correlation WHERE a = 200;
$$
);
BEGIN;
SET LOCAL columnar.enable_custom_scan TO 'OFF';
SELECT columnar_test_helpers.uses_seq_scan (
$$
SELECT a FROM no_correlation WHERE a = 200;
$$
);
ROLLBACK;
SET client_min_messages TO WARNING;
DROP SCHEMA columnar_paths CASCADE;

View File

@ -113,3 +113,17 @@ BEGIN
END LOOP;
RETURN false;
END; $$ language plpgsql;
-- uses_seq_scan returns true if the plan chosen for given command
-- contains a sequential scan node, false otherwise.
CREATE OR REPLACE FUNCTION uses_seq_scan(command text)
RETURNS BOOLEAN AS $$
DECLARE
  query_plan text;
BEGIN
  -- note the trailing space after EXPLAIN: don't rely on "command"
  -- starting with a newline/whitespace character
  FOR query_plan IN EXECUTE 'EXPLAIN ' || command LOOP
    IF query_plan ILIKE '%Seq Scan on %'
    THEN
        RETURN true;
    END IF;
  END LOOP;
  RETURN false;
END; $$ language plpgsql;