--
-- Testing indexes on columnar tables.
--
CREATE SCHEMA columnar_indexes;
SET search_path TO columnar_indexes, public;
--
-- create index with the concurrent option. We should
-- error out during index creation.
-- https://github.com/citusdata/citus/issues/4599
--
create table t(a int, b int) using columnar;
create index CONCURRENTLY t_idx on t(a, b);
REINDEX INDEX CONCURRENTLY t_idx;
\d t
explain insert into t values (1, 2);
insert into t values (1, 2);
SELECT * FROM t;
explain insert into t values (1, 2);
insert into t values (3, 4);
SELECT * FROM t;
-- make sure that we test index scan
set columnar.enable_custom_scan to 'off';
set enable_seqscan to off;
set seq_page_cost TO 10000000;
CREATE TABLE columnar_table (a INT, b INT) USING columnar;
INSERT INTO columnar_table (a) VALUES (1), (1);
CREATE UNIQUE INDEX CONCURRENTLY ON columnar_table (a);
-- CONCURRENTLY should leave an invalid index behind
SELECT COUNT(*)=1 FROM pg_index WHERE indrelid = 'columnar_table'::regclass AND indisvalid = 'false';
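-- extra sanity check (assumes the default "<table>_<column>_idx" index naming):
-- the invalid index left behind should be the one on "a"
SELECT indexrelid::regclass::text FROM pg_index
WHERE indrelid = 'columnar_table'::regclass AND indisvalid = 'false';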
INSERT INTO columnar_table (a) VALUES (1), (1);
REINDEX TABLE columnar_table;
-- index is still invalid since REINDEX errored out
SELECT COUNT(*)=1 FROM pg_index WHERE indrelid = 'columnar_table'::regclass AND indisvalid = 'false';
TRUNCATE columnar_table;
REINDEX TABLE columnar_table;
-- now it should be valid
SELECT COUNT(*)=0 FROM pg_index WHERE indrelid = 'columnar_table'::regclass AND indisvalid = 'false';
DROP INDEX columnar_table_a_idx;
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(0, 16000) i;
-- unique --
BEGIN;
INSERT INTO columnar_table VALUES (100000000);
SAVEPOINT s1;
-- errors out due to unflushed data in upper transaction
CREATE UNIQUE INDEX ON columnar_table (a);
ROLLBACK;
CREATE UNIQUE INDEX CONCURRENTLY ON columnar_table (a);
BEGIN;
INSERT INTO columnar_table VALUES (16050);
SAVEPOINT s1;
-- index scan errors out due to unflushed data in upper transaction
SELECT a FROM columnar_table WHERE a = 16050;
ROLLBACK;
EXPLAIN (COSTS OFF) SELECT * FROM columnar_table WHERE a=6456;
EXPLAIN (COSTS OFF) SELECT a FROM columnar_table WHERE a=6456;
SELECT (SELECT a FROM columnar_table WHERE a=6456 limit 1)=6456;
SELECT (SELECT b FROM columnar_table WHERE a=6456 limit 1)=6456*2;
-- even though a=16050 doesn't exist yet, inserting it twice in one statement should error out
INSERT INTO columnar_table VALUES (16050), (16050);
-- should work
INSERT INTO columnar_table VALUES (16050);
-- check edge cases around stripe boundaries; both inserts should error out
INSERT INTO columnar_table VALUES (16050);
INSERT INTO columnar_table VALUES (15999);
DROP INDEX columnar_table_a_idx;
CREATE TABLE partial_unique_idx_test (a INT, b INT) USING columnar;
CREATE UNIQUE INDEX ON partial_unique_idx_test (a)
  WHERE b > 500;
-- should work since b <= 500 and our partial index doesn't cover this interval
INSERT INTO partial_unique_idx_test VALUES (1, 2), (1, 2);
-- should work since our partial index wouldn't cover the tuples that we inserted above
INSERT INTO partial_unique_idx_test VALUES (1, 800);
INSERT INTO partial_unique_idx_test VALUES (4, 600);
-- should error out due to (4, 600)
INSERT INTO partial_unique_idx_test VALUES (4, 700);
-- btree --
CREATE INDEX CONCURRENTLY ON columnar_table (a);
SELECT (SELECT SUM(b) FROM columnar_table WHERE a>700 and a<965)=439560;
CREATE INDEX ON columnar_table (b)
  WHERE (b > 30000 AND b < 33000);
-- partial index should be way smaller than the non-partial index
SELECT pg_total_relation_size('columnar_table_b_idx') * 5 <
       pg_total_relation_size('columnar_table_a_idx');
-- can't use index scan due to partial index boundaries
EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30000;
-- can use index scan
EXPLAIN (COSTS OFF) SELECT b FROM columnar_table WHERE b = 30001;
-- some more rows
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 17000) i;
DROP INDEX CONCURRENTLY columnar_table_a_idx;
TRUNCATE columnar_table;
-- pkey --
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16000, 16499) i;
ALTER TABLE columnar_table ADD PRIMARY KEY (a);
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(16500, 17000) i;
BEGIN;
INSERT INTO columnar_table (a) SELECT 1;
ROLLBACK;
-- should work
INSERT INTO columnar_table (a) SELECT 1;
-- error out
INSERT INTO columnar_table VALUES (16100), (16101);
INSERT INTO columnar_table VALUES (16999);
BEGIN;
REINDEX INDEX columnar_table_pkey;
-- should error even after reindex
INSERT INTO columnar_table VALUES (16999);
ROLLBACK;
VACUUM FULL columnar_table;
-- show that we don't support clustering columnar tables using indexes
CLUSTER columnar_table USING columnar_table_pkey;
ALTER TABLE columnar_table CLUSTER ON columnar_table_pkey;
CLUSTER columnar_table;
-- should error even after vacuum
INSERT INTO columnar_table VALUES (16999);
TRUNCATE columnar_table;
INSERT INTO columnar_table (a, b) SELECT i,i*2 FROM generate_series(1, 160000) i;
SELECT (SELECT b FROM columnar_table WHERE a = 150000)=300000;
-- Since our index is highly correlated with the relation itself, we should
-- de-serialize each chunk group only once. For this reason, if this test
-- file hangs on the queries below, it likely means that we are not properly
-- caching the last-read chunk group during index reads.
SELECT SUM(a)=312487500 FROM columnar_table WHERE a < 25000;
SELECT SUM(a)=167000 FROM columnar_table WHERE a = 16000 OR a = 151000;
SELECT SUM(a)=48000 FROM columnar_table WHERE a = 16000 OR a = 32000;
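-- extra sanity check: given the GUCs set above, the reads in this block are
-- expected to go through the primary key index
EXPLAIN (COSTS OFF) SELECT SUM(a) FROM columnar_table WHERE a < 25000;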
TRUNCATE columnar_table;
ALTER TABLE columnar_table DROP CONSTRAINT columnar_table_pkey;
-- hash --
INSERT INTO columnar_table (a, b) SELECT i*2,i FROM generate_series(1, 8000) i;
CREATE INDEX hash_idx ON columnar_table USING HASH (b);
BEGIN;
CREATE INDEX hash_idx_fill_factor ON columnar_table USING HASH (b) WITH (fillfactor=10);
-- same hash index with lower fillfactor should be way bigger
SELECT pg_total_relation_size ('hash_idx_fill_factor') >
       pg_total_relation_size ('hash_idx') * 5;
ROLLBACK;
BEGIN;
INSERT INTO columnar_table (a, b) SELECT i*3,i FROM generate_series(1, 8000) i;
ROLLBACK;
INSERT INTO columnar_table (a, b) SELECT i*4,i FROM generate_series(1, 8000) i;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
BEGIN;
REINDEX TABLE columnar_table;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
ROLLBACK;
VACUUM FULL columnar_table;
SELECT SUM(a)=42000 FROM columnar_table WHERE b = 7000;
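-- extra sanity check: with sequential scan disabled, the lookup above is
-- expected to use hash_idx
EXPLAIN (COSTS OFF) SELECT SUM(a) FROM columnar_table WHERE b = 7000;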
-- exclusion constraints --
CREATE TABLE exclusion_test (c1 INT, c2 INT, c3 INT, c4 BOX,
  EXCLUDE USING btree (c1 WITH =) INCLUDE (c3, c4) WHERE (c1 < 10)) USING columnar;
-- error out since "c1" is "1" for all rows to be inserted
INSERT INTO exclusion_test SELECT 1, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
BEGIN;
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
ROLLBACK;
-- should work
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(1,3) AS x;
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
BEGIN;
-- should work thanks to "where" clause in exclusion constraint
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
ROLLBACK;
REINDEX TABLE exclusion_test;
-- should still work after reindex
INSERT INTO exclusion_test SELECT x, 2, 3*x, BOX('4,4,4,4') FROM generate_series(10,15) AS x;
-- make sure that we respect INCLUDE syntax --
CREATE TABLE include_test (a INT, b BIGINT, c BIGINT, d BIGINT) USING columnar;
INSERT INTO include_test SELECT i, i, i, i FROM generate_series (1, 1000) i;
CREATE UNIQUE INDEX CONCURRENTLY unique_a ON include_test (a);
-- cannot use index only scan
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
CREATE UNIQUE INDEX unique_a_include_b_c_d ON include_test (a) INCLUDE(b, c, d);
-- same unique index that includes other columns should be way bigger
SELECT pg_total_relation_size ('unique_a') * 1.5 <
       pg_total_relation_size ('unique_a_include_b_c_d');
DROP INDEX unique_a;
-- should use index only scan since unique_a_include_b_c_d includes column "b" too
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
BEGIN;
SET enable_indexonlyscan = OFF;
-- show that we respect enable_indexonlyscan GUC
EXPLAIN (COSTS OFF) SELECT b FROM include_test WHERE a = 500;
ROLLBACK;
-- make sure that we read the correct value for "b" when doing index only scan
SELECT b=980 FROM include_test WHERE a = 980;
-- some tests with distributed & partitioned tables --
CREATE TABLE dist_part_table(
  dist_col INT,
  part_col TIMESTAMPTZ,
  col1 TEXT
) PARTITION BY RANGE (part_col);
-- create an index before creating a columnar partition
CREATE INDEX dist_part_table_btree ON dist_part_table (col1);
-- columnar partition
CREATE TABLE p0 PARTITION OF dist_part_table
  FOR VALUES FROM ('2020-01-01') TO ('2020-02-01')
  USING columnar;
SELECT create_distributed_table('dist_part_table', 'dist_col');
-- columnar partition
CREATE TABLE p1 PARTITION OF dist_part_table
  FOR VALUES FROM ('2020-02-01') TO ('2020-03-01')
  USING columnar;
-- row partition
CREATE TABLE p2 PARTITION OF dist_part_table
  FOR VALUES FROM ('2020-03-01') TO ('2020-04-01');
-- insert into the row partition
INSERT INTO dist_part_table VALUES (1, '2020-03-15', 'str1');
-- insert into columnar partitions
INSERT INTO dist_part_table VALUES (1, '2020-01-15', 'str2');
INSERT INTO dist_part_table VALUES (1, '2020-02-15', 'str3');
-- create another index after creating a columnar partition
CREATE UNIQUE INDEX dist_part_table_unique ON dist_part_table (dist_col, part_col);
-- verify that indexes are created on columnar partitions
SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p0';
SELECT COUNT(*)=2 FROM pg_indexes WHERE tablename = 'p1';
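-- also list the index names on one of the columnar partitions; the exact
-- names assume PostgreSQL's default partition index naming
SELECT indexname FROM pg_indexes WHERE tablename = 'p0' ORDER BY indexname;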
-- unsupported index types --
-- gin --
CREATE TABLE testjsonb (j JSONB) USING columnar;
INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSONB) FROM generate_series(1,10) i;
CREATE INDEX jidx ON testjsonb USING GIN (j);
INSERT INTO testjsonb SELECT CAST('{"f1" : ' ||'"'|| i*4 ||'", ' || '"f2" : '||'"'|| i*10 ||'"}' AS JSONB) FROM generate_series(15,20) i;
-- gist --
CREATE TABLE gist_point_tbl(id INT4, p POINT) USING columnar;
INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(1, 10) g;
CREATE INDEX gist_pointidx ON gist_point_tbl USING gist(p);
INSERT INTO gist_point_tbl (id, p) SELECT g, point(g*10, g*10) FROM generate_series(10, 20) g;
-- spgist --
CREATE TABLE box_temp (f1 box) USING columnar;
INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
CREATE INDEX CONCURRENTLY box_spgist ON box_temp USING spgist (f1);
-- CONCURRENTLY should not leave an invalid index behind
SELECT COUNT(*)=0 FROM pg_index WHERE indrelid = 'box_temp'::regclass AND indisvalid = 'false';
INSERT INTO box_temp SELECT box(point(i, i), point(i * 2, i * 2)) FROM generate_series(1, 10) AS i;
-- brin --
CREATE TABLE brin_summarize (value int) USING columnar;
CREATE INDEX brin_summarize_idx ON brin_summarize USING brin (value) WITH (pages_per_range=2);
-- Show that we safely fall back to a serial index build.
CREATE TABLE parallel_scan_test(a int) USING columnar WITH ( parallel_workers = 2 );
INSERT INTO parallel_scan_test SELECT i FROM generate_series(1,10) i;
CREATE INDEX ON parallel_scan_test (a);
VACUUM FULL parallel_scan_test;
REINDEX TABLE parallel_scan_test;
CREATE INDEX CONCURRENTLY ON parallel_scan_test (a);
REINDEX TABLE CONCURRENTLY parallel_scan_test;
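-- extra sanity check: neither the concurrent build nor the concurrent
-- reindex should leave an invalid index behind
SELECT COUNT(*)=0 FROM pg_index WHERE indrelid = 'parallel_scan_test'::regclass AND indisvalid = 'false';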
-- test with different data types & index AMs --
CREATE TABLE hash_text(a INT, b TEXT) USING columnar;
INSERT INTO hash_text SELECT i, (i*2)::TEXT FROM generate_series(1, 10) i;
CREATE INDEX ON hash_text USING hash (b);
SELECT b FROM hash_text WHERE b='10';
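-- extra sanity check: with sequential scan disabled, the lookup above is
-- expected to use the hash index on "b"
EXPLAIN (COSTS OFF) SELECT b FROM hash_text WHERE b='10';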
CREATE TABLE hash_int(a INT, b TEXT) USING columnar;
INSERT INTO hash_int SELECT i, (i*3)::TEXT FROM generate_series(1, 10) i;
CREATE INDEX ON hash_int USING hash (a);
SELECT b='15' FROM hash_int WHERE a=5;
CREATE TABLE mixed_data_types (
  timestamp_col timestamp,
  box_col box,
  circle_col circle,
  float_col float,
  uuid_col uuid,
  text_col text,
  numeric_col numeric,
  PRIMARY KEY(timestamp_col, text_col)
) USING columnar;
INSERT INTO mixed_data_types
SELECT
  to_timestamp(i+36000),
  box(point(i, i+90)),
  circle(point(i*2, i*3), i*100),
  (i*1.2)::float,
  uuid_in(md5((i*10)::text || (i*15)::text)::cstring),
  (i*8)::text,
  (i*42)::numeric
FROM generate_series(1, 10) i;
SELECT text_col='64'
FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:08';
SELECT uuid_col='298923c8-1900-45e9-1288-b430794814c4'
FROM mixed_data_types WHERE timestamp_col='1970-01-01 02:00:01';
CREATE INDEX hash_uuid ON mixed_data_types USING hash(uuid_col);
SELECT box_col=box(point(1, 91)) AND timestamp_col='1970-01-01 02:00:01'
FROM mixed_data_types WHERE uuid_col='298923c8-1900-45e9-1288-b430794814c4';
DROP INDEX hash_uuid;
CREATE INDEX btree_multi_numeric_text_timestamp
  ON mixed_data_types (numeric_col, text_col, timestamp_col);
SELECT uuid_col='ab2481c9-f93d-0ed3-033a-3281d865ccb2'
FROM mixed_data_types
WHERE
  numeric_col >= 120 AND numeric_col <= 220 AND
  circle_col >= circle(point(7, 7), 350) AND
  float_col <= 5.0;
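-- extra sanity check, on a reduced version of the quals above: with
-- sequential scan disabled, the numeric_col range is expected to use
-- btree_multi_numeric_text_timestamp
EXPLAIN (COSTS OFF) SELECT uuid_col FROM mixed_data_types
WHERE numeric_col >= 120 AND numeric_col <= 220;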
CREATE TABLE revisit_same_cgroup(a INT, b TEXT) USING columnar;
CREATE INDEX ON revisit_same_cgroup USING HASH (b);
INSERT INTO revisit_same_cgroup SELECT random()*500, (random()*500)::INT::TEXT FROM generate_series(1, 100000) i;
SELECT sum(a)>-1 FROM revisit_same_cgroup WHERE b = '1';
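-- extra sanity check: the revisit test above is expected to read through the
-- hash index on "b"
EXPLAIN (COSTS OFF) SELECT sum(a) FROM revisit_same_cgroup WHERE b = '1';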
SET client_min_messages TO WARNING;
DROP SCHEMA columnar_indexes CASCADE;