-- -- MULTI_INSERT_SELECT -- ALTER SEQUENCE pg_catalog.pg_dist_shardid_seq RESTART 13300000; ALTER SEQUENCE pg_catalog.pg_dist_jobid_seq RESTART 13300000; CREATE TABLE raw_events_first (user_id int, time timestamp, value_1 int, value_2 int, value_3 float, value_4 bigint, UNIQUE(user_id, value_1)); SELECT master_create_distributed_table('raw_events_first', 'user_id', 'hash'); SELECT master_create_worker_shards('raw_events_first', 4, 2); CREATE TABLE raw_events_second (user_id int, time timestamp, value_1 int, value_2 int, value_3 float, value_4 bigint, UNIQUE(user_id, value_1)); SELECT master_create_distributed_table('raw_events_second', 'user_id', 'hash'); SELECT master_create_worker_shards('raw_events_second', 4, 2); CREATE TABLE agg_events (user_id int, value_1_agg int, value_2_agg int, value_3_agg float, value_4_agg bigint, agg_time timestamp, UNIQUE(user_id, value_1_agg)); SELECT master_create_distributed_table('agg_events', 'user_id', 'hash'); SELECT master_create_worker_shards('agg_events', 4, 2); -- make tables as co-located UPDATE pg_dist_partition SET colocationid = 100000 WHERE logicalrelid IN ('raw_events_first', 'raw_events_second', 'agg_events'); CREATE TABLE reference_table (user_id int); SELECT master_create_distributed_table('reference_table', 'user_id', 'hash'); SELECT master_create_worker_shards('reference_table', 1, 2); INSERT INTO raw_events_first (user_id, time, value_1, value_2, value_3, value_4) VALUES (1, now(), 10, 100, 1000.1, 10000); INSERT INTO raw_events_first (user_id, time, value_1, value_2, value_3, value_4) VALUES (2, now(), 20, 200, 2000.1, 20000); INSERT INTO raw_events_first (user_id, time, value_1, value_2, value_3, value_4) VALUES (3, now(), 30, 300, 3000.1, 30000); INSERT INTO raw_events_first (user_id, time, value_1, value_2, value_3, value_4) VALUES (4, now(), 40, 400, 4000.1, 40000); INSERT INTO raw_events_first (user_id, time, value_1, value_2, value_3, value_4) VALUES (5, now(), 50, 500, 5000.1, 50000); INSERT INTO raw_events_first (user_id, time, value_1, value_2, value_3, value_4) VALUES (6, now(), 60, 600, 6000.1, 60000); SET client_min_messages TO DEBUG4; -- raw table to raw table INSERT INTO raw_events_second SELECT * FROM raw_events_first; -- see that our first multi shard INSERT...SELECT works expected SET client_min_messages TO INFO; SELECT raw_events_first.user_id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id; -- see that we get unique vialitons INSERT INTO raw_events_second SELECT * FROM raw_events_first; -- add one more row INSERT INTO raw_events_first (user_id, time) VALUES (7, now()); -- try a single shard query SET client_min_messages TO DEBUG4; INSERT INTO raw_events_second (user_id, time) SELECT user_id, time FROM raw_events_first WHERE user_id = 7; SET client_min_messages TO INFO; -- add one more row INSERT INTO raw_events_first (user_id, time, value_1, value_2, value_3, value_4) VALUES (8, now(), 80, 800, 8000, 80000); -- reorder columns SET client_min_messages TO DEBUG4; INSERT INTO raw_events_second (value_2, value_1, value_3, value_4, user_id, time) SELECT value_2, value_1, value_3, value_4, user_id, time FROM raw_events_first WHERE user_id = 8; -- a zero shard select INSERT INTO raw_events_second (value_2, value_1, value_3, value_4, user_id, time) SELECT value_2, value_1, value_3, value_4, user_id, time FROM raw_events_first WHERE false; -- another zero shard select INSERT INTO raw_events_second (value_2, value_1, value_3, value_4, user_id, time) SELECT value_2, value_1, value_3, value_4, user_id, time FROM raw_events_first WHERE 0 != 0; -- add one more row SET client_min_messages TO INFO; INSERT INTO raw_events_first (user_id, time, value_1, value_2, value_3, value_4) VALUES (9, now(), 90, 900, 9000, 90000); -- show that RETURNING also works SET client_min_messages TO DEBUG4; INSERT INTO raw_events_second (user_id, value_1, value_3) SELECT user_id, value_1, value_3 FROM raw_events_first WHERE value_3 = 9000 RETURNING *; -- hits two shards INSERT INTO raw_events_second (user_id, value_1, value_3) SELECT user_id, value_1, value_3 FROM raw_events_first WHERE user_id = 9 OR user_id = 16 RETURNING *; -- now do some aggregations INSERT INTO agg_events SELECT user_id, sum(value_1), avg(value_2), sum(value_3), count(value_4) FROM raw_events_first GROUP BY user_id; -- group by column not exists on the SELECT target list -- TODO: there is a bug on RETURNING INSERT INTO agg_events (value_3_agg, value_4_agg, value_1_agg, user_id) SELECT sum(value_3), count(value_4), sum(value_1), user_id FROM raw_events_first GROUP BY value_2, user_id RETURNING *; -- some subquery tests INSERT INTO agg_events (value_1_agg, user_id) SELECT SUM(value_1), id FROM (SELECT raw_events_second.user_id AS id, raw_events_second.value_1 FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id) AS foo GROUP BY id; -- subquery one more level depth INSERT INTO agg_events (value_4_agg, value_1_agg, user_id) SELECT v4, v1, id FROM (SELECT SUM(raw_events_second.value_4) AS v4, SUM(raw_events_first.value_1) AS v1, raw_events_second.user_id AS id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.user_id) AS foo; -- join between subqueries INSERT INTO agg_events (user_id) SELECT f2.id FROM (SELECT id FROM (SELECT reference_table.user_id AS id FROM raw_events_first, reference_table WHERE raw_events_first.user_id = reference_table.user_id ) AS foo) as f INNER JOIN (SELECT v4, v1, id FROM (SELECT SUM(raw_events_second.value_4) AS v4, SUM(raw_events_first.value_1) AS v1, raw_events_second.user_id AS id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.user_id HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id = f2.id); -- add one more level subqueris on top of subquery JOINs INSERT INTO agg_events (user_id, value_4_agg) SELECT outer_most.id, max(outer_most.value) FROM ( SELECT f2.id as id, f2.v4 as value FROM (SELECT id FROM (SELECT reference_table.user_id AS id FROM raw_events_first, reference_table WHERE raw_events_first.user_id = reference_table.user_id ) AS foo) as f INNER JOIN (SELECT v4, v1, id FROM (SELECT SUM(raw_events_second.value_4) AS v4, SUM(raw_events_first.value_1) AS v1, raw_events_second.user_id AS id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.user_id HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id = f2.id)) as outer_most GROUP BY outer_most.id; -- some UPSERTS INSERT INTO agg_events AS ae ( user_id, value_1_agg, agg_time ) SELECT user_id, value_1, time FROM raw_events_first ON conflict (user_id, value_1_agg) DO UPDATE SET agg_time = EXCLUDED.agg_time WHERE ae.agg_time < EXCLUDED.agg_time; -- upserts with returning INSERT INTO agg_events AS ae ( user_id, value_1_agg, agg_time ) SELECT user_id, value_1, time FROM raw_events_first ON conflict (user_id, value_1_agg) DO UPDATE SET agg_time = EXCLUDED.agg_time WHERE ae.agg_time < EXCLUDED.agg_time RETURNING user_id, value_1_agg; -- TODO:: add hll and date_trunc INSERT INTO agg_events (user_id, value_1_agg) SELECT user_id, sum(value_1 + value_2) FROM raw_events_first GROUP BY user_id; -- FILTER CLAUSE INSERT INTO agg_events (user_id, value_1_agg) SELECT user_id, sum(value_1 + value_2) FILTER (where value_3 = 15) FROM raw_events_first GROUP BY user_id; -- TODO: UUIDs -- a test with reference table JOINs INSERT INTO agg_events (user_id, value_1_agg) SELECT raw_events_first.user_id, sum(value_1) FROM reference_table, raw_events_first WHERE raw_events_first.user_id = reference_table.user_id GROUP BY raw_events_first.user_id; -- We do not support CTEs WITH fist_table_agg AS (SELECT sum(value_1) as v1_agg, user_id FROM raw_events_first GROUP BY user_id) INSERT INTO agg_events (value_1_agg, user_id) SELECT v1_agg, user_id FROM fist_table_agg; -- We do not support CTEs in the INSERT as well INSERT INTO agg_events WITH sub_cte AS (SELECT 1) SELECT raw_events_first.user_id, (SELECT * FROM sub_cte) FROM raw_events_first; -- We do not support any set operations INSERT INTO raw_events_first(user_id) SELECT user_id FROM ((SELECT user_id FROM raw_events_first) UNION (SELECT user_id FROM raw_events_second)) as foo; -- We do not support any set operations INSERT INTO raw_events_first(user_id) (SELECT user_id FROM raw_events_first) INTERSECT (SELECT user_id FROM raw_events_first); -- We do not support any set operations INSERT INTO raw_events_first(user_id) SELECT user_id FROM ((SELECT user_id FROM raw_events_first WHERE user_id = 15) EXCEPT (SELECT user_id FROM raw_events_second where user_id = 17)) as foo; -- unsupported JOIN INSERT INTO agg_events (value_4_agg, value_1_agg, user_id) SELECT v4, v1, id FROM (SELECT SUM(raw_events_second.value_4) AS v4, SUM(raw_events_first.value_1) AS v1, raw_events_second.user_id AS id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id != raw_events_second.user_id GROUP BY raw_events_second.user_id) AS foo; -- INSERT partition column does not match with SELECT partition column INSERT INTO agg_events (value_4_agg, value_1_agg, user_id) SELECT v4, v1, id FROM (SELECT SUM(raw_events_second.value_4) AS v4, SUM(raw_events_first.value_1) AS v1, raw_events_second.value_3 AS id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.value_3) AS foo; -- error cases -- no part column at all INSERT INTO raw_events_second (value_1) SELECT value_1 FROM raw_events_first; INSERT INTO raw_events_second (value_1) SELECT user_id FROM raw_events_first; INSERT INTO raw_events_second (user_id) SELECT value_1 FROM raw_events_first; INSERT INTO raw_events_second (user_id) SELECT user_id * 2 FROM raw_events_first; INSERT INTO raw_events_second (user_id) SELECT user_id :: bigint FROM raw_events_first; INSERT INTO agg_events (value_3_agg, value_4_agg, value_1_agg, value_2_agg, user_id) SELECT SUM(value_3), Count(value_4), user_id, SUM(value_1), Avg(value_2) FROM raw_events_first GROUP BY user_id; INSERT INTO agg_events (value_3_agg, value_4_agg, value_1_agg, value_2_agg, user_id) SELECT SUM(value_3), Count(value_4), user_id, SUM(value_1), value_2 FROM raw_events_first GROUP BY user_id, value_2; -- tables should be co-located INSERT INTO agg_events (user_id) SELECT user_id FROM reference_table; -- unsupported joins between subqueries -- we do not return bare partition column on the inner query INSERT INTO agg_events (user_id) SELECT f2.id FROM (SELECT id FROM (SELECT reference_table.user_id AS id FROM raw_events_first, reference_table WHERE raw_events_first.user_id = reference_table.user_id ) AS foo) as f INNER JOIN (SELECT v4, v1, id FROM (SELECT SUM(raw_events_second.value_4) AS v4, raw_events_second.value_1 AS v1, SUM(raw_events_second.user_id) AS id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.value_1 HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id = f2.id); -- the second part of the query is not routable since -- no GROUP BY on the partition column INSERT INTO agg_events (user_id) SELECT f.id FROM (SELECT id FROM (SELECT raw_events_first.user_id AS id FROM raw_events_first, reference_table WHERE raw_events_first.user_id = reference_table.user_id ) AS foo) as f INNER JOIN (SELECT v4, v1, id FROM (SELECT SUM(raw_events_second.value_4) AS v4, raw_events_second.value_1 AS v1, SUM(raw_events_second.user_id) AS id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.value_1 HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id = f2.id); -- cannot pushdown the query since the JOIN is not equi JOIN INSERT INTO agg_events (user_id, value_4_agg) SELECT outer_most.id, max(outer_most.value) FROM ( SELECT f2.id as id, f2.v4 as value FROM (SELECT id FROM (SELECT reference_table.user_id AS id FROM raw_events_first, reference_table WHERE raw_events_first.user_id = reference_table.user_id ) AS foo) as f INNER JOIN (SELECT v4, v1, id FROM (SELECT SUM(raw_events_second.value_4) AS v4, SUM(raw_events_first.value_1) AS v1, raw_events_second.user_id AS id FROM raw_events_first, raw_events_second WHERE raw_events_first.user_id = raw_events_second.user_id GROUP BY raw_events_second.user_id HAVING SUM(raw_events_second.value_4) > 10) AS foo2 ) as f2 ON (f.id != f2.id)) as outer_most GROUP BY outer_most.id; -- Views does not work CREATE VIEW test_view AS SELECT * FROM raw_events_first; INSERT INTO raw_events_second SELECT * FROM test_view;