-- -- multi subquery complex queries aims to expand existing subquery pushdown -- regression tests to cover more caeses -- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql -- -- We don't need shard id sequence here, so commented out to prevent conflicts with concurrent tests -- SET citus.next_shard_id TO 1400000; CREATE TABLE user_buy_test_table(user_id int, item_id int, buy_count int); SELECT create_distributed_table('user_buy_test_table', 'user_id'); create_distributed_table --------------------------------------------------------------------- (1 row) INSERT INTO user_buy_test_table VALUES(1,2,1); INSERT INTO user_buy_test_table VALUES(2,3,4); INSERT INTO user_buy_test_table VALUES(3,4,2); INSERT INTO user_buy_test_table VALUES(7,5,2); CREATE TABLE users_return_test_table(user_id int, item_id int, buy_count int); SELECT create_distributed_table('users_return_test_table', 'user_id'); create_distributed_table --------------------------------------------------------------------- (1 row) INSERT INTO users_return_test_table VALUES(4,1,1); INSERT INTO users_return_test_table VALUES(1,3,1); INSERT INTO users_return_test_table VALUES(3,2,2); -- Simple Join test with reference table SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; count --------------------------------------------------------------------- 3 (1 row) -- Should work, reference table at the inner side is allowed SELECT count(*) FROM (SELECT random(), k_no FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1 WHERE k_no = 47; count --------------------------------------------------------------------- 1 (1 row) -- Should work, although no equality between partition column and reference table SELECT subquery_1.item_id FROM (SELECT user_buy_test_table.item_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1 ORDER BY 1; item_id --------------------------------------------------------------------- 2 3 4 5 (4 rows) -- Should work, although no equality between partition column and reference table SELECT subquery_1.user_id FROM (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_1 ORDER BY 1; user_id --------------------------------------------------------------------- 1 2 3 3 7 7 7 7 7 7 (10 rows) -- Shouldn't work, reference table at the outer side is not allowed SELECT * FROM (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table ON users_ref_test_table.id = user_buy_test_table.user_id) subquery_1; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- Should work, reference table at the inner side is allowed SELECT count(*) FROM (SELECT random() FROM users_ref_test_table RIGHT JOIN user_buy_test_table ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; count --------------------------------------------------------------------- 4 (1 row) -- Shouldn't work, reference table at the outer side is not allowed SELECT * FROM (SELECT random() FROM user_buy_test_table RIGHT JOIN users_ref_test_table ON user_buy_test_table.user_id = users_ref_test_table.id) subquery_1; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- Equi join test with reference table on non-partition keys SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table ON user_buy_test_table.item_id = users_ref_test_table.id) subquery_1; count --------------------------------------------------------------------- 4 (1 row) -- Non-equi join test with reference table on non-partition keys SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN users_ref_test_table ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; count --------------------------------------------------------------------- 10 (1 row) -- Non-equi left joins with reference tables on non-partition keys SELECT count(*) FROM (SELECT random() FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; count --------------------------------------------------------------------- 10 (1 row) -- Should pass since reference table locates in the inner part of each left join SELECT count(*) FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 ON tt1.user_id = tt2.user_id) subquery_1 LEFT JOIN (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 LEFT JOIN users_ref_test_table as ref ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id; count --------------------------------------------------------------------- 2 (1 row) -- two subqueries, each include joins with reference table -- also, two hash distributed tables are joined on partition keys SELECT count(*) FROM (SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.item_id > users_ref_test_table.id AND users_ref_test_table.k_no > 88 AND user_buy_test_table.item_id < 88) subquery_1, (SELECT DISTINCT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.user_id > users_ref_test_table.id AND users_ref_test_table.k_no > 44 AND user_buy_test_table.user_id > 44) subquery_2 WHERE subquery_1.user_id = subquery_2.user_id ; count --------------------------------------------------------------------- 4 (1 row) -- Should be able to push down since reference tables are inner joined -- with hash distributed tables, the results of those joins are the parts of -- an outer join SELECT subquery_2.id FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 ON tt1.user_id = tt2.user_id) subquery_1 RIGHT JOIN (SELECT tt1.user_id, ref.id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref ON tt1.user_id = ref.id) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1 DESC LIMIT 5; id --------------------------------------------------------------------- 3 2 1 (3 rows) -- almost the same query as the above, but reference table join wrapped with -- another subquery SELECT subquery_1.user_id FROM (SELECT tt1.user_id, random() FROM user_buy_test_table AS tt1 JOIN users_return_test_table as tt2 ON tt1.user_id = tt2.user_id) subquery_1 RIGHT JOIN (SELECT *, random() FROM (SELECT tt1.user_id, random() FROM user_buy_test_table as tt1 JOIN users_ref_test_table as ref ON tt1.user_id = ref.id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id ORDER BY 1; user_id --------------------------------------------------------------------- 1 3 (3 rows) -- should be able to pushdown since reference table is in the -- inner part of the left join SELECT user_id, sum(value_1) FROM (SELECT users_table.user_id, users_table.value_1, random() FROM users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum --------------------------------------------------------------------- 2 | 31248 3 | 15120 4 | 14994 5 | 8694 1 | 7590 (5 rows) -- same query as above, reference table is wrapped into a subquery SELECT user_id, sum(value_1) FROM (SELECT users_table.user_id, users_table.value_1, random() FROM users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) INNER JOIN (SELECT *, random() FROM events_reference_table) as ref_all ON (ref_all.value_2 = users_table.user_id) ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum --------------------------------------------------------------------- 2 | 31248 3 | 15120 4 | 14994 5 | 8694 1 | 7590 (5 rows) -- table function can be the inner relationship in a join SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN generate_series(1,10) AS users_ref_test_table(id) ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; count --------------------------------------------------------------------- 10 (1 row) -- table function cannot be used without subquery pushdown SELECT count(*) FROM user_buy_test_table JOIN generate_series(1,10) AS users_ref_test_table(id) ON user_buy_test_table.item_id = users_ref_test_table.id; count --------------------------------------------------------------------- 4 (1 row) -- table function can be the inner relationship in an outer join SELECT count(*) FROM (SELECT random() FROM user_buy_test_table LEFT JOIN generate_series(1,10) AS users_ref_test_table(id) ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; count --------------------------------------------------------------------- 10 (1 row) SELECT count(*) FROM user_buy_test_table LEFT JOIN (SELECT * FROM generate_series(1,10) id) users_ref_test_table ON user_buy_test_table.item_id = users_ref_test_table.id; count --------------------------------------------------------------------- 4 (1 row) -- table function cannot be the outer relationship in an outer join SELECT count(*) FROM (SELECT random() FROM user_buy_test_table RIGHT JOIN generate_series(1,10) AS users_ref_test_table(id) ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; ERROR: cannot pushdown the subquery DETAIL: There exist a table function in the outer part of the outer join SELECT count(*) FROM user_buy_test_table RIGHT JOIN (SELECT * FROM generate_series(1,10) id) users_ref_test_table ON user_buy_test_table.item_id = users_ref_test_table.id; ERROR: cannot pushdown the subquery DETAIL: There exist a table function in the outer part of the outer join -- volatile functions can be used as table expressions through recursive planning SET client_min_messages TO DEBUG; SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN random() AS users_ref_test_table(id) ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; DEBUG: Router planner cannot handle multi-shard select queries DEBUG: generating subplan XXX_1 for subquery SELECT id FROM random() users_ref_test_table(id) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT random() AS random FROM (public.user_buy_test_table JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id double precision)) users_ref_test_table(id) ON (((user_buy_test_table.item_id)::double precision OPERATOR(pg_catalog.>) users_ref_test_table.id)))) subquery_1 DEBUG: Router planner cannot handle multi-shard select queries count --------------------------------------------------------------------- 4 (1 row) -- can sneak in a volatile function as a parameter SELECT count(*) FROM (SELECT item_id FROM user_buy_test_table JOIN generate_series(random()::int,10) AS users_ref_test_table(id) ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1 WHERE item_id = 6; DEBUG: Router planner cannot handle multi-shard select queries DEBUG: generating subplan XXX_1 for subquery SELECT id FROM generate_series((random())::integer, 10) users_ref_test_table(id) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT user_buy_test_table.item_id FROM (public.user_buy_test_table JOIN (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) users_ref_test_table(id) ON ((user_buy_test_table.item_id OPERATOR(pg_catalog.>) users_ref_test_table.id)))) subquery_1 WHERE (item_id OPERATOR(pg_catalog.=) 6) DEBUG: Router planner cannot handle multi-shard select queries count --------------------------------------------------------------------- 0 (1 row) -- can perform a union with table function through recursive planning SELECT count(*) FROM (SELECT user_id FROM user_buy_test_table UNION ALL SELECT id FROM generate_series(1,10) AS users_ref_test_table(id)) subquery_1; DEBUG: Router planner cannot handle multi-shard select queries DEBUG: Router planner cannot handle multi-shard select queries DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.user_buy_test_table DEBUG: Creating router plan DEBUG: generating subplan XXX_2 for subquery SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer) UNION ALL SELECT users_ref_test_table.id FROM generate_series(1, 10) users_ref_test_table(id) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) subquery_1 DEBUG: Creating router plan count --------------------------------------------------------------------- 14 (1 row) RESET client_min_messages; -- subquery without FROM can be the inner relationship in a join SELECT count(*) FROM (SELECT random() FROM user_buy_test_table JOIN (SELECT 4 AS id) users_ref_test_table ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; count --------------------------------------------------------------------- 1 (1 row) -- subquery without FROM triggers subquery pushdown SELECT count(*) FROM user_buy_test_table JOIN (SELECT 5 AS id) users_ref_test_table ON user_buy_test_table.item_id = users_ref_test_table.id; count --------------------------------------------------------------------- 1 (1 row) -- subquery without FROM can be the inner relationship in an outer join SELECT count(*) FROM user_buy_test_table LEFT JOIN (SELECT 5 AS id) users_ref_test_table ON user_buy_test_table.item_id = users_ref_test_table.id; count --------------------------------------------------------------------- 4 (1 row) -- subquery without FROM cannot be the outer relationship in an outer join SELECT count(*) FROM user_buy_test_table RIGHT JOIN (SELECT 5 AS id) users_ref_test_table ON user_buy_test_table.item_id = users_ref_test_table.id; ERROR: cannot pushdown the subquery DETAIL: There exist a subquery without FROM in the outer part of the outer join -- can perform a union with subquery without FROM -- with pulling data to coordinator SET client_min_messages TO DEBUG; SELECT count(*) FROM (SELECT user_id FROM user_buy_test_table UNION ALL SELECT id FROM (SELECT 5 AS id) users_ref_test_table) subquery_1; DEBUG: Router planner cannot handle multi-shard select queries DEBUG: Router planner cannot handle multi-shard select queries DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.user_buy_test_table DEBUG: Creating router plan DEBUG: generating subplan XXX_2 for subquery SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer) UNION ALL SELECT users_ref_test_table.id FROM (SELECT 5 AS id) users_ref_test_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) subquery_1 DEBUG: Creating router plan count --------------------------------------------------------------------- 5 (1 row) -- union involving reference table and distributed table subqueries -- is supported with pulling data to coordinator SELECT * FROM (SELECT id FROM users_ref_test_table UNION SELECT user_id FROM user_buy_test_table) sub ORDER BY 1 DESC; DEBUG: Router planner cannot handle multi-shard select queries DEBUG: Router planner cannot handle multi-shard select queries DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.user_buy_test_table DEBUG: Creating router plan DEBUG: generating subplan XXX_2 for subquery SELECT users_ref_test_table.id FROM public.users_ref_test_table UNION SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT id FROM (SELECT intermediate_result.id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer)) sub ORDER BY id DESC DEBUG: Creating router plan id --------------------------------------------------------------------- 7 6 5 4 3 2 1 (7 rows) -- same query but inner query has wrapped into another subquery SELECT * FROM (SELECT id, random() * 0 FROM (SELECT id FROM users_ref_test_table) sub1 UNION SELECT user_id, random() * 0 FROM (SELECT user_id FROM user_buy_test_table) sub2) sub ORDER BY 1 DESC; DEBUG: Router planner cannot handle multi-shard select queries DEBUG: Router planner cannot handle multi-shard select queries DEBUG: generating subplan XXX_1 for subquery SELECT user_id, (random() OPERATOR(pg_catalog.*) (0)::double precision) FROM (SELECT user_buy_test_table.user_id FROM public.user_buy_test_table) sub2 DEBUG: Creating router plan DEBUG: generating subplan XXX_2 for subquery SELECT sub1.id, (random() OPERATOR(pg_catalog.*) (0)::double precision) FROM (SELECT users_ref_test_table.id FROM public.users_ref_test_table) sub1 UNION SELECT intermediate_result.user_id, intermediate_result."?column?" FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, "?column?" double precision) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT id, "?column?" FROM (SELECT intermediate_result.id, intermediate_result."?column?" FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(id integer, "?column?" double precision)) sub(id, "?column?") ORDER BY id DESC DEBUG: Creating router plan id | ?column? --------------------------------------------------------------------- 7 | 0 6 | 0 5 | 0 4 | 0 3 | 0 2 | 0 1 | 0 (7 rows) -- query can be pushed down when a reference table inside union query is -- joined with a distributed table SELECT * FROM (SELECT user_id FROM users_ref_test_table ref JOIN user_buy_test_table dis on (ref.id = dis.user_id) UNION SELECT user_id FROM user_buy_test_table) sub ORDER BY 1 DESC; DEBUG: Router planner cannot handle multi-shard select queries user_id --------------------------------------------------------------------- 7 3 2 1 (4 rows) -- same query with inner query wrapped with a subquery SELECT * FROM (SELECT user_id, random() * 0 FROM (SELECT dis.user_id FROM users_ref_test_table ref join user_buy_test_table dis on (ref.id = dis.user_id)) sub1 UNION SELECT user_id, random() * 0 FROM (SELECT user_id FROM user_buy_test_table) sub2) sub ORDER BY 1 DESC; DEBUG: Router planner cannot handle multi-shard select queries user_id | ?column? --------------------------------------------------------------------- 7 | 0 3 | 0 2 | 0 1 | 0 (4 rows) SELECT * FROM (SELECT user_id FROM users_ref_test_table ref JOIN user_buy_test_table dis on (ref.id = dis.user_id) WHERE id in (SELECT user_id from user_buy_test_table) UNION SELECT user_id FROM user_buy_test_table) sub ORDER BY 1 DESC; DEBUG: Router planner cannot handle multi-shard select queries ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT * FROM (SELECT user_id FROM users_ref_test_table ref JOIN user_buy_test_table dis on (ref.id = dis.user_id) UNION SELECT user_id FROM user_buy_test_table WHERE user_id in (select id from users_ref_test_table)) sub ORDER BY 1 DESC; DEBUG: Router planner cannot handle multi-shard select queries user_id --------------------------------------------------------------------- 3 2 1 (3 rows) -- query can be pushed down when a reference table inside union query is -- joined with a distributed table. outer join is also supported -- if reference table is in the inner part SELECT * FROM (SELECT user_id FROM users_ref_test_table ref RIGHT JOIN user_buy_test_table dis on (ref.id = dis.user_id) UNION SELECT user_id FROM user_buy_test_table) sub ORDER BY 1 DESC; DEBUG: Router planner cannot handle multi-shard select queries user_id --------------------------------------------------------------------- 7 3 2 1 (4 rows) -- query can be pushed down when a reference table inside union query is -- joined with a distributed table. reference table cannot be at -- the outer part. SELECT * FROM (SELECT user_id FROM users_ref_test_table ref LEFT JOIN user_buy_test_table dis on (ref.id = dis.user_id) UNION SELECT user_id FROM user_buy_test_table) sub ORDER BY 1 DESC; DEBUG: Router planner cannot handle multi-shard select queries ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join RESET client_min_messages; -- should be able to pushdown since reference table is in the -- inner part of the left join SELECT user_id, sum(value_1) FROM (SELECT users_table.user_id, users_table.value_1, random() FROM users_table LEFT JOIN events_table ON (users_table.user_id = events_table.user_id) LEFT JOIN events_reference_table ON (events_reference_table.value_2 = users_table.user_id) ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; user_id | sum --------------------------------------------------------------------- 2 | 31248 3 | 15120 4 | 14994 5 | 8694 1 | 7590 6 | 210 (6 rows) -- should not be able to pushdown since reference table is in the -- direct outer part of the left join SELECT user_id, sum(value_1) FROM (SELECT users_table.user_id, users_table.value_1, random() FROM events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should not be able to pushdown since reference table is in the -- direct outer part of the left join wrapped into a subquery SELECT * FROM (SELECT *, random() FROM events_reference_table) as ref_all LEFT JOIN users_table ON (users_table.user_id = ref_all.value_2); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should not be able to pushdown since reference table is in the -- outer part of the left join SELECT user_id, sum(value_1) FROM (SELECT users_table.user_id, users_table.value_1, random() FROM events_reference_table LEFT JOIN users_table ON (users_table.user_id = events_reference_table.value_2) LEFT JOIN events_table ON (events_table.user_id = users_table.user_id) ) as foo GROUP BY user_id ORDER BY 2 DESC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- should be able to pushdown since reference table is in the -- inner part of the left join SELECT * FROM ( SELECT DISTINCT foo.user_id FROM ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() FROM events_reference_table as "events" WHERE event_type > 2) as "temp_data_queries" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" WHERE user_id > 2 and value_2 = 1) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar ORDER BY 1; user_id --------------------------------------------------------------------- 3 4 5 (3 rows) -- the same query but this time reference table is in the outer part of the query SELECT * FROM ( SELECT DISTINCT foo.user_id FROM ((SELECT "events"."time", "events"."user_id" as event_user_id, value_2 as event_val_2, random() FROM events_reference_table as "events" WHERE event_type > 2) as "temp_data_queries" LEFT JOIN (SELECT "users"."user_id" FROM users_table as "users" WHERE user_id > 2 and value_2 = 1) as foo_in ON (event_val_2 = user_id)) as foo LEFT JOIN (SELECT user_id as user_user_id FROM users_table) as fooo ON (user_id = user_user_id)) as bar; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- we could even support the following where the subquery -- on the outer part of the left join contains a reference table SELECT max(events_all.cnt), events_all.usr_id FROM (SELECT users_table.user_id as usr_id, count(*) as cnt FROM events_reference_table INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; max | usr_id --------------------------------------------------------------------- 432 | 2 391 | 4 364 | 5 357 | 3 105 | 1 (5 rows) -- we still support the query when the join involving reference table is wrapped with a subquery SELECT max(events_all.cnt), events_all.usr_id FROM (SELECT *, random() FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt FROM events_reference_table INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all_inner ) AS events_all LEFT JOIN events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; max | usr_id --------------------------------------------------------------------- 432 | 2 391 | 4 364 | 5 357 | 3 105 | 1 (5 rows) -- should be fine even if the tables are deep inside subqueries SELECT max(events_all.cnt), events_all.usr_id FROM (SELECT *, random() FROM (SELECT *, random() FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all_inner ) AS events_all ) AS events_all LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; max | usr_id --------------------------------------------------------------------- 432 | 2 391 | 4 364 | 5 357 | 3 105 | 1 (5 rows) -- should be fine with FULL join as well SELECT max(events_all.cnt), events_all.usr_id FROM events_table FULL JOIN (SELECT *, random() FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt FROM events_reference_table INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all_inner ) AS events_all ON (events_all.usr_id = events_table.user_id) GROUP BY 2 ORDER BY 1 DESC, 2 DESC LIMIT 5; max | usr_id --------------------------------------------------------------------- 432 | 2 391 | 4 364 | 5 357 | 3 105 | 1 (5 rows) -- two levels of "(ref_table JOIN dist_table) LEFT JOIN" -- should be fine as well SELECT users_table.* FROM (SELECT events_all.*, random() FROM events_reference_table JOIN users_table USING(user_id) JOIN (SELECT *, random() FROM (SELECT users_table.user_id AS usr_id, count(*) AS cnt FROM (SELECT *,random FROM (SELECT *, random() FROM events_reference_table) as events_reference_table) as events_reference_table INNER JOIN users_table ON (users_table.user_id = events_reference_table.user_id) GROUP BY users_table.user_id) AS events_all_inner ) AS events_all ON (user_id = usr_id) ) AS events_all LEFT JOIN (SELECT *,random() FROM (SELECT *, random() FROM events_table) as events_table) as events_table ON (events_all.usr_id = events_table.user_id) LEFT JOIN users_table USING (user_id) ORDER BY 1,2,3,4 LIMIT 5; user_id | time | value_1 | value_2 | value_3 | value_4 --------------------------------------------------------------------- 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | 1 | Wed Nov 22 22:51:43.132261 2017 | 4 | 0 | 3 | (5 rows) -- we should be able to support OUTER joins involving -- reference tables even if the subquery is in WHERE clause SELECT count(*) FROM events_table WHERE user_id IN (SELECT subquery_1.user_id FROM (SELECT tt1.user_id, random() FROM users_table AS tt1 JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 RIGHT JOIN (SELECT *, random() FROM (SELECT tt1.user_id, random() FROM users_table AS tt1 JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id); count --------------------------------------------------------------------- 101 (1 row) -- we should be able to support OUTER joins involving -- reference tables even if the subquery is in the outer part of a JOIN SELECT count(*) FROM users_table RIGHT JOIN (SELECT subquery_1.user_id FROM (SELECT tt1.user_id, random() FROM users_table AS tt1 JOIN events_table AS tt2 ON tt1.user_id = tt2.user_id) subquery_1 RIGHT JOIN (SELECT *, random() FROM (SELECT tt1.user_id, random() FROM users_table AS tt1 JOIN events_reference_table AS REF ON tt1.user_id = ref.user_id) subquery_2_inner) subquery_2 ON subquery_1.user_id = subquery_2.user_id) AS foo USING (user_id); count --------------------------------------------------------------------- 12664199 (1 row) -- LATERAL JOINs used with INNER JOINs with reference tables SET citus.subquery_pushdown to ON; NOTICE: Setting citus.subquery_pushdown flag is discouraged becuase it forces the planner to pushdown certain queries, skipping relevant correctness checks. DETAIL: When enabled, the planner skips many correctness checks for subqueries and pushes down the queries to shards as-is. It means that the queries are likely to return wrong results unless the user is absolutely sure that pushing down the subquery is safe. This GUC is maintained only for backward compatibility, no new users are supposed to use it. The planner is capable of pushing down as much computation as possible to the shards depending on the query. SELECT user_id, lastseen FROM (SELECT "some_users_data".user_id, lastseen FROM (SELECT filter_users_1.user_id, time AS lastseen FROM (SELECT user_where_1_1.user_id FROM (SELECT "users"."user_id" FROM users_reference_table as "users" WHERE user_id > 0 and user_id < 5 and value_1 > 1) user_where_1_1 INNER JOIN (SELECT "users"."user_id" FROM users_reference_table as "users" WHERE user_id > 0 and user_id < 5 and value_2 > 2) user_where_1_join_1 ON ("user_where_1_1".user_id = "user_where_1_join_1".user_id)) filter_users_1 JOIN LATERAL (SELECT user_id, time FROM events_reference_table as "events" WHERE user_id > 0 and user_id < 5 AND user_id = filter_users_1.user_id ORDER BY time DESC LIMIT 1) "last_events_1" ON TRUE ORDER BY time DESC LIMIT 10) "some_recent_users" JOIN LATERAL (SELECT "users".user_id FROM users_reference_table as "users" WHERE "users"."user_id" = "some_recent_users"."user_id" AND "users"."value_2" > 2 LIMIT 1) "some_users_data" ON TRUE ORDER BY lastseen DESC LIMIT 10) "some_users" ORDER BY user_id DESC LIMIT 10; user_id | lastseen --------------------------------------------------------------------- 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 1 | Thu Nov 23 21:54:46.924477 2017 (10 rows) SET citus.subquery_pushdown to OFF; -- NESTED INNER JOINs with reference tables SELECT count(*) AS value, "generated_group_field" FROM (SELECT DISTINCT "pushedDownQuery"."user_id", "generated_group_field" FROM (SELECT "eventQuery"."user_id", "eventQuery"."time", random(), ("eventQuery"."value_2") AS "generated_group_field" FROM (SELECT * FROM (SELECT "events"."time", "events"."user_id", "events"."value_2" FROM events_table as "events" WHERE user_id > 0 and user_id < 4 AND event_type IN (4, 5) ) "temp_data_queries" INNER JOIN (SELECT user_where_1_1.real_user_id FROM (SELECT "users"."user_id" as real_user_id FROM users_reference_table as "users" WHERE user_id > 0 and user_id < 4 and value_2 > 3 ) user_where_1_1 INNER JOIN (SELECT "users"."user_id" FROM users_reference_table as "users" WHERE user_id > 0 and user_id < 4 and value_3 > 3 ) user_where_1_join_1 ON ("user_where_1_1".real_user_id = "user_where_1_join_1".user_id)) "user_filters_1" ON ("temp_data_queries".user_id = "user_filters_1".real_user_id)) "eventQuery") "pushedDownQuery") "pushedDownQuery" GROUP BY "generated_group_field" ORDER BY generated_group_field DESC, value DESC; value | generated_group_field --------------------------------------------------------------------- 2 | 5 1 | 3 3 | 2 3 | 1 1 | 0 (5 rows) -- single level inner joins with reference tables SELECT "value_3", count(*) AS cnt FROM (SELECT "value_3", "user_id", random() FROM (SELECT users_in_segment_1.user_id, value_3 FROM (SELECT user_id, value_3 * 2 as value_3 FROM (SELECT user_id, value_3 FROM (SELECT "users"."user_id", value_3 FROM users_reference_table as "users" WHERE user_id > 1 and user_id < 4 and value_2 > 2 ) simple_user_where_1 ) all_buckets_1 ) users_in_segment_1 JOIN (SELECT "users"."user_id" FROM users_reference_table as "users" WHERE user_id > 1 and user_id < 4 and value_2 > 3 ) some_users_data ON ("users_in_segment_1".user_id = "some_users_data".user_id) ) segmentalias_1) "tempQuery" GROUP BY "value_3" ORDER BY cnt, value_3 DESC LIMIT 10; value_3 | cnt --------------------------------------------------------------------- 0 | 7 10 | 21 4 | 21 8 | 28 6 | 28 2 | 35 (6 rows) -- nested LATERAL JOINs with reference tables SET citus.subquery_pushdown to ON; NOTICE: Setting citus.subquery_pushdown flag is discouraged becuase it forces the planner to pushdown certain queries, skipping relevant correctness checks. DETAIL: When enabled, the planner skips many correctness checks for subqueries and pushes down the queries to shards as-is. It means that the queries are likely to return wrong results unless the user is absolutely sure that pushing down the subquery is safe. This GUC is maintained only for backward compatibility, no new users are supposed to use it. The planner is capable of pushing down as much computation as possible to the shards depending on the query. SELECT * FROM (SELECT "some_users_data".user_id, "some_recent_users".value_3 FROM (SELECT filter_users_1.user_id, value_3 FROM (SELECT "users"."user_id" FROM users_reference_table as "users" WHERE user_id > 2 and user_id < 5 and users.value_2 = 3) filter_users_1 JOIN LATERAL (SELECT user_id, value_3 FROM events_reference_table as "events" WHERE user_id > 2 and user_id < 5 AND ("events".user_id = "filter_users_1".user_id) ORDER BY value_3 DESC LIMIT 1) "last_events_1" ON true ORDER BY value_3 DESC LIMIT 10) "some_recent_users" JOIN LATERAL (SELECT "users".user_id FROM users_reference_table as "users" WHERE "users"."user_id" = "some_recent_users"."user_id" AND users.value_2 > 3 LIMIT 1) "some_users_data" ON true ORDER BY value_3 DESC LIMIT 10) "some_users" ORDER BY value_3 DESC LIMIT 10; user_id | value_3 --------------------------------------------------------------------- 3 | 5 3 | 5 3 | 5 4 | 4 4 | 4 4 | 4 (6 rows) SET citus.subquery_pushdown to OFF; -- LEFT JOINs used with INNER JOINs should not error out since reference table joined -- with hash table that Citus can push down SELECT count(*) AS cnt, "generated_group_field" FROM (SELECT "eventQuery"."user_id", random(), generated_group_field FROM (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM (SELECT "events"."time", "events"."user_id" as event_user_id FROM events_table as "events" WHERE user_id > 4) "temp_data_queries" INNER JOIN (SELECT "users"."user_id" FROM users_reference_table as "users" WHERE user_id > 2 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" LEFT JOIN (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" FROM users_table as "users") "left_group_by_1" ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; cnt | generated_group_field --------------------------------------------------------------------- 336 | 2 210 | 1 210 | 3 126 | 4 126 | 5 84 | 0 (6 rows) -- RIGHT JOINs used with INNER JOINs should error out since reference table exist in the -- right side of the RIGHT JOIN. SELECT count(*) AS cnt, "generated_group_field" FROM (SELECT "eventQuery"."user_id", random(), generated_group_field FROM (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM (SELECT "events"."time", "events"."user_id" as event_user_id FROM events_table as "events" WHERE user_id > 2) "temp_data_queries" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" WHERE user_id > 2 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id = "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" FROM users_reference_table as "users") "right_group_by_1" ON ("right_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- right join where the inner part of the join includes a reference table -- joined with hash partitioned table using non-equi join SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT t1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(hasdone_event, 'Has not done event') AS hasdone_event FROM ( ( SELECT u.user_id, 'step=>1'::text AS event, e.time FROM users_table AS u, events_reference_table AS e WHERE u.user_id > e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type IN (1, 2) ) ) t1 RIGHT JOIN ( SELECT DISTINCT user_id, 'Has done event'::TEXT AS hasdone_event FROM events_table AS e WHERE e.user_id >= 1 AND e.user_id <= 3 AND e.event_type IN (3, 4) ) t2 ON (t1.user_id = t2.user_id) GROUP BY t1.user_id, hasdone_event ) t GROUP BY user_id, hasdone_event ORDER BY user_id; user_id | sum | length | hasdone_event --------------------------------------------------------------------- 2 | 72 | 14 | Has done event 3 | 238 | 14 | Has done event | 1 | 14 | Has done event (3 rows) -- a similar query as the above, with non-partition key comparison SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT t1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(hasdone_event, 'Has not done event') AS hasdone_event FROM ( ( SELECT u.user_id, 'step=>1'::text AS event, e.time FROM users_table AS u, events_reference_table AS e WHERE u.value_1 > e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type >= 2 AND e.event_type < 3 ) ) t1 RIGHT JOIN ( SELECT DISTINCT user_id, 'Has done event'::TEXT AS hasdone_event FROM events_table AS e WHERE e.user_id >= 1 AND e.user_id <= 3 AND e.event_type >= 3 AND e.event_type < 4 ) t2 ON (t1.user_id = t2.user_id) GROUP BY t1.user_id, hasdone_event ) t GROUP BY user_id, hasdone_event ORDER BY user_id; user_id | sum | length | hasdone_event --------------------------------------------------------------------- 1 | 55 | 14 | Has done event 2 | 88 | 14 | Has done event 3 | 83 | 14 | Has done event (3 rows) -- LEFT JOINs used with INNER JOINs -- events_table and users_reference_table joined -- with event_table.non_part_key < reference_table.any_key SELECT count(*) AS cnt, "generated_group_field" FROM (SELECT "eventQuery"."user_id", random(), generated_group_field FROM (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM (SELECT "events"."time", "events"."user_id" as event_user_id FROM events_table as "events" WHERE user_id > 2) "temp_data_queries" INNER JOIN (SELECT "users"."user_id" FROM users_reference_table as "users" WHERE user_id > 2 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" FROM users_table as "users") "left_group_by_1" ON ("left_group_by_1".user_id = "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; cnt | generated_group_field --------------------------------------------------------------------- 737 | 5 679 | 1 591 | 2 479 | 3 374 | 4 159 | 0 (6 rows) -- Outer subquery with reference table SELECT "some_users_data".user_id, lastseen FROM (SELECT user_id, max(time) AS lastseen FROM (SELECT user_id, time FROM (SELECT user_id, time FROM events_reference_table as "events" WHERE user_id > 1 and user_id < 4) "events_1" ORDER BY time DESC) "recent_events_1" GROUP BY user_id ORDER BY max(TIME) DESC) "some_recent_users" FULL JOIN (SELECT "users".user_id FROM users_table as "users" WHERE users.value_2 > 2 and users.value_2 < 4) "some_users_data" ON "some_users_data"."user_id" = "some_recent_users"."user_id" ORDER BY user_id limit 50; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- -- UNIONs and JOINs with reference tables, should error out -- SELECT ("final_query"."event_types") as types FROM ( SELECT *, random() FROM ( SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" FROM ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( (SELECT * FROM (SELECT "events"."user_id", "events"."time", 0 AS event FROM events_table as "events" WHERE event_type IN (1, 2) ) events_subquery_1) UNION (SELECT * FROM (SELECT "events"."user_id", "events"."time", 1 AS event FROM events_reference_table as "events" WHERE event_type IN (3, 4) ) events_subquery_2) UNION (SELECT * FROM (SELECT "events"."user_id", "events"."time", 2 AS event FROM events_table as "events" WHERE event_type IN (5, 6) ) events_subquery_3) UNION (SELECT * FROM (SELECT "events"."user_id", "events"."time", 3 AS event FROM events_table as "events" WHERE event_type IN (1, 6)) events_subquery_4)) t1 GROUP BY "t1"."user_id") AS t) "q" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" WHERE value_1 > 2 and value_1 < 4) AS t ON (t.user_id = q.user_id)) as final_query ORDER BY types LIMIT 5; types --------------------------------------------------------------------- 0 0 0 0 0 (5 rows) -- reference table exist in the subquery of union SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM ( SELECT *, random() FROM (SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" FROM ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( (SELECT * FROM (SELECT "events"."time", 0 AS event, "events"."user_id" FROM events_table as "events" WHERE event_type IN (1, 2) ) events_subquery_1) UNION (SELECT * FROM ( SELECT * FROM ( SELECT max("users"."time"), 0 AS event, "users"."user_id" FROM events_reference_table as "events", users_table as "users" WHERE events.user_id = users.user_id AND event_type IN (1, 2) GROUP BY "users"."user_id" ) as events_subquery_5 ) events_subquery_2) UNION (SELECT * FROM (SELECT "events"."time", 2 AS event, "events"."user_id" FROM events_table as "events" WHERE event_type IN (3, 4) ) events_subquery_3) UNION (SELECT * FROM (SELECT "events"."time", 3 AS event, "events"."user_id" FROM events_table as "events" WHERE event_type IN (5, 6)) events_subquery_4) ) t1 GROUP BY "t1"."user_id") AS t) "q" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" WHERE value_1 > 2 and value_1 < 4) AS t ON (t.user_id = q.user_id)) as final_query GROUP BY types ORDER BY types; types | sumofeventtype --------------------------------------------------------------------- 0 | 217 2 | 191 3 | 31 (3 rows) -- -- Should error out with UNION ALL Queries on reference tables -- SELECT ("final_query"."event_types") as types, count(*) AS sumOfEventType FROM ( SELECT *, random() FROM ( SELECT "t"."user_id", "t"."time", unnest("t"."collected_events") AS "event_types" FROM ( SELECT "t1"."user_id", min("t1"."time") AS "time", array_agg(("t1"."event") ORDER BY TIME ASC, event DESC) AS collected_events FROM ( (SELECT * FROM (SELECT "events"."user_id", "events"."time", 0 AS event FROM events_table as "events" WHERE event_type IN (1, 2) ) events_subquery_1) UNION ALL (SELECT * FROM (SELECT "events"."user_id", "events"."time", 1 AS event FROM events_table as "events" WHERE event_type IN (3, 4) ) events_subquery_2) UNION ALL (SELECT * FROM (SELECT "events"."user_id", "events"."time", 2 AS event FROM events_reference_table as "events" WHERE event_type IN (5, 6) ) events_subquery_3) UNION ALL (SELECT * FROM (SELECT "events"."user_id", "events"."time", 3 AS event FROM events_table as "events" WHERE event_type IN (1, 6)) events_subquery_4)) t1 GROUP BY "t1"."user_id") AS t) "q" INNER JOIN (SELECT "users"."user_id" FROM users_table as "users" WHERE value_1 > 2 and value_1 < 4) AS t ON (t.user_id = q.user_id)) as final_query GROUP BY types ORDER BY types; types | sumofeventtype --------------------------------------------------------------------- 0 | 191 1 | 191 2 | 31 3 | 120 (4 rows) -- just a sanity check that we don't allow this if the reference table is on the -- left part of the left join SELECT count(*) FROM (SELECT random() FROM users_ref_test_table LEFT JOIN user_buy_test_table ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- we do allow non equi join among subqueries via recursive planning SET client_min_messages TO DEBUG1; SELECT count(*) FROM (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.item_id > users_ref_test_table.id) subquery_1, (SELECT user_buy_test_table.user_id, random() FROM user_buy_test_table LEFT JOIN users_ref_test_table ON user_buy_test_table.user_id > users_ref_test_table.id) subquery_2 WHERE subquery_1.user_id != subquery_2.user_id ; DEBUG: generating subplan XXX_1 for subquery SELECT user_buy_test_table.user_id, random() AS random FROM (public.user_buy_test_table LEFT JOIN public.users_ref_test_table ON ((user_buy_test_table.user_id OPERATOR(pg_catalog.>) users_ref_test_table.id))) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT user_buy_test_table.user_id, random() AS random FROM (public.user_buy_test_table LEFT JOIN public.users_ref_test_table ON ((user_buy_test_table.item_id OPERATOR(pg_catalog.>) users_ref_test_table.id)))) subquery_1, (SELECT intermediate_result.user_id, intermediate_result.random FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, random double precision)) subquery_2 WHERE (subquery_1.user_id OPERATOR(pg_catalog.<>) subquery_2.user_id) count --------------------------------------------------------------------- 67 (1 row) -- we could not push this query not due to non colocated -- subqueries (i.e., they are recursively planned) -- but due to outer join restrictions SELECT count(*) AS cnt, "generated_group_field" FROM (SELECT "eventQuery"."user_id", random(), generated_group_field FROM (SELECT "multi_group_wrapper_1".*, generated_group_field, random() FROM (SELECT * FROM (SELECT "events"."time", "events"."user_id" as event_user_id FROM events_table as "events" WHERE user_id > 2) "temp_data_queries" INNER JOIN (SELECT "users"."user_id" FROM users_reference_table as "users" WHERE user_id > 2 and value_2 = 5) "user_filters_1" ON ("temp_data_queries".event_user_id < "user_filters_1".user_id)) AS "multi_group_wrapper_1" RIGHT JOIN (SELECT "users"."user_id" AS "user_id", value_2 AS "generated_group_field" FROM users_table as "users") "left_group_by_1" ON ("left_group_by_1".user_id > "multi_group_wrapper_1".event_user_id)) "eventQuery") "pushedDownQuery" group BY "generated_group_field" ORDER BY cnt DESC, generated_group_field ASC LIMIT 10; DEBUG: generating subplan XXX_1 for subquery SELECT user_id, value_2 AS generated_group_field FROM public.users_table users DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS cnt, generated_group_field FROM (SELECT "eventQuery".user_id, random() AS random, "eventQuery".generated_group_field FROM (SELECT multi_group_wrapper_1."time", multi_group_wrapper_1.event_user_id, multi_group_wrapper_1.user_id, left_group_by_1.generated_group_field, random() AS random FROM ((SELECT temp_data_queries."time", temp_data_queries.event_user_id, user_filters_1.user_id FROM ((SELECT events."time", events.user_id AS event_user_id FROM public.events_table events WHERE (events.user_id OPERATOR(pg_catalog.>) 2)) temp_data_queries JOIN (SELECT users.user_id FROM public.users_reference_table users WHERE ((users.user_id OPERATOR(pg_catalog.>) 2) AND (users.value_2 OPERATOR(pg_catalog.=) 5))) user_filters_1 ON ((temp_data_queries.event_user_id OPERATOR(pg_catalog.<) user_filters_1.user_id)))) multi_group_wrapper_1 RIGHT JOIN (SELECT intermediate_result.user_id, intermediate_result.generated_group_field FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, generated_group_field integer)) left_group_by_1 ON ((left_group_by_1.user_id OPERATOR(pg_catalog.>) multi_group_wrapper_1.event_user_id)))) "eventQuery") "pushedDownQuery" GROUP BY generated_group_field ORDER BY (count(*)) DESC, generated_group_field LIMIT 10 ERROR: cannot pushdown the subquery DETAIL: Complex subqueries, CTEs and local tables cannot be in the outer part of an outer join with a distributed table RESET client_min_messages; -- two hash partitioned relations are not joined -- on partiton keys although reference table is fine -- to push down SELECT u1.user_id, count(*) FROM events_table as e1, users_table as u1 WHERE event_type IN (SELECT event_type FROM events_reference_table as e2 WHERE value_2 = 1 AND value_3 > 3 AND e1.value_2 > e2.value_2 ) AND u1.user_id > e1.user_id GROUP BY 1 ORDER BY 2 DESC, 1 DESC LIMIT 5; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns SELECT foo.user_id FROM ( SELECT m.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) WHERE event_type > 100 ) as foo; user_id --------------------------------------------------------------------- (0 rows) -- not pushdownable since group by is on the reference table column -- recursively planned SELECT foo.user_id FROM ( SELECT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) GROUP BY r.user_id ) as foo ORDER BY 1 DESC; user_id --------------------------------------------------------------------- 6 5 4 3 2 1 (6 rows) -- not pushdownable since the group by contains at least one distributed table -- recursively planned SELECT foo.user_id FROM ( SELECT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) GROUP BY r.user_id, m.user_id ) as foo ORDER BY 1 LIMIT 3; user_id --------------------------------------------------------------------- 1 2 3 (3 rows) -- not pushdownable since distinct is on the reference table column -- recursively planned SELECT foo.user_id FROM ( SELECT DISTINCT r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) ) as foo ORDER BY 1 DESC LIMIT 5; user_id --------------------------------------------------------------------- 6 6 6 6 6 (5 rows) -- not supported since distinct on is on the reference table column -- but recursively planned SELECT foo.user_id FROM ( SELECT DISTINCT ON(r.user_id) r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) ) as foo; user_id --------------------------------------------------------------------- 1 2 3 4 5 6 (6 rows) -- supported since the distinct on contains at least one distributed table SELECT foo.user_id FROM ( SELECT DISTINCT ON(r.user_id, m.user_id) r.user_id, random() FROM users_table m JOIN events_reference_table r ON int4eq(m.user_id, r.user_id) ) as foo ORDER BY 1 LIMIT 3; user_id --------------------------------------------------------------------- 1 2 3 (3 rows) -- should be able to pushdown since one of the subqueries has distinct on reference tables -- and there is only reference table in that subquery SELECT distinct_users, event_type, time FROM (SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN (SELECT DISTINCT user_id as distinct_users FROM users_reference_table) users_ref ON (events_dist.user_id = users_ref.distinct_users) ORDER BY time DESC LIMIT 5 OFFSET 0; distinct_users | event_type | time --------------------------------------------------------------------- 1 | 6 | Thu Nov 23 21:54:46.924477 2017 4 | 1 | Thu Nov 23 18:10:21.338399 2017 3 | 2 | Thu Nov 23 18:08:26.550729 2017 2 | 1 | Thu Nov 23 17:26:14.563216 2017 3 | 4 | Thu Nov 23 16:44:41.903713 2017 (5 rows) -- the same query wuth multiple reference tables in the subquery SELECT distinct_users, event_type, time FROM (SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN (SELECT DISTINCT users_reference_table.user_id as distinct_users FROM users_reference_table, events_reference_table WHERE events_reference_table.user_id = users_reference_table.user_id AND events_reference_table.event_type IN (1,2,3,4)) users_ref ON (events_dist.user_id = users_ref.distinct_users) ORDER BY time DESC LIMIT 5 OFFSET 0; distinct_users | event_type | time --------------------------------------------------------------------- 1 | 6 | Thu Nov 23 21:54:46.924477 2017 4 | 1 | Thu Nov 23 18:10:21.338399 2017 3 | 2 | Thu Nov 23 18:08:26.550729 2017 2 | 1 | Thu Nov 23 17:26:14.563216 2017 3 | 4 | Thu Nov 23 16:44:41.903713 2017 (5 rows) -- similar query as the above, but with group bys SELECT distinct_users, event_type, time FROM (SELECT user_id, time, event_type FROM events_table) as events_dist INNER JOIN (SELECT user_id as distinct_users FROM users_reference_table GROUP BY distinct_users) users_ref ON (events_dist.user_id = users_ref.distinct_users) ORDER BY time DESC LIMIT 5 OFFSET 0; distinct_users | event_type | time --------------------------------------------------------------------- 1 | 6 | Thu Nov 23 21:54:46.924477 2017 4 | 1 | Thu Nov 23 18:10:21.338399 2017 3 | 2 | Thu Nov 23 18:08:26.550729 2017 2 | 1 | Thu Nov 23 17:26:14.563216 2017 3 | 4 | Thu Nov 23 16:44:41.903713 2017 (5 rows) -- should not push down this query since there is a distributed table (i.e., events_table) -- which is not in the DISTINCT clause. Recursive planning also fails since router execution -- is disabled SELECT * FROM ( SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 ) as foo; user_id --------------------------------------------------------------------- (0 rows) SELECT * FROM ( SELECT users_reference_table.user_id FROM users_reference_table, events_table WHERE users_reference_table.user_id = events_table.value_4 GROUP BY 1 ) as foo; user_id --------------------------------------------------------------------- (0 rows) -- similiar to the above examples, this time there is a subquery -- whose output is not in the DISTINCT clause SELECT * FROM ( SELECT DISTINCT users_reference_table.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table) as us_events WHERE users_reference_table.user_id = us_events.user_id ) as foo ORDER BY 1; user_id --------------------------------------------------------------------- 1 2 3 4 5 6 (6 rows) -- the following query is safe to push down since the DISTINCT clause include distribution column SELECT * FROM ( SELECT DISTINCT users_reference_table.user_id, us_events.user_id FROM users_reference_table, (SELECT user_id, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id ) as foo ORDER BY 1 DESC LIMIT 4; user_id | user_id --------------------------------------------------------------------- 6 | 6 5 | 5 4 | 4 3 | 3 (4 rows) -- should not pushdown since there is a non partition column on the DISTINCT clause -- Recursive planning also fails since router execution -- is disabled SELECT * FROM ( SELECT DISTINCT users_reference_table.user_id, us_events.value_4 FROM users_reference_table, (SELECT user_id, value_4, random() FROM events_table WHERE event_type IN (2,3)) as us_events WHERE users_reference_table.user_id = us_events.user_id ) as foo ORDER BY 1 DESC LIMIT 4; user_id | value_4 --------------------------------------------------------------------- 6 | 5 | 4 | 3 | (4 rows) -- test the read_intermediate_result() for GROUP BYs BEGIN; SELECT broadcast_intermediate_result('squares', 'SELECT s, s*s FROM generate_series(1,200) s'); broadcast_intermediate_result --------------------------------------------------------------------- 200 (1 row) -- single appereance of read_intermediate_result SELECT DISTINCT user_id FROM users_table JOIN (SELECT max(res.val) as mx FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int) GROUP BY res.val_square) squares ON (mx = user_id) ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 1 2 3 4 5 (5 rows) -- similar to the above, with DISTINCT on intermediate result SELECT DISTINCT user_id FROM users_table JOIN (SELECT DISTINCT res.val AS mx FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) squares ON (mx = user_id) ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 1 2 3 4 5 (5 rows) -- single appereance of read_intermediate_result but inside a subquery SELECT DISTINCT user_id FROM users_table JOIN ( SELECT *,random() FROM (SELECT max(res.val) as mx FROM (SELECT val, val_square FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int)) res GROUP BY res.val_square) foo) squares ON (mx = user_id) ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 1 2 3 4 5 (5 rows) -- multiple read_intermediate_results in the same subquery is OK SELECT DISTINCT user_id FROM users_table JOIN (SELECT max(res.val) as mx FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int), read_intermediate_result('squares', 'binary') AS res2 (val int, val_square int) WHERE res.val = res2.val_square GROUP BY res2.val_square) squares ON (mx = user_id) ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 1 4 (2 rows) -- mixed recurring tuples should be supported SELECT DISTINCT user_id FROM users_table JOIN (SELECT max(res.val) as mx FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int), generate_series(0, 10) i WHERE res.val = i GROUP BY i) squares ON (mx = user_id) ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 1 2 3 4 5 (5 rows) -- should recursively plan since -- there are no columns on the GROUP BY from the distributed table SELECT DISTINCT user_id FROM users_reference_table JOIN (SELECT max(val_square) as mx FROM read_intermediate_result('squares', 'binary') AS res (val int, val_square int), events_table WHERE events_table.user_id = res.val GROUP BY res.val) squares ON (mx = user_id) ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 1 4 (2 rows) ROLLBACK; -- should work since we're using an immutable function as recurring tuple SELECT DISTINCT user_id FROM users_table JOIN (SELECT max(i+5)as mx FROM generate_series(0, 10) as i GROUP BY i) squares ON (mx = user_id) ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 5 6 (2 rows) -- should recursively plan since we're -- using an immutable function as recurring tuple -- along with a distributed table, where GROUP BY is -- on the recurring tuple SELECT DISTINCT user_id FROM users_reference_table JOIN (SELECT max(i+5)as mx FROM generate_series(0, 10) as i, events_table WHERE events_table.user_id = i GROUP BY i) squares ON (mx = user_id) ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 6 (1 row) -- outer part of the LEFT JOIN consists only reference tables, so we cannot push down -- we have different combinations for ON condition, true/false/two column join/single column filter SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON true; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON false; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id > 5); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 5); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref1.id = user_buy_test_table.user_id); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id LEFT JOIN user_buy_test_table ON (ref2.id = user_buy_test_table.user_id); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- outer part of the LEFT JOIN consists only reference tables within a subquery, so we cannot push down -- we have different combinations for ON condition, true/false/two column join/single column filter SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON true; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON false; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id > 5); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 LEFT JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (user_buy_test_table.user_id > 19); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- one example where unsupported outer join is deep inside a subquery SELECT *, random() FROM ( SELECT *,random() FROM user_buy_test_table WHERE user_id > ( SELECT count(*) FROM (SELECT *,random() FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id) as bar) as foo LEFT JOIN (SELECT *, random() FROM (SELECT *,random() FROM user_buy_test_table d1 JOIN user_buy_test_table d2 USING (user_id)) as bar_inner ) as bar ON true)) as boo; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join -- In theory, we should be able to pushdown this query -- however, as the LEFT JOIN condition is between a reference table and the distributed table -- Postgres generates a LEFT JOIN alternative among those tables SELECT count(*) FROM (SELECT ref1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.id = user_buy_test_table.user_id); ERROR: cannot pushdown the subquery DETAIL: Complex subqueries, CTEs and local tables cannot be in the outer part of an outer join with a distributed table -- same as the above query, but this time LEFT JOIN condition is between distributed tables -- so Postgres doesn't generate join restriction between reference and distributed tables SELECT count(*) FROM (SELECT u1.*, random() FROM users_ref_test_table ref1 INNER JOIN user_buy_test_table u1 on ref1.id = u1.user_id) as foo LEFT JOIN user_buy_test_table ON (foo.user_id = user_buy_test_table.user_id); count --------------------------------------------------------------------- 3 (1 row) -- outer part of the LEFT JOIN consists only intermediate result due to LIMIT, so we cannot push down SELECT count(*) FROM (SELECT ref1.* FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id LIMIT 5) as foo LEFT JOIN user_buy_test_table ON true; ERROR: cannot pushdown the subquery DETAIL: Complex subqueries, CTEs and local tables cannot be in the outer part of an outer join with a distributed table -- should be fine as OUTER part is the distributed table SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON true; count --------------------------------------------------------------------- 24 (1 row) SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON false; count --------------------------------------------------------------------- 4 (1 row) SELECT count(*) FROM users_ref_test_table ref1 INNER JOIN users_ref_test_table ref2 on ref1.id = ref2.id RIGHT JOIN user_buy_test_table ON (ref1.id = user_id); count --------------------------------------------------------------------- 4 (1 row) DROP TABLE user_buy_test_table; DROP TABLE users_ref_test_table; DROP TABLE users_return_test_table;