CREATE SCHEMA recurring_outer_join; SET search_path TO recurring_outer_join; SET citus.next_shard_id TO 1520000; SET citus.shard_count TO 32; SET client_min_messages TO DEBUG1; CREATE TABLE dist_1 (a int, b int); SELECT create_distributed_table('dist_1', 'a'); INSERT INTO dist_1 VALUES (1, 10), (1, 11), (1, 12), (2, 20), (2, 21), (2, 22), (2, 23), (3, 30), (3, 31), (3, 32), (3, 33), (3, 34), (7, 40), (7, 41), (7, 42); CREATE TABLE dist_2_columnar(LIKE dist_1) USING columnar; INSERT INTO dist_2_columnar SELECT * FROM dist_1; SELECT create_distributed_table('dist_2_columnar', 'a'); CREATE TABLE dist_3_partitioned(LIKE dist_1) PARTITION BY RANGE(a); CREATE TABLE dist_3_partitioned_p1 PARTITION OF dist_3_partitioned FOR VALUES FROM (0) TO (2); CREATE TABLE dist_3_partitioned_p2 PARTITION OF dist_3_partitioned FOR VALUES FROM (2) TO (4); CREATE TABLE dist_3_partitioned_p3 PARTITION OF dist_3_partitioned FOR VALUES FROM (4) TO (100); SELECT create_distributed_table('dist_3_partitioned', 'a'); INSERT INTO dist_3_partitioned SELECT * FROM dist_1; CREATE TABLE ref_1 (a int, b int); SELECT create_reference_table('ref_1'); INSERT INTO ref_1 VALUES (1, 100), (1, 11), (null, 102), (2, 200), (2, 21), (null, 202), (2, 203), (4, 300), (4, 301), (null, 302), (4, 303), (4, 304), (null, 400), (null, 401), (null, 402); CREATE TABLE local_1 (a int, b int); INSERT INTO local_1 VALUES (null, 1000), (1, 11), (1, 100), (5, 2000), (5, 2001), (5, 2002), (null, 2003), (6, 3000), (6, 3001), (6, 3002), (null, 3003), (6, 3004), (null, 4000), (null, 4001), (null, 4002); CREATE TABLE citus_local_1(LIKE local_1); INSERT INTO citus_local_1 SELECT * FROM local_1; SELECT citus_add_local_table_to_metadata('citus_local_1'); CREATE TABLE dist_4_different_colocation_group(LIKE dist_1); INSERT INTO dist_4_different_colocation_group SELECT * FROM local_1; DELETE FROM dist_4_different_colocation_group WHERE a IS NULL; SELECT create_distributed_table('dist_4_different_colocation_group', 'a', colocate_with=>'none'); CREATE TABLE dist_5_with_pkey(LIKE dist_1); INSERT INTO dist_5_with_pkey VALUES (1, 11), (2, 22), (3, 34), (7, 40); SELECT create_distributed_table('dist_5_with_pkey', 'a'); ALTER TABLE dist_5_with_pkey ADD CONSTRAINT pkey_1 PRIMARY KEY (a); -- -- basic cases -- SELECT COUNT(*) FROM ref_1 LEFT JOIN dist_1 USING (a); SELECT COUNT(*) FROM ref_1 LEFT JOIN dist_1 USING (a,b); SELECT COUNT(*) FROM dist_1 RIGHT JOIN ref_1 USING (a); SELECT COUNT(*) FROM ref_1 FULL JOIN dist_1 USING (a); SELECT COUNT(*) FROM dist_1 FULL JOIN ref_1 USING (a); SELECT COUNT(*) FROM dist_1 FULL JOIN ref_1 USING (a,b); -- distributed side is a subquery SELECT COUNT(*) FROM ref_1 LEFT JOIN (SELECT * FROM dist_1) q USING (a); -- distributed side is a join tree SELECT COUNT(*) FROM ref_1 LEFT JOIN (dist_1 t1 JOIN dist_1 t2 USING (a)) q USING (a); SELECT COUNT(*) FROM ref_1 LEFT JOIN (dist_1 t1 LEFT JOIN dist_1 t2 USING (a)) q USING (a); -- use functions/VALUES clauses/intrermediate results as the recurring rel -- values clause SELECT COUNT(*) FROM (SELECT a, b FROM (VALUES (1, 'one'), (2, 'two'), (3, 'three')) as t(a,b)) recurring LEFT JOIN dist_1 USING (a); -- generate_series() SELECT COUNT(*) FROM dist_1 RIGHT JOIN (SELECT a FROM generate_series(1, 10) a) recurring USING (a); -- materialized cte WITH dist_1_materialized AS MATERIALIZED ( SELECT * FROM dist_1 ) SELECT COUNT(*) FROM dist_1 RIGHT JOIN dist_1_materialized USING (a); -- offset in the subquery SELECT COUNT(*) FROM dist_1 t1 RIGHT JOIN (SELECT * FROM dist_1 OFFSET 0) t2 USING (a); -- limit in the subquery SELECT COUNT(*) FROM dist_1 t1 RIGHT JOIN (SELECT * FROM dist_1 ORDER BY 1,2 LIMIT 2) t2 USING (a); -- local-distributed join as the recurring rel -- -- We plan local-distributed join by converting local_1 into an intermediate result -- and hence it becomes a recurring rel. Then we convert distributed - inner side of -- the right join (dist_1) into an intermediate result too and this makes rhs of the -- full join a recurring rel. And finally, we convert lhs of the full join (t1) into -- an intermediate result too. SELECT COUNT(*) FROM dist_1 t1 FULL JOIN (dist_1 RIGHT JOIN local_1 USING(a)) t2 USING (a); SELECT COUNT(*) FROM dist_1 t1 FULL JOIN (dist_1 RIGHT JOIN citus_local_1 USING(a)) t2 USING (a); -- subqury without FROM SELECT COUNT(*) FROM dist_1 t1 RIGHT JOIN (SELECT generate_series(1,10) AS a) t2 USING (a); -- such semi joins / anti joins are supported too -- reference table SELECT COUNT(*) FROM ref_1 t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE t1.a IN (SELECT a FROM dist_1 t3); -- not supported because we join t3 (inner rel of the anti join) with a column -- of reference table, not with the distribution column of the other distributed -- table (t2) SELECT COUNT(*) FROM ref_1 t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE NOT EXISTS (SELECT * FROM dist_1 t3 WHERE t1.a = a); -- supported because the semi join is performed based on distribution keys -- of the distributed tables SELECT COUNT(*) FROM ref_1 t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE NOT EXISTS (SELECT * FROM dist_1 t3 WHERE t2.a = a); -- values clause SELECT COUNT(*) FROM (SELECT a, b FROM (VALUES (1, 'one'), (2, 'two'), (3, 'three')) as t(a,b)) t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE EXISTS (SELECT * FROM dist_1 t3 WHERE t1.a = a); -- offset in the subquery SELECT COUNT(*) FROM (SELECT * FROM dist_1 OFFSET 0) t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE t1.a IN (SELECT a FROM dist_1 t3); -- local-distributed join as the recurring rel SELECT COUNT(*) FROM (dist_1 RIGHT JOIN local_1 USING(a)) t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE t1.a IN (SELECT a FROM dist_1 t3); -- materialized cte WITH dist_1_materialized AS MATERIALIZED ( SELECT * FROM dist_1 ) SELECT COUNT(*) FROM dist_1_materialized t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE t1.a IN (SELECT a FROM dist_1 t3); WITH dist_1_materialized AS MATERIALIZED ( SELECT * FROM dist_1 ) SELECT COUNT(*) FROM dist_1_materialized t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE EXISTS (SELECT a FROM dist_1 t3 WHERE t3.a = t1.a); -- not supported because we anti-join t3 --inner rel-- with a column -- of t1 (intermediate result) --outer-rel-- WITH dist_1_materialized AS MATERIALIZED ( SELECT * FROM dist_1 ) SELECT COUNT(*) FROM dist_1_materialized t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE NOT EXISTS (SELECT a FROM dist_1 t3 WHERE t3.a = t1.a); -- so this is supported because now t3 is joined with t2, not t1 WITH dist_1_materialized AS MATERIALIZED ( SELECT a AS a_alias, b AS b_alias FROM dist_1 ) SELECT COUNT(*) FROM dist_1_materialized t1 JOIN dist_1 t2 ON (t1.a_alias = t2.a) WHERE NOT EXISTS (SELECT a FROM dist_1 t3 WHERE t3.a = t2.a); WITH dist_1_materialized AS MATERIALIZED ( SELECT a AS a_alias, b AS b_alias FROM dist_1 ) SELECT COUNT(*) FROM dist_1_materialized t1 JOIN dist_1 t2 ON (t1.a_alias = t2.a) WHERE t1.a_alias NOT IN (SELECT a FROM dist_1 t3); -- generate_series() SELECT COUNT(*) FROM (SELECT a FROM generate_series(1, 10) a) t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE t1.a IN (SELECT a FROM dist_1 t3); -- subqury without FROM SELECT COUNT(*) FROM (SELECT generate_series(1,10) AS a) t1 JOIN dist_1 t2 ON (t1.a = t2.a) WHERE t1.a IN (SELECT a FROM dist_1 t3); -- together with correlated subqueries SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN dist_1 t2 USING (a,b) WHERE EXISTS (SELECT * FROM dist_1 t3 WHERE t1.a = t3.a); SELECT COUNT(*) FROM dist_1 t1 RIGHT JOIN ref_1 t2 USING (a,b) WHERE EXISTS (SELECT * FROM dist_1 t3 WHERE t2.a = t3.a); -- "dist_1 t2" can't contribute to result set of the right join with -- a tuple having "(t2.a) a = NULL" because t2 is in the inner side of -- right join. For this reason, Postgres knows that can -- never evaluate to true (because never yields "true") -- and replaces the right join with an inner join. -- And as a result, we can push-down the query without having to go -- through recursive planning. SELECT COUNT(*) FROM dist_1 t1 WHERE EXISTS ( SELECT * FROM dist_1 t2 RIGHT JOIN ref_1 t3 USING (a) WHERE t2.a = t1.a ); -- same here, Postgres converts the left join into an inner one SELECT foo.* FROM ref_1 r1, LATERAL ( SELECT * FROM ref_1 r2 LEFT JOIN dist_1 USING (a) WHERE r1.a > dist_1.b ) as foo; -- Qual is the same but top-level join is an anti-join. Right join -- stays as is and hence requires recursive planning. SELECT COUNT(*) FROM dist_1 t1 WHERE NOT EXISTS ( SELECT * FROM dist_1 t2 RIGHT JOIN ref_1 t3 USING (a) WHERE t2.a = t1.a ); -- This time the semi-join qual is (not <) -- where t3 is the outer rel of the right join. Hence Postgres can't -- replace right join with an inner join and so we recursively plan -- inner side of the right join since the outer side is a recurring -- rel. SELECT COUNT(*) FROM dist_1 t1 WHERE EXISTS ( SELECT * FROM dist_1 t2 RIGHT JOIN ref_1 t3 USING (a) WHERE t3.a = t1.a ); SELECT COUNT(*) FROM dist_1 t1 WHERE NOT EXISTS ( SELECT * FROM dist_1 t2 RIGHT JOIN ref_1 t3 USING (a) WHERE t3.a = t1.a ); -- -- more complex cases -- SELECT COUNT(*) FROM -- 1) right side is distributed but t1 is recurring, hence what -- makes the right side distributed (t3) is recursively planned ref_1 t1 LEFT JOIN (ref_1 t2 RIGHT JOIN dist_1 t3(x,y) ON (t2.a=t3.x)) t5 USING(a) -- 2) outer side of the join tree became recurring, hence t4 is -- recursively planned too LEFT JOIN dist_1 t4 ON (t4.a = t5.a AND t4.b = t5.b) WHERE t4.b IS NULL; SELECT COUNT(*) FROM -- 2) right side is distributed but t1 is recurring, hence what -- makes the right side distributed (t4) is recursively planned ref_1 t1 LEFT JOIN ( dist_1 t4 JOIN -- 1) t6 is recursively planned since the outer side is recurring (SELECT t6.a FROM dist_1 t6 RIGHT JOIN ref_1 t7 USING(a)) t5 USING(a) ) q USING(a) -- 3) outer side of the join tree became recurring, hence t8 is -- recursively planned too LEFT JOIN dist_1 t8 USING (a) WHERE t8.b IS NULL; SELECT COUNT(*) FROM ref_1 t1 -- all distributed tables in the rhs will be recursively planned -- in the order of t3, t4, t5 LEFT JOIN ( ref_1 t2 JOIN dist_1 t3 USING (a) JOIN (dist_1 t4 JOIN dist_1 t5 USING (a)) USING(a) ) USING (a); -- Even if dist_1 and dist_4_different_colocation_group belong to different -- colocation groups, we can run query without doing a repartition join as -- we first decide recursively planning lhs of the right join because rhs -- (ref_1) is a recurring rel. And while doing so, we anyway recursively plan -- the distributed tables in the subjoin tree individually hence the whole join -- tree becomes: -- RIGHT JOIN -- / \ -- intermediate_result_for_dist_1 ref_1 -- JOIN -- intermediate_result_for_dist_4_different_colocation_group -- -- When we decide implementing the optimization noted in -- RecursivelyPlanDistributedJoinNode in an XXX comment, then this query would -- require enabling repartition joins. SELECT COUNT(*) FROM dist_1 JOIN dist_4_different_colocation_group USING(a) RIGHT JOIN ref_1 USING(a); SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN -- 2) t6 subquery is distributed so needs to be recursively planned -- because t1 is recurring ( SELECT * FROM (SELECT * FROM ref_1 t2 JOIN dist_1 t3 USING (a) WHERE t3.b IS NULL) p JOIN -- 1) t5 is recursively planned since the outer side is recurring (SELECT * FROM ref_1 t4 LEFT JOIN dist_1 t5 USING (a)) q USING(a) ) t6 USING (a); -- No need to recursively plan dist_5_with_pkey thanks to -- pkey optimizations done by Postgres. SELECT COUNT(*) FROM ref_1 LEFT JOIN dist_5_with_pkey USING(a); -- Similarly, implies that "dist_1.a" cannot be NULL -- and hence Postgres converts the LEFT JOIN into an INNER JOIN form. -- For this reason, we don't need to recursively plan dist_1. SELECT COUNT(*) FROM ref_1 LEFT JOIN dist_1 USING(a) WHERE dist_1.a IN (1,4); SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN -- 2) t6 subquery is distributed so needs to be recursively planned -- because t1 is recurring ( SELECT * FROM (SELECT * FROM ref_1 t2 JOIN dist_3_partitioned t3 USING (a) WHERE t3.b IS NULL) p JOIN -- 1) t5 is recursively planned since the outer side is recurring (SELECT * FROM ref_1 t4 LEFT JOIN dist_1 t5 USING (a)) q USING(a) ) t6 USING (a); SELECT COUNT(t1.a), t1.b FROM ref_1 t1 LEFT JOIN -- 2) t6 subquery is distributed so needs to be recursively planned -- because t1 is recurring ( SELECT * FROM (SELECT * FROM ref_1 t2 JOIN dist_3_partitioned t3 USING (a) WHERE t3.b IS NULL) p JOIN -- 1) t5 is recursively planned since the outer side is recurring (SELECT * FROM ref_1 t4 LEFT JOIN dist_1 t5 USING (a)) q USING(a) ) t6 USING (a) GROUP BY (t1.b) HAVING t1.b > 200 ORDER BY 1,2; SELECT COUNT(t1.a), t1.b FROM ref_1 t1 LEFT JOIN -- 2) t6 subquery is distributed so needs to be recursively planned -- because t1 is recurring ( SELECT * FROM (SELECT * FROM ref_1 t2 JOIN dist_3_partitioned t3 USING (a) WHERE t3.b IS NULL) p JOIN -- 1) t5 is recursively planned since the outer side is recurring (SELECT * FROM ref_1 t4 LEFT JOIN dist_1 t5 USING (a)) q USING(a) ) t6 USING (a) GROUP BY (t1.b) HAVING ( EXISTS ( SELECT * FROM ref_1 t6 LEFT JOIN dist_1 t7 USING (a) WHERE t7.b > 10 ) ) ORDER BY 1,2; SELECT COUNT(*) FROM citus_local_1 t1 LEFT JOIN -- 2) t6 subquery is distributed so needs to be recursively planned -- because t1 is first recursively planned ( SELECT * FROM (SELECT * FROM ref_1 t2 JOIN dist_1 t3 USING (a) WHERE t3.b IS NULL) p JOIN -- 1) t5 is recursively planned since the outer side is recurring (SELECT * FROM ref_1 t4 LEFT JOIN dist_1 t5 USING (a)) q USING(a) ) t6 USING (a); SELECT COUNT(*) FROM -- 2) t1 is recursively planned because the outer side (t2) is -- converted into a recurring rel dist_2_columnar t1 RIGHT JOIN ( -- 1) t4 is recursively planned since the outer side is recurring ref_1 t3 LEFT JOIN dist_1 t4 USING(a) ) t2 USING (a); SELECT COUNT(*) FROM -- 3) t1 is recursively planned because the outer side (t2) is -- converted into a recurring rel dist_1 t1 RIGHT JOIN ( -- 2) t6 is recursively planned because now it's part of a distributed -- inner join node that is about to be outer joined with t3 ref_1 t3 LEFT JOIN ( -- 1-a) t4 is recursively planned since the outer side is recurring (ref_1 t5 LEFT JOIN dist_1 t4 USING(a)) JOIN dist_1 t6 USING(a) JOIN -- 1-b) t8 is recursively planned since the outer side is recurring (ref_1 t7 LEFT JOIN dist_1 t8 USING(a)) USING(a) ) USING(a) ) t2 USING (a); SELECT COUNT(*) FROM ref_1 t6 LEFT JOIN ( ref_1 t1 LEFT JOIN ( -- t3 is a distributed join tree so needs to be recursively planned -- because t2 is recurring ref_1 t2 LEFT JOIN (dist_1 t7 JOIN dist_1 t8 USING (a)) t3 USING(a) JOIN ref_1 t5 USING(a) ) USING(a) ) USING(a); SELECT COUNT(*) FROM ref_1 t6 LEFT JOIN ( ref_1 t1 LEFT JOIN ( -- t4 subquery is distributed so needs to be recursively planned -- because t2 is recurring ref_1 t2 LEFT JOIN (SELECT * FROM dist_1 t3) t4 USING(a) JOIN ref_1 t5 USING(a) ) USING(a) ) USING(a); SELECT COUNT(*) FROM ref_1 t6 LEFT JOIN ( ref_1 t1 LEFT JOIN ( -- t4 subquery is distributed so needs to be recursively planned -- because t2 is recurring ref_1 t2 LEFT JOIN (SELECT * FROM dist_3_partitioned t3) t4 USING(a) JOIN ref_1 t5 USING(a) ) USING(a) ) USING(a); -- cannot recursively plan because t3 (inner - distributed) -- references t1 (outer - recurring) SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN LATERAL (SELECT * FROM dist_1 t2 WHERE t1.b < t2.b) t3 USING (a); SELECT COUNT(*) FROM (SELECT * FROM dist_1 OFFSET 100) t1 LEFT JOIN LATERAL (SELECT * FROM dist_1 t2 WHERE t1.b < t2.b) t3 USING (a); SELECT COUNT(*) FROM local_1 t1 LEFT JOIN LATERAL (SELECT * FROM dist_1 t2 WHERE t1.b < t2.b) t3 USING (a); SELECT COUNT(*) FROM (SELECT 1 a, generate_series(1,2) b) t1 LEFT JOIN LATERAL (SELECT * FROM dist_1 t2 WHERE t1.b < t2.b) t3 USING (a); SELECT COUNT(*) FROM (ref_1 t10 JOIN ref_1 t11 USING(a,b)) t1 LEFT JOIN LATERAL (SELECT * FROM dist_1 t2 WHERE t1.b < t2.b) t3 USING (a); -- cannot plan because the query in the WHERE clause of t3 -- (inner - distributed) references t1 (outer - recurring) SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN LATERAL ( SELECT * FROM dist_1 t2 WHERE EXISTS ( SELECT * FROM dist_1 t4 WHERE t4.a = t2.a AND t4.b > t1.b ) ) t3 USING (a); -- can recursively plan after dropping (t4.b > t1.b) qual from t3 SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN ( SELECT * FROM dist_1 t2 WHERE EXISTS ( SELECT * FROM dist_1 t4 WHERE t4.a = t2.a ) ) t3 USING (a); -- same test using a view, can be recursively planned CREATE VIEW my_view_1 AS SELECT * FROM dist_1 table_name_for_view WHERE EXISTS ( SELECT * FROM dist_1 t4 WHERE t4.a = table_name_for_view.a); SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN my_view_1 t3 USING (a); SELECT COUNT(*) FROM ref_1 t6 LEFT JOIN ( ref_1 t1 LEFT JOIN ( -- t4 subquery is distributed so needs to be recursively planned -- because t2 is recurring. -- However, we fail to recursively plan t4 because it references -- t6. ref_1 t2 LEFT JOIN LATERAL (SELECT * FROM dist_2_columnar t3 WHERE t3.a > t6.a) t4 USING(a) JOIN ref_1 t5 USING(a) ) USING(a) ) USING(a); SELECT COUNT(*) FROM ref_1 t6 LEFT JOIN ( ref_1 t1 LEFT JOIN ( -- t4 subquery is distributed so needs to be recursively planned -- because t2 is recurring. -- Even if the query says t2 is lateral joined with t4, t4 doesn't -- reference anywhere else and hence can be planned recursively. ref_1 t2 LEFT JOIN LATERAL (SELECT * FROM dist_1 t3) t4 USING(a) JOIN ref_1 t5 USING(a) ) USING(a) ) USING(a); -- since t1 is recurring and t6 is distributed, all the distributed -- tables in t6 will be recursively planned SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN ( ((SELECT * FROM ref_1 WHERE a > 1) t2 JOIN dist_1 t3 USING (a)) JOIN (dist_1 t4 JOIN dist_1 t5 USING (a)) USING(a) ) t6 USING (a); BEGIN; -- same test but this time should fail due to -- citus.max_intermediate_result_size SET LOCAL citus.max_intermediate_result_size TO "0.5kB"; SELECT COUNT(*) FROM ref_1 t1 LEFT JOIN ( ((SELECT * FROM ref_1 WHERE a > 1) t2 JOIN dist_1 t3 USING (a)) JOIN (dist_1 t4 JOIN dist_1 t5 USING (a)) USING(a) ) t6 USING (a); ROLLBACK; -- Same test using some views, can be recursively planned too. -- Since t1 is recurring and t6 is distributed, all the distributed -- tables in t6 will be recursively planned. CREATE VIEW my_view_2 AS (SELECT * FROM ref_1 WHERE a > 1); CREATE VIEW my_view_3 AS (SELECT * FROM ref_1); SELECT COUNT(*) FROM my_view_3 t1 LEFT JOIN ( (my_view_2 t2 JOIN dist_1 t3 USING (a)) JOIN (dist_1 t4 JOIN dist_1 t5 USING (a)) USING(a) ) t6 USING (a); SELECT COUNT(*) FROM ref_1 t1 -- 2) Since t8 is distributed and t1 is recurring, t8 needs be converted -- to a recurring rel too. For this reason, subquery t8 is recursively -- planned because t7 is recurring already. LEFT JOIN ( SELECT * FROM (SELECT * FROM ref_1 t2 RIGHT JOIN dist_1 t3 USING (a)) AS t4 JOIN -- 1) subquery t6 is recursively planned because t5 is recurring (SELECT * FROM ref_1 t5 LEFT JOIN (SELECT * FROM dist_2_columnar WHERE b < 150) t6 USING (a)) as t7 USING(a) ) t8 USING (a); -- same test using a prepared statement PREPARE recurring_outer_join_p1 AS SELECT COUNT(*) FROM ref_1 t1 -- 2) Since t8 is distributed and t1 is recurring, t8 needs be converted -- to a recurring rel too. For this reason, subquery t8 is recursively -- planned because t7 is recurring already. LEFT JOIN ( SELECT * FROM (SELECT * FROM ref_1 t2 RIGHT JOIN dist_1 t3 USING (a)) AS t4 JOIN -- 1) subquery t6 is recursively planned because t5 is recurring (SELECT * FROM ref_1 t5 LEFT JOIN (SELECT * FROM dist_2_columnar WHERE b < $1) t6 USING (a)) as t7 USING(a) ) t8 USING (a); EXECUTE recurring_outer_join_p1(0); EXECUTE recurring_outer_join_p1(100); EXECUTE recurring_outer_join_p1(100); EXECUTE recurring_outer_join_p1(10); EXECUTE recurring_outer_join_p1(10); EXECUTE recurring_outer_join_p1(1000); EXECUTE recurring_outer_join_p1(1000); -- t5 is recursively planned because the outer side of the final -- left join is recurring SELECT * FROM ref_1 t1 JOIN ref_1 t2 USING (a) LEFT JOIN ref_1 t3 USING (a) LEFT JOIN ref_1 t4 USING (a) LEFT JOIN dist_1 t5 USING (a) ORDER BY 1,2,3,4,5,6 DESC LIMIT 5; -- t6 is recursively planned because the outer side of the final -- left join is recurring SELECT * FROM (SELECT * FROM ref_1 ORDER BY 1,2 LIMIT 7) t1 JOIN ref_1 t2 USING (a) LEFT JOIN (SELECT *, random() > 1 FROM dist_1 t3) t4 USING (a) LEFT JOIN ref_1 t5 USING (a) LEFT JOIN dist_1 t6 USING (a) ORDER BY 1,2,3,4,5,6,7 DESC LIMIT 10; -- -- Such join rels can recursively appear anywhere in the query instead -- of simple relation rtes. -- SELECT COUNT(*) FROM (SELECT ref_1.a, t10.b FROM ref_1 LEFT JOIN dist_1 t10 USING(b)) AS t1, (SELECT ref_1.a, t20.b FROM ref_1 LEFT JOIN dist_1 t20 USING(b)) AS t2, (SELECT ref_1.a, t30.b FROM ref_1 LEFT JOIN dist_1 t30 USING(b)) AS t3, (SELECT ref_1.a, t40.b FROM ref_1 LEFT JOIN dist_1 t40 USING(b)) AS t4, (SELECT ref_1.a, t50.b FROM ref_1 LEFT JOIN dist_1 t50 USING(b)) AS t5 WHERE t1.a = t5.a AND t1.a = t4.a AND t1.a = t3.a AND t1.a = t2.a AND t1.a = t1.a; -- subqueries in the target list SELECT t1.b, (SELECT b FROM ref_1 WHERE t1.a = a ORDER BY a,b LIMIT 1), (SELECT t2.a) FROM ref_1 LEFT JOIN dist_1 t1 USING (a,b) JOIN dist_1 t2 USING (a,b) ORDER BY 1,2,3 LIMIT 5; WITH outer_cte_1 AS ( SELECT t1.b, -- 9) t3 is recursively planned since t2 is recurring (SELECT a FROM ref_1 t2 LEFT JOIN dist_1 t3 USING(a,b) WHERE t2.a=t1.a ORDER BY 1 LIMIT 1) FROM dist_1 t1 ORDER BY 1,2 LIMIT 10 ), outer_cte_2 AS ( SELECT * FROM ( SELECT * FROM ( SELECT * FROM ( SELECT * FROM ( SELECT * FROM ( -- 10) t5 is recursively planned since t4 is recurring SELECT * FROM ref_1 t4 LEFT JOIN dist_1 t5 USING(a,b) ) AS t6 ) AS t7 ) AS t8 ) AS t9 OFFSET 0 )AS t10 -- 11) t11 is recursively planned since lhs of the join tree became recurring LEFT JOIN dist_1 t11 USING (b) ) SELECT * FROM ref_1 t36 WHERE (b,100,a) IN ( WITH cte_1 AS ( WITH cte_1_inner_cte AS ( -- 3) t12 is recursively planned because t11 is recurring SELECT * FROM ref_1 t11 LEFT JOIN dist_1 t12 USING (a,b) ) -- 4) t14 is recursively planned because t13 is recurring SELECT * FROM ref_1 t13 LEFT JOIN dist_1 t14 USING (a,b) JOIN cte_1_inner_cte t15 USING (a,b) OFFSET 0 ) -- 6) t31 is recursively planned since t35 is recurring -- 7) t34 is recursively planned since lhs of the join tree is now recurring SELECT DISTINCT t31.b, -- 1) we first search for such joins in the target list and recursively plan t33 -- because t32 is recurring (SELECT max(b) FROM ref_1 t32 LEFT JOIN dist_1 t33 USING(a,b) WHERE t31.a = t32.a), (SELECT t34.a) FROM ref_1 t35 LEFT JOIN dist_1 t31 USING (a,b) LEFT JOIN dist_1 t34 USING (a,b) -- 2) cte_1 was inlided, so we then recursively check for such joins there. -- When doing so, we first check for cte_1_inner_cte was since it was -- also inlined. LEFT JOIN cte_1 USING (a,b) -- 5) Since rhs of below join is a subquery too, we recursively search -- for such joins there and plan distributed side of all those 10 -- joins. LEFT JOIN ( SELECT COUNT(DISTINCT t20.a) AS a FROM (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t20, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t21, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t22, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t23, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t24, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t25, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t26, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t27, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t28, (SELECT r.a, d.b FROM ref_1 r LEFT JOIN dist_1 d USING(b) WHERE r.a IS NOT NULL) AS t29 WHERE t20.a = t29.a AND t20.a = t28.a AND t20.a = t27.a AND t20.a = t26.a AND t20.a = t25.a AND t20.a = t24.a AND t20.a = t23.a AND t20.a = t21.a AND t20.a = t21.a AND t20.a = t20.a ) AS t30 ON (t30.a = cte_1.a) ORDER BY 1,2,3 ) AND -- 8) Then we search for such joins in the next (and final) qual of the WHERE clause. -- Since both outer_cte_1 and outer_cte_2 were inlined, we will first -- recursively check for such joins in them. a NOT IN (SELECT outer_cte_1.b FROM outer_cte_1 LEFT JOIN outer_cte_2 USING (b)); WITH cte_1 AS ( SELECT COUNT(*) FROM dist_1 t1 JOIN ( ( dist_1 t2 JOIN dist_1 t3 USING (a) ) JOIN ( dist_1 t4 JOIN ( dist_1 t5 JOIN ( dist_1 t6 JOIN ( ref_1 t7 LEFT JOIN dist_1 t8 USING (a) ) USING(a) ) USING(a) ) USING (a) ) USING(a) ) USING (a) ), cte_2 AS ( SELECT COUNT(*) FROM dist_1 t9 JOIN ( ( dist_1 t10 JOIN dist_1 t11 USING (a) ) JOIN ( dist_1 t12 JOIN ( dist_1 t13 JOIN ( dist_1 t14 JOIN ( ref_1 t15 LEFT JOIN dist_1 t16 USING (a) ) USING(a) ) USING(a) ) USING (a) ) USING(a) ) USING (a) ) SELECT * FROM cte_1, cte_2; -- such joins can appear within SET operations too SELECT COUNT(*) FROM -- 2) given that the rhs of the right join is recurring due to set -- operation, t1 is recursively planned too dist_1 t1 RIGHT JOIN ( SELECT * FROM dist_1 t2 UNION ( -- 1) t3 is recursively planned because t4 is recurring SELECT t3.a, t3.b FROM dist_1 t3 FULL JOIN ref_1 t4 USING (a) ) ) t5 USING(a); -- simple modification queries CREATE TABLE dist_5 (LIKE dist_1); INSERT INTO dist_5 SELECT * FROM dist_1 WHERE a < 5; SELECT create_distributed_table('dist_5', 'a'); BEGIN; DELETE FROM dist_5 USING ( SELECT t1.a, t1.b FROM ref_1 t1 LEFT JOIN ( SELECT * FROM dist_1 t2 WHERE EXISTS ( SELECT * FROM dist_1 t4 WHERE t4.a = t2.a ) ) t3 USING (a) ) q WHERE dist_5.a = q.a RETURNING *; ROLLBACK; BEGIN; UPDATE dist_5 SET b = 10 WHERE a IN ( SELECT t1.a FROM ref_1 t1 LEFT JOIN ( SELECT * FROM dist_1 t2 WHERE EXISTS ( SELECT * FROM dist_1 t4 WHERE t4.a = t2.a ) ) t3 USING (a) ) RETURNING *; ROLLBACK; -- INSERT .. SELECT: pull to coordinator BEGIN; DELETE FROM ref_1 WHERE a IS NULL; INSERT INTO dist_1 SELECT t1.* FROM ref_1 t1 LEFT JOIN dist_1 t2 ON (t1.a = t2.a); ROLLBACK; -- INSERT .. SELECT: repartitioned (due to ) BEGIN; INSERT INTO dist_1 SELECT t1.a*3, t1.b FROM dist_1 t1 JOIN (ref_1 t2 LEFT JOIN dist_1 t3 USING(a)) t4 ON (t1.a = t4.a); ROLLBACK; -- INSERT .. SELECT: repartitioned -- should be able to push-down once https://github.com/citusdata/citus/issues/6544 is fixed BEGIN; INSERT INTO dist_1 SELECT t1.* FROM dist_1 t1 JOIN (ref_1 t2 LEFT JOIN dist_1 t3 USING(a)) t4 ON (t1.a = t4.a); ROLLBACK; SET client_min_messages TO ERROR; DROP SCHEMA recurring_outer_join CASCADE;