From c49077d5949b34786884152779f23b4db3127275 Mon Sep 17 00:00:00 2001 From: Onur Tirtir Date: Mon, 19 Oct 2020 16:58:11 +0300 Subject: [PATCH] Disallow outer joins `ON TRUE` with ref & dist tables when ref table is outer relation (#4255) Disallow `ON TRUE` outer joins with reference & distributed tables when reference table is outer relation by fixing the logic bug made when calling `LeftListIsSubset` function. Also, be more defensive when removing duplicate join restrictions when join clause is empty for non-inner joins as they might still contain useful information for non-inner joins. --- .../distributed/planner/distributed_planner.c | 3 + .../relation_restriction_equivalence.c | 38 +++- src/include/distributed/distributed_planner.h | 1 + .../regress/expected/sqlancer_failures.out | 171 ++++++++++++++++++ src/test/regress/sql/sqlancer_failures.sql | 131 ++++++++++++++ 5 files changed, 334 insertions(+), 10 deletions(-) diff --git a/src/backend/distributed/planner/distributed_planner.c b/src/backend/distributed/planner/distributed_planner.c index cc83c66b6..ef4da8f0d 100644 --- a/src/backend/distributed/planner/distributed_planner.c +++ b/src/backend/distributed/planner/distributed_planner.c @@ -1744,6 +1744,8 @@ multi_join_restriction_hook(PlannerInfo *root, */ joinRestrictionContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin || extra->sjinfo->jointype == JOIN_SEMI; + joinRestrictionContext->hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin && + extra->sjinfo->jointype == JOIN_INNER; MemoryContextSwitchTo(oldMemoryContext); } @@ -2133,6 +2135,7 @@ CreateAndPushPlannerRestrictionContext(void) /* we'll apply logical AND as we add tables */ plannerRestrictionContext->relationRestrictionContext->allReferenceTables = true; + plannerRestrictionContext->joinRestrictionContext->hasOnlyInnerJoin = true; plannerRestrictionContextList = lcons(plannerRestrictionContext, plannerRestrictionContextList); diff --git a/src/backend/distributed/planner/relation_restriction_equivalence.c b/src/backend/distributed/planner/relation_restriction_equivalence.c index 199a6cd8a..47dc1da3d 100644 --- a/src/backend/distributed/planner/relation_restriction_equivalence.c +++ b/src/backend/distributed/planner/relation_restriction_equivalence.c @@ -1906,7 +1906,12 @@ FilterJoinRestrictionContext(JoinRestrictionContext *joinRestrictionContext, Rel } } - /* the filtered restriction might not have semiJoin, but it is OK for now */ + /* + * No need to re calculate has join fields as we are still operating on + * the same query and as these values are calculated per-query basis. + */ + filtererdJoinRestrictionContext->hasOnlyInnerJoin = + joinRestrictionContext->hasOnlyInnerJoin; filtererdJoinRestrictionContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin; return filtererdJoinRestrictionContext; @@ -2028,7 +2033,11 @@ RemoveDuplicateJoinRestrictions(JoinRestrictionContext *joinRestrictionContext) lappend(filteredContext->joinRestrictionList, joinRestriction); } - /* the filtered restriction might not have semiJoin, but it is OK for now */ + /* + * No need to re calculate has join fields as we are still operating on + * the same query and as these values are calculated per-query basis. + */ + filteredContext->hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin; filteredContext->hasSemiJoin = joinRestrictionContext->hasSemiJoin; return filteredContext; @@ -2062,21 +2071,30 @@ ContextCoversJoinRestriction(JoinRestrictionContext *joinRestrictionContext, continue; } + List *joinRestrictInfoListInTest = + joinRestrictionInTest->joinRestrictInfoList; + bool hasJoinRestriction = list_length(joinRestrictInfoListInTest) > 0; + bool hasOnlyInnerJoin = joinRestrictionContext->hasOnlyInnerJoin; + if (!hasOnlyInnerJoin && !hasJoinRestriction) + { + /* + * If join doesn't have a restriction (e.g., ON (true)) and planner + * is aware of at least one non-inner JOIN (e.g., outer/semi joins), + * we should not eliminiate joinRestrictionInTest. It can still be + * useful for detecting not supported outer-join checks even if it + * doesn't help for colocation checks. + */ + continue; + } + /* * We check whether the restrictions in joinRestrictionInTest is a subset * of the restrictions in joinRestrictionInContext in the sense that all the * restrictions in the latter already exists in the former. - * - * Also, note that list_difference() returns a list that contains all the - * cells in joinRestrictInfoList that are not in inputJoinRestrictInfoList. - * Finally, each element in these lists is a pointer to RestrictInfo - * structure, where equal() function is implemented for the struct. */ List *joinRestrictInfoListInContext = joinRestrictionInContext->joinRestrictInfoList; - List *joinRestrictInfoListInTest = - joinRestrictionInTest->joinRestrictInfoList; - if (LeftListIsSubset(joinRestrictInfoListInContext, joinRestrictInfoListInTest)) + if (LeftListIsSubset(joinRestrictInfoListInTest, joinRestrictInfoListInContext)) { return true; } diff --git a/src/include/distributed/distributed_planner.h b/src/include/distributed/distributed_planner.h index 2b5836789..a0701fd9a 100644 --- a/src/include/distributed/distributed_planner.h +++ b/src/include/distributed/distributed_planner.h @@ -76,6 +76,7 @@ typedef struct JoinRestrictionContext { List *joinRestrictionList; bool hasSemiJoin; + bool hasOnlyInnerJoin; } JoinRestrictionContext; typedef struct JoinRestriction diff --git a/src/test/regress/expected/sqlancer_failures.out b/src/test/regress/expected/sqlancer_failures.out index 2cb3de1cd..e95995a5d 100644 --- a/src/test/regress/expected/sqlancer_failures.out +++ b/src/test/regress/expected/sqlancer_failures.out @@ -151,5 +151,176 @@ SELECT ALL t7.c1, t7.c0, t8.c1, t10.c1, t8.c0 FROM t7 CROSS JOIN t10 FULL OUTER ) AS foo; ERROR: cannot pushdown the subquery DETAIL: There exist a reference table in the outer part of the outer join +CREATE TABLE reference_table(id int, it_name varchar(25), k_no int); +SELECT create_reference_table('reference_table'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +CREATE TABLE distributed_table(user_id int, item_id int, buy_count int); +SELECT create_distributed_table('distributed_table', 'user_id'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- postgres plans below queries by evaluating joins as below: +-- L +-- / \ +-- ref L +-- / \ +-- dist ref +-- so we should error out as reference table is in the outer part of the top level (left) outer join +SELECT count(*) FROM distributed_table a +LEFT JOIN reference_table b ON (true) +RIGHT JOIN reference_table c ON (true); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +SELECT count(*) FROM distributed_table a +LEFT JOIN reference_table b ON (true) +RIGHT JOIN reference_table c ON (c.id > 0); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- drop existing sqlancer tables before next tests +DROP TABLE t0, t1, t2, t3, t4 CASCADE; +CREATE TABLE IF NOT EXISTS t0(c0 TEXT CHECK (TRUE), c1 money ) WITH (autovacuum_vacuum_threshold=1180014707, autovacuum_freeze_table_age=13771154, autovacuum_vacuum_cost_delay=23, autovacuum_analyze_threshold=1935153914, autovacuum_freeze_min_age=721733768, autovacuum_enabled=0, autovacuum_vacuum_cost_limit=9983); +CREATE UNLOGGED TABLE IF NOT EXISTS t1(LIKE t0); +CREATE TABLE t2(LIKE t0 INCLUDING INDEXES); +CREATE UNLOGGED TABLE t3(LIKE t0 EXCLUDING STATISTICS); +CREATE TABLE t4(LIKE t1); +SELECT create_distributed_table('t0', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_reference_table('t1'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('t2', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_distributed_table('t3', 'c0'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +SELECT create_reference_table('t4'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +-- whole join tree for below query is: +-- L +-- / \ +-- t1(ref) L +-- / \ +-- t0(dist) t4(ref) +-- -- so we should error out +SELECT count(*) FROM ( +SELECT ALL t4.c1, t0.c0, t0.c1 FROM ONLY t0 + LEFT OUTER JOIN t4 ON CAST(masklen('142.158.96.44') AS BOOLEAN) + RIGHT OUTER JOIN t1 ON ((0.024767844)::MONEY) BETWEEN (t1.c1) AND (CAST(0.0602135 AS MONEY)) +) AS foo; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- first subquery has the same join tree as above, so we should error out +SELECT count(*) FROM ( +SELECT ALL t4.c1, t0.c0, t0.c1 FROM ONLY t0 + LEFT OUTER JOIN t4 ON CAST(masklen('142.158.96.44') AS BOOLEAN) + RIGHT OUTER JOIN t1 ON (CAST(0.024767844 AS MONEY)) BETWEEN (t1.c1) AND (CAST(0.0602135 AS MONEY)) + WHERE NOT (((t0.c0)LIKE((t4.c0)))) +UNION ALL SELECT t4.c1, t0.c0, t0.c1 FROM ONLY t0 + LEFT OUTER JOIN t4 ON CAST(masklen('142.158.96.44') AS BOOLEAN) + RIGHT OUTER JOIN t1 ON ((0.024767844)::MONEY) BETWEEN (t1.c1) AND (CAST(0.0602135 AS MONEY)) + WHERE NOT (NOT (((t0.c0)LIKE((t4.c0))))) +UNION ALL SELECT ALL t4.c1, t0.c0, t0.c1 FROM ONLY t0 + LEFT OUTER JOIN t4 ON (masklen('142.158.96.44'))::BOOLEAN + RIGHT OUTER JOIN t1 ON ((0.024767844)::MONEY) BETWEEN (t1.c1) AND ((0.0602135)::MONEY) + WHERE (NOT (((t0.c0)LIKE((t4.c0))))) ISNULL +) AS foo; +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- unsupported outer JOIN inside a subquery in WHERE clause +SELECT * FROM distributed_table WHERE buy_count > ( +SELECT count(*) FROM distributed_table a +LEFT JOIN reference_table b ON (true) +RIGHT JOIN reference_table c ON (false)); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- unsupported outer JOIN via subqueries +SELECT count(*) FROM (SELECT *, random() FROM distributed_table) AS a +LEFT JOIN (SELECT *, random() FROM reference_table) AS b ON (true) +RIGHT JOIN (SELECT *, random() FROM reference_table) AS c ON (false); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- unsupported outer JOIN in a sublevel subquery +SELECT + count(*) +FROM + ( + SELECT a.* FROM distributed_table a JOIN distributed_table b USING (user_id) + ) AS bar +JOIN + ( + SELECT a.* FROM distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (true) + ) AS unsupported_join +ON (true); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- unsupported outer JOIN in a sublevel INNER JOIN +SELECT + unsupported_join.* +FROM + (distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (true)) as unsupported_join (x,y,z,t,e,f,q) +JOIN + (reference_table d JOIN reference_table e ON(true)) ON (true); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- unsupported outer JOIN in a sublevel LEFT JOIN +SELECT + unsupported_join.* +FROM + (distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (true)) as unsupported_join +LEFT JOIN + (reference_table d JOIN reference_table e ON(true)) ON (true); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +-- unsupported outer JOIN in a sublevel RIGHT JOIN +SELECT + unsupported_join.* +FROM + (distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (false)) as unsupported_join +RIGHT JOIN + (reference_table d JOIN reference_table e ON(true)) ON (true); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join +EXPLAIN SELECT + unsupported_join.* +FROM + (distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (true)) as unsupported_join (x,y,z,t,e,f,q) +JOIN + (reference_table d JOIN reference_table e ON(true)) ON (d.id > 0); +ERROR: cannot pushdown the subquery +DETAIL: There exist a reference table in the outer part of the outer join SET client_min_messages TO WARNING; DROP SCHEMA sqlancer_failures CASCADE; diff --git a/src/test/regress/sql/sqlancer_failures.sql b/src/test/regress/sql/sqlancer_failures.sql index 57531ea84..dd295e725 100644 --- a/src/test/regress/sql/sqlancer_failures.sql +++ b/src/test/regress/sql/sqlancer_failures.sql @@ -58,5 +58,136 @@ SELECT count(*) FROM ( SELECT ALL t7.c1, t7.c0, t8.c1, t10.c1, t8.c0 FROM t7 CROSS JOIN t10 FULL OUTER JOIN t8 ON (((((((('[832125354,1134163512)'::int4range)*('(0,2106623281)'::int4range)))-('(-600267905,509840582]'::int4range)))*('(-365203965,1662828182)'::int4range)))&<((((((('(-1286467417,697584012]'::int4range)*('[-1691485781,1341103963)'::int4range)))*((('(-1768368435,1719707648)'::int4range)*('(139536997,1275813540]'::int4range)))))*((((('[-2103910157,-1961746758)'::int4range)*('[-834534078,533073939)'::int4range)))*((('[-1030552151,552856781]'::int4range)*('[-1109419376,1205173697]'::int4range)))))))) ) AS foo; +CREATE TABLE reference_table(id int, it_name varchar(25), k_no int); +SELECT create_reference_table('reference_table'); + +CREATE TABLE distributed_table(user_id int, item_id int, buy_count int); +SELECT create_distributed_table('distributed_table', 'user_id'); + +-- postgres plans below queries by evaluating joins as below: +-- L +-- / \ +-- ref L +-- / \ +-- dist ref +-- so we should error out as reference table is in the outer part of the top level (left) outer join + +SELECT count(*) FROM distributed_table a +LEFT JOIN reference_table b ON (true) +RIGHT JOIN reference_table c ON (true); + +SELECT count(*) FROM distributed_table a +LEFT JOIN reference_table b ON (true) +RIGHT JOIN reference_table c ON (c.id > 0); + +-- drop existing sqlancer tables before next tests +DROP TABLE t0, t1, t2, t3, t4 CASCADE; + +CREATE TABLE IF NOT EXISTS t0(c0 TEXT CHECK (TRUE), c1 money ) WITH (autovacuum_vacuum_threshold=1180014707, autovacuum_freeze_table_age=13771154, autovacuum_vacuum_cost_delay=23, autovacuum_analyze_threshold=1935153914, autovacuum_freeze_min_age=721733768, autovacuum_enabled=0, autovacuum_vacuum_cost_limit=9983); +CREATE UNLOGGED TABLE IF NOT EXISTS t1(LIKE t0); +CREATE TABLE t2(LIKE t0 INCLUDING INDEXES); +CREATE UNLOGGED TABLE t3(LIKE t0 EXCLUDING STATISTICS); +CREATE TABLE t4(LIKE t1); + +SELECT create_distributed_table('t0', 'c0'); +SELECT create_reference_table('t1'); +SELECT create_distributed_table('t2', 'c0'); +SELECT create_distributed_table('t3', 'c0'); +SELECT create_reference_table('t4'); + +-- whole join tree for below query is: +-- L +-- / \ +-- t1(ref) L +-- / \ +-- t0(dist) t4(ref) +-- -- so we should error out +SELECT count(*) FROM ( +SELECT ALL t4.c1, t0.c0, t0.c1 FROM ONLY t0 + LEFT OUTER JOIN t4 ON CAST(masklen('142.158.96.44') AS BOOLEAN) + RIGHT OUTER JOIN t1 ON ((0.024767844)::MONEY) BETWEEN (t1.c1) AND (CAST(0.0602135 AS MONEY)) +) AS foo; + +-- first subquery has the same join tree as above, so we should error out +SELECT count(*) FROM ( +SELECT ALL t4.c1, t0.c0, t0.c1 FROM ONLY t0 + LEFT OUTER JOIN t4 ON CAST(masklen('142.158.96.44') AS BOOLEAN) + RIGHT OUTER JOIN t1 ON (CAST(0.024767844 AS MONEY)) BETWEEN (t1.c1) AND (CAST(0.0602135 AS MONEY)) + WHERE NOT (((t0.c0)LIKE((t4.c0)))) +UNION ALL SELECT t4.c1, t0.c0, t0.c1 FROM ONLY t0 + LEFT OUTER JOIN t4 ON CAST(masklen('142.158.96.44') AS BOOLEAN) + RIGHT OUTER JOIN t1 ON ((0.024767844)::MONEY) BETWEEN (t1.c1) AND (CAST(0.0602135 AS MONEY)) + WHERE NOT (NOT (((t0.c0)LIKE((t4.c0))))) +UNION ALL SELECT ALL t4.c1, t0.c0, t0.c1 FROM ONLY t0 + LEFT OUTER JOIN t4 ON (masklen('142.158.96.44'))::BOOLEAN + RIGHT OUTER JOIN t1 ON ((0.024767844)::MONEY) BETWEEN (t1.c1) AND ((0.0602135)::MONEY) + WHERE (NOT (((t0.c0)LIKE((t4.c0))))) ISNULL +) AS foo; + +-- unsupported outer JOIN inside a subquery in WHERE clause +SELECT * FROM distributed_table WHERE buy_count > ( +SELECT count(*) FROM distributed_table a +LEFT JOIN reference_table b ON (true) +RIGHT JOIN reference_table c ON (false)); + +-- unsupported outer JOIN via subqueries +SELECT count(*) FROM (SELECT *, random() FROM distributed_table) AS a +LEFT JOIN (SELECT *, random() FROM reference_table) AS b ON (true) +RIGHT JOIN (SELECT *, random() FROM reference_table) AS c ON (false); + +-- unsupported outer JOIN in a sublevel subquery +SELECT + count(*) +FROM + ( + SELECT a.* FROM distributed_table a JOIN distributed_table b USING (user_id) + ) AS bar +JOIN + ( + SELECT a.* FROM distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (true) + ) AS unsupported_join +ON (true); + +-- unsupported outer JOIN in a sublevel INNER JOIN +SELECT + unsupported_join.* +FROM + (distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (true)) as unsupported_join (x,y,z,t,e,f,q) +JOIN + (reference_table d JOIN reference_table e ON(true)) ON (true); + +-- unsupported outer JOIN in a sublevel LEFT JOIN +SELECT + unsupported_join.* +FROM + (distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (true)) as unsupported_join +LEFT JOIN + (reference_table d JOIN reference_table e ON(true)) ON (true); + +-- unsupported outer JOIN in a sublevel RIGHT JOIN +SELECT + unsupported_join.* +FROM + (distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (false)) as unsupported_join +RIGHT JOIN + (reference_table d JOIN reference_table e ON(true)) ON (true); + +EXPLAIN SELECT + unsupported_join.* +FROM + (distributed_table a + LEFT JOIN reference_table b ON (true) + RIGHT JOIN reference_table c ON (true)) as unsupported_join (x,y,z,t,e,f,q) +JOIN + (reference_table d JOIN reference_table e ON(true)) ON (d.id > 0); + SET client_min_messages TO WARNING; DROP SCHEMA sqlancer_failures CASCADE;