-- citus/src/test/regress/sql/subquery_in_where.sql

-- ===================================================================
-- test recursive planning functionality with subqueries in WHERE
-- ===================================================================
-- Create a dedicated schema for the objects this file creates and put it
-- first on the search path; public stays on the path as a fallback.
-- NOTE(review): users_table / events_table / users_reference_table are not
-- created in this file -- presumably they resolve through public; confirm
-- against the test schedule.
CREATE SCHEMA subquery_in_where;
SET search_path TO subquery_in_where, public;

-- Surface DEBUG1-level messages for the queries below (set back to DEFAULT
-- further down, before the redundant-WHERE tests).
SET client_min_messages TO DEBUG1;

-- CTEs can be used as a recurring tuple with subqueries in WHERE.
-- The MATERIALIZED CTE over events_table is the only FROM entry of the outer
-- query; the WHERE clause filters it with an IN subquery on users_table.
WITH event_id AS MATERIALIZED (
    SELECT
        user_id AS events_user_id,
        time AS events_time,
        event_type
    FROM
        events_table
)
SELECT
    count(*)
FROM
    event_id
WHERE
    events_user_id IN (
        SELECT
            user_id
        FROM
            users_table
    );

-- Correlated subqueries cannot be used in the WHERE clause.
-- The IN subquery refers to events_time, a column of the outer CTE, and the
-- compared row also contains a random() call.
WITH event_id AS (
    SELECT
        user_id AS events_user_id,
        time AS events_time,
        event_type
    FROM
        events_table
)
SELECT
    count(*)
FROM
    event_id
WHERE
    (events_user_id, random()) IN (
        SELECT
            user_id,
            1
        FROM
            users_table
        WHERE
            users_table.time = events_time
    );

-- Recurring tuples as empty join tree: both UNION ALL branches are
-- constant-only SELECTs without a FROM clause.
SELECT
    *
FROM
    (
        SELECT 1 AS id, 2 AS value_1, 3 AS value_3
        UNION ALL
        SELECT 2 AS id, 3 AS value_1, 4 AS value_3
    ) AS tt1
WHERE
    id IN (
        SELECT
            user_id
        FROM
            events_table
    );

-- Recurring tuples in the FROM clause (a CTE wrapped in a LIMITed subquery)
-- combined with a set operation in the WHERE clause.
SELECT
    count(*)
FROM
    (
        WITH event_id AS (
            SELECT
                user_id AS events_user_id,
                time AS events_time,
                event_type
            FROM
                events_table
        )
        SELECT
            events_user_id,
            events_time,
            event_type
        FROM
            event_id
        ORDER BY
            1, 2, 3
        LIMIT 10
    ) AS sub_table
WHERE
    events_user_id IN (
        (
            SELECT
                user_id
            FROM
                users_table
            ORDER BY
                1
            LIMIT 10
        )
        UNION ALL
        (
            SELECT
                value_1
            FROM
                users_table
            ORDER BY
                1
            LIMIT 10
        )
    );

-- Recurring tuples in the FROM clause as a set operation on recursively
-- plannable queries, with a CTE inside the WHERE clause subquery.
SELECT *
FROM (
    (
        SELECT user_id
        FROM users_table
        ORDER BY user_id ASC
        LIMIT 10
    )
    UNION ALL
    (
        SELECT value_1
        FROM users_table
        ORDER BY value_1 ASC
        LIMIT 10
    )
) AS sub_table
WHERE user_id IN (
    WITH event_id AS (
        SELECT
            user_id AS events_user_id,
            time AS events_time,
            event_type
        FROM events_table
    )
    SELECT events_user_id
    FROM event_id
    ORDER BY events_user_id
    LIMIT 10
);

-- Complex target list in the WHERE clause subquery: the <= ANY subquery
-- returns max(abs(user_id * 1) + mod(user_id, 3)) for every user_id group.
SELECT count(*)
FROM (
    SELECT
        user_id AS events_user_id,
        time AS events_time,
        event_type
    FROM events_table
    ORDER BY 1, 2
    LIMIT 10
) AS sub_table
WHERE events_user_id <= ANY (
    SELECT max(abs(user_id * 1) + mod(user_id, 3)) AS val_1
    FROM users_table
    GROUP BY user_id
);

-- DISTINCT clause in WHERE.
-- The LIMITed FROM subquery is ordered on all three of its output columns,
-- as in the neighbouring tests, so the ten rows it keeps do not depend on
-- the scan order of events_table. The expected output has to be regenerated
-- after this change.
SELECT count(*)
FROM (
    SELECT
        user_id AS events_user_id,
        time AS events_time,
        event_type
    FROM events_table
    ORDER BY 1, 2, 3
    LIMIT 10
) AS sub_table
WHERE events_user_id IN (
    SELECT DISTINCT user_id
    FROM users_table
    GROUP BY user_id
);

-- AND in WHERE clause: both ANY subqueries aggregate users_table.user_id
-- per user_id group.
SELECT count(*)
FROM (
    SELECT
        user_id AS events_user_id,
        time AS events_time,
        event_type
    FROM events_table
    ORDER BY 1, 2, 3
    LIMIT 10
) AS sub_table
WHERE
    events_user_id >= ANY (
        SELECT min(user_id)
        FROM users_table
        GROUP BY user_id
    )
    AND events_user_id <= ANY (
        SELECT max(user_id)
        FROM users_table
        GROUP BY user_id
    );

-- AND in WHERE clause, one part of the AND is pushdownable and the other is
-- not. Compared with the previous query, the second subquery takes
-- max(value_2) instead of max(user_id).
SELECT count(*)
FROM (
    SELECT
        user_id AS events_user_id,
        time AS events_time,
        event_type
    FROM events_table
    ORDER BY 1, 2, 3
    LIMIT 10
) AS sub_table
WHERE
    events_user_id >= ANY (
        SELECT min(user_id)
        FROM users_table
        GROUP BY user_id
    )
    AND events_user_id <= ANY (
        SELECT max(value_2)
        FROM users_table
        GROUP BY user_id
    );

-- Planning subqueries in the WHERE clause of a CTE recursively.
-- The CTE filters a LIMITed users_table subquery with an IN subquery on
-- events_table.value_2.
WITH cte AS (
    SELECT *
    FROM (
        SELECT *
        FROM users_table
        ORDER BY user_id ASC, value_2 DESC
        LIMIT 10
    ) AS sub_table
    WHERE user_id IN (
        SELECT value_2
        FROM events_table
    )
)
SELECT count(*)
FROM cte;

-- Planning a subquery in the WHERE clause of a FROM-clause subquery
-- recursively. Same filter as the CTE test above, expressed as a nested
-- subquery instead of a CTE.
SELECT count(*)
FROM (
    SELECT *
    FROM (
        SELECT *
        FROM users_table
        ORDER BY user_id ASC, value_2 DESC
        LIMIT 10
    ) AS sub_table_1
    WHERE user_id IN (
        SELECT value_2
        FROM events_table
    )
) AS sub_table_2;

-- Recurring table in the FROM clause of a subquery in the FROM clause.
-- The recurring table is created by joining two recurring tables (t1, t2).
SELECT sum(user_id)
FROM (
    SELECT *
    FROM
        (
            SELECT user_id
            FROM users_table
            ORDER BY user_id
            LIMIT 10
        ) AS t1
        INNER JOIN (
            SELECT user_id AS user_id_2
            FROM users_table
            ORDER BY user_id
            LIMIT 10
        ) AS t2
            ON t1.user_id = t2.user_id_2
    WHERE t1.user_id IN (
        SELECT value_2
        FROM events_table
    )
) AS t3
WHERE user_id > ANY (
    SELECT min(user_id)
    FROM events_table
    GROUP BY user_id
);

-- Same example as the query above, but now the rows are checked with EXISTS.
-- The EXISTS subquery only references events_table columns, so it does not
-- depend on t3.
SELECT sum(user_id)
FROM (
    SELECT *
    FROM
        (
            SELECT user_id
            FROM users_table
            ORDER BY user_id
            LIMIT 10
        ) AS t1
        INNER JOIN (
            SELECT user_id AS user_id_2
            FROM users_table
            ORDER BY user_id
            LIMIT 10
        ) AS t2
            ON t1.user_id = t2.user_id_2
    WHERE t1.user_id IN (
        SELECT value_2
        FROM events_table
    )
) AS t3
WHERE EXISTS (
    SELECT 1, 2
    FROM events_table
    WHERE events_table.value_2 = events_table.user_id
);

-- Same query as the one above, yet now the row's NON-existence is checked
-- with NOT EXISTS. Note that the max value_2 of events_table is 5.
SELECT sum(user_id)
FROM (
    SELECT *
    FROM
        (
            SELECT user_id
            FROM users_table
            ORDER BY user_id
            LIMIT 10
        ) AS t1
        INNER JOIN (
            SELECT user_id AS user_id_2
            FROM users_table
            ORDER BY user_id
            LIMIT 10
        ) AS t2
            ON t1.user_id = t2.user_id_2
    WHERE t1.user_id IN (
        SELECT value_2
        FROM events_table
    )
) AS t3
WHERE NOT EXISTS (
    SELECT 1, 2
    FROM events_table
    WHERE events_table.value_2 = events_table.user_id + 6
);

-- Check the existence of a row by comparing it with the result of a subquery
-- in the WHERE clause. Note that the subquery is planned recursively since
-- there is no distributed table in the FROM clause.
SELECT *
FROM (
    SELECT user_id, value_1
    FROM users_table
    ORDER BY user_id ASC, value_1 ASC
    LIMIT 10
) AS t3
WHERE ROW(user_id, value_1) = (
    SELECT min(user_id) + 1, min(user_id) + 1
    FROM events_table
);

-- Recursively plan the subquery in the WHERE clause when the FROM clause has
-- a subquery generated by the generate_series function.
SELECT *
FROM (
    SELECT *
    FROM generate_series(1, 10)
) AS gst
WHERE generate_series IN (
    SELECT value_2
    FROM events_table
)
ORDER BY generate_series ASC;

-- Similar to the test above; now there is also a generate_series call inside
-- the WHERE clause, nested one level below the users_table subquery.
SELECT *
FROM (
    SELECT *
    FROM generate_series(1, 10)
) AS gst
WHERE generate_series IN (
    SELECT user_id
    FROM users_table
    WHERE user_id IN (
        SELECT *
        FROM generate_series(1, 3)
    )
)
ORDER BY generate_series ASC;

-- non-colocated subquery in WHERE clause ANDed with false
SELECT
    count(*)
FROM
    users_table
WHERE
    (
        FALSE
        AND EXISTS (SELECT * FROM events_table)
    );

-- multiple non-colocated subqueries in WHERE clause: an IN subquery ORed
-- with a term that is ANDed with false
SELECT
    count(*)
FROM
    users_table
WHERE
    value_1 IN (
        SELECT
            value_1
        FROM
            users_table
    )
    OR (
        FALSE
        AND EXISTS (SELECT * FROM events_table)
    );

-- multiple non-colocated subqueries in WHERE clause: an IN subquery ANDed
-- with a term that is ANDed with false
SELECT
    count(*)
FROM
    users_table
WHERE
    value_1 IN (
        SELECT
            value_1
        FROM
            users_table
    )
    AND (
        FALSE
        AND EXISTS (SELECT * FROM events_table)
    );

-- non-colocated subquery in WHERE clause ANDed with true
SELECT
    count(*)
FROM
    users_table
WHERE
    (
        TRUE
        AND EXISTS (SELECT * FROM events_table)
    );

-- multiple non-colocated subqueries in WHERE clause: an IN subquery ORed
-- with a bare EXISTS subquery (no boolean constant involved)
SELECT
    count(*)
FROM
    users_table
WHERE
    value_1 IN (
        SELECT
            value_1
        FROM
            users_table
    )
    OR (
        EXISTS (SELECT * FROM events_table)
    );

-- correlated subquery with aggregate in WHERE: the IN subquery sums
-- events_table.user_id for the rows matching the outer users_table.user_id
SELECT *
FROM users_table
WHERE user_id IN (
    SELECT sum(events_table.user_id)
    FROM events_table
    WHERE users_table.user_id = events_table.user_id
);

-- correlated subquery with aggregate in HAVING: same subquery, additionally
-- filtered by HAVING min(value_2) > 2
SELECT *
FROM users_table
WHERE user_id IN (
    SELECT sum(events_table.user_id)
    FROM events_table
    WHERE events_table.user_id = users_table.user_id
    HAVING min(value_2) > 2
);


-- Local tables are also planned recursively, so using one as part of the
-- FROM clause makes the clause recurring.
CREATE TABLE local_table (
    id int,
    value_1 int
);

INSERT INTO local_table (id, value_1)
VALUES
    (1, 1),
    (2, 2);

SELECT *
FROM (
    SELECT *
    FROM local_table
) AS sub_table
WHERE id IN (
    SELECT user_id
    FROM users_table
);

-- Use the local table in the WHERE clause: a LIMITed users_table subquery is
-- filtered by the ids stored in local_table.
SELECT count(*)
FROM (
    SELECT *
    FROM users_table
    ORDER BY user_id
    LIMIT 10
) AS sub_table
WHERE user_id IN (
    SELECT id
    FROM local_table
);

-- basic NOT IN correlated subquery
SELECT count(*)
FROM events_table AS e
WHERE value_2 NOT IN (
    SELECT value_2
    FROM users_table
    WHERE user_id = e.user_id
);

-- correlated subquery with limit
SELECT count(*)
FROM events_table AS e
WHERE value_2 IN (
    SELECT value_2
    FROM users_table
    WHERE user_id = e.user_id
    ORDER BY value_2
    LIMIT 1
);

-- correlated subquery with distinct
SELECT count(*)
FROM events_table AS e
WHERE value_2 IN (
    SELECT DISTINCT (value_3)
    FROM users_table
    WHERE user_id = e.user_id
);

-- correlated subquery with aggregate (scalar comparison instead of IN)
SELECT count(*)
FROM events_table AS e
WHERE value_2 = (
    SELECT max(value_2)
    FROM users_table
    WHERE user_id = e.user_id
);

-- correlated subquery with window function
SELECT count(*)
FROM events_table AS e
WHERE value_2 IN (
    SELECT row_number() OVER ()
    FROM users_table
    WHERE user_id = e.user_id
);

-- correlated subquery with group by on a column of the inner table
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(value_3)
    FROM users_table
    WHERE user_id = e.user_id
    GROUP BY value_2
);

-- NOTE(review): this statement repeats the previous one verbatim -- confirm
-- whether the duplication is intentional.
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(value_3)
    FROM users_table
    WHERE user_id = e.user_id
    GROUP BY value_2
);


-- correlated subquery with group by on a column of the outer query
-- (e.value_2)
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(value_3) AS v
    FROM users_table
    WHERE user_id = e.user_id
    GROUP BY e.value_2
);

-- correlated subquery with having; the HAVING clause compares against an
-- uncorrelated scalar subquery
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(value_3) AS v
    FROM users_table
    WHERE user_id = e.user_id
    GROUP BY e.value_2
    HAVING min(value_3) > (SELECT 1)
);

-- same, but the scalar subquery in HAVING references the outer row
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(value_3) AS v
    FROM users_table
    WHERE user_id = e.user_id
    GROUP BY e.value_2
    HAVING min(value_3) > (SELECT e.value_3)
);

-- nested correlated subquery: the reference to e.user_id sits inside a
-- subquery that is joined to users_reference_table
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(r.value_3) AS v
    FROM users_reference_table AS r
    INNER JOIN (
        SELECT *
        FROM users_table
        WHERE user_id = e.user_id
    ) AS u USING (user_id)
    WHERE u.value_2 > 3
    GROUP BY e.value_2
    HAVING min(r.value_3) > e.value_3
);

-- not co-located correlated subquery: the inner filter compares
-- users_table.value_2 (instead of user_id) with e.user_id
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(r.value_3) AS v
    FROM users_reference_table AS r
    INNER JOIN (
        SELECT *
        FROM users_table
        WHERE value_2 = e.user_id
    ) AS u USING (user_id)
    WHERE u.value_2 > 3
    GROUP BY e.value_2
    HAVING min(r.value_3) > e.value_3
);

-- cartesian correlated subquery: the outer row is referenced only in
-- GROUP BY and HAVING, never in a join or filter on user_id
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(r.value_3) AS v
    FROM users_reference_table AS r
    INNER JOIN users_table AS u USING (user_id)
    WHERE u.value_2 > 3
    GROUP BY e.value_2
    HAVING min(r.value_3) > e.value_3
);

-- even more subtle cartesian correlated subquery: the outer row is
-- referenced only in HAVING
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(r.value_3) AS v
    FROM users_reference_table AS r
    INNER JOIN users_table AS u USING (user_id)
    WHERE u.value_2 > 3
    GROUP BY u.value_2
    HAVING min(r.value_3) > e.value_3
);

-- not a correlated subquery (no reference to e inside), uses recursive
-- planning
SELECT count(*)
FROM events_table AS e
WHERE value_3 IN (
    SELECT min(r.value_3) AS v
    FROM users_reference_table AS r
    INNER JOIN users_table AS u USING (user_id)
    WHERE u.value_2 > 3
    GROUP BY r.value_2
    HAVING min(r.value_3) > 0
);

-- two levels of correlation should also allow a merge step in the subquery;
-- the innermost EXISTS refers to u, two query levels up
SELECT
    sum(value_1)
FROM
    users_table AS u
WHERE
    EXISTS (
        SELECT
            1
        FROM
            events_table AS e
        WHERE
            u.user_id = e.user_id
            AND EXISTS (
                SELECT
                    1
                FROM
                    users_table AS u2
                WHERE
                    u2.user_id = u.user_id
                    AND u2.value_1 = 5
                LIMIT 1
            )
    );

-- correlated subquery in WHERE with a slightly different syntax: the result
-- of the scalar subquery is compared with a constant
SELECT
    sum(value_1)
FROM
    users_table AS u1
WHERE
    (
        SELECT
            count(DISTINCT e1.value_2)
        FROM
            events_table AS e1
        WHERE
            e1.user_id = u1.user_id
    ) > 115;


-- A correlated subquery which requires a merge step can be pushed down on
-- UPDATE/DELETE queries as well. Both statements run inside a transaction
-- that is rolled back, to keep the rest of the tests unchanged.
BEGIN;

UPDATE users_table AS u1
SET value_1 = (
    SELECT count(DISTINCT value_2)
    FROM events_table AS e1
    WHERE e1.user_id = u1.user_id
);

DELETE FROM users_table AS u1
WHERE (
    SELECT count(DISTINCT value_2)
    FROM events_table AS e1
    WHERE e1.user_id = u1.user_id
) > 10;

ROLLBACK;

-- a correlated anti-join can also be pushed down even if the subquery
-- has a LIMIT
SELECT
    avg(value_1)
FROM
    users_table AS u
WHERE
    NOT EXISTS (
        SELECT
            'XXX'
        FROM
            events_table AS e
        WHERE
            u.user_id = e.user_id
            AND e.value_2 > 10000
        LIMIT 1
    );

-- a [correlated] lateral join can also be pushed down even if the subquery
-- has an aggregate without a GROUP BY
SELECT
    max(min_of_val_2),
    max(u1.value_1)
FROM
    users_table AS u1
    LEFT JOIN LATERAL (
        SELECT
            min(e1.value_2) AS min_of_val_2
        FROM
            events_table AS e1
        WHERE
            e1.user_id = u1.user_id
    ) AS foo
        ON (TRUE);


-- a self join is followed by a correlated subquery; only the plan is
-- printed (EXPLAIN without costs)
EXPLAIN (COSTS OFF)
SELECT *
FROM users_table AS u1
INNER JOIN users_table AS u2 USING (user_id)
WHERE
    u1.value_1 < u2.value_1
    AND (
        SELECT count(*)
        FROM events_table AS e1
        WHERE e1.user_id = u2.user_id
    ) > 10;

-- When the colocated join of the FROM clause entries happens in the WHERE
-- clause (here: inside the correlated subquery), Citus cannot push down.
-- It is likely that the colocation checks should be improved.
-- The comma join is deliberate: the join condition must not be in FROM.
SELECT
    u1.user_id,
    u2.user_id
FROM
    users_table AS u1,
    users_table AS u2
WHERE
    u1.value_1 < u2.value_1
    AND (
        SELECT count(*)
        FROM events_table AS e1
        WHERE
            e1.user_id = u2.user_id
            AND u1.user_id = u2.user_id
    ) > 10
ORDER BY 1, 2;


-- create a (temporary) view that contains a correlated subquery
CREATE TEMPORARY VIEW correlated_subquery_view AS
SELECT
    u1.user_id
FROM
    users_table AS u1
WHERE
    (
        SELECT
            count(DISTINCT e1.value_2)
        FROM
            events_table AS e1
        WHERE
            e1.user_id = u1.user_id
    ) > 0;

SELECT
    sum(user_id)
FROM
    correlated_subquery_view;

-- now, join the view with another correlated subquery
SELECT
    sum(mx)
FROM
    correlated_subquery_view
    LEFT JOIN LATERAL (
        SELECT
            max(value_2) AS mx
        FROM
            events_table
        WHERE
            correlated_subquery_view.user_id = events_table.user_id
    ) AS foo
        ON (TRUE);

-- as an edge case, JOIN is on false
SELECT
    sum(mx)
FROM
    correlated_subquery_view
    LEFT JOIN LATERAL (
        SELECT
            max(value_2) AS mx
        FROM
            events_table
        WHERE
            correlated_subquery_view.user_id = events_table.user_id
    ) AS foo
        ON (FALSE);


-- correlated scalar subquery whose own WHERE clause is ANDed with false
SELECT
    sum(value_1)
FROM
    users_table AS u1
WHERE
    (
        SELECT
            count(DISTINCT e1.value_2)
        FROM
            events_table AS e1
        WHERE
            e1.user_id = u1.user_id
            AND FALSE
    ) > 115;

-- same scalar subquery, but here the outer WHERE clause is ANDed with false
SELECT
    sum(value_1)
FROM
    users_table AS u1
WHERE
    (
        SELECT
            count(DISTINCT e1.value_2)
        FROM
            events_table AS e1
        WHERE
            e1.user_id = u1.user_id
    ) > 115
    AND FALSE;

-- Test redundant WHERE clause (fix #7782, #7783)
-- Fixture tables for the statements that follow. After the create_* calls
-- at the end of this block, t1 is a reference table and t3 is a distributed
-- table with distribution column c9; no create_* call is made for t0 and t7,
-- so they remain plain local tables.
CREATE TABLE t0 (vkey int4, pkey int4, c0 timestamp);
CREATE TABLE t1 (vkey int4, pkey int4, c4 timestamp, c5 text, c6 text);
CREATE TABLE t3 (vkey int4, pkey int4, c9 timestamp);
CREATE TABLE t7 (vkey int4, pkey int4);

-- DEBUG messages not needed for these tests
SET client_min_messages TO DEFAULT;

-- One row each for the two local tables; t1 is left empty.
INSERT INTO t0 (vkey, pkey, c0) values
(3, 13000, make_timestamp(2032, 9, 4, 13, 38, 0));

INSERT INTO t7 (vkey, pkey) values
(3, 59525);

SELECT create_reference_table('t1');
SELECT create_distributed_table('t3', 'c9');

-- Local table update with a redundant WHERE clause: the IN subquery (a
-- UNION ALL over t7 and the distributed table t3, both filtered by
-- "where false") is ORed with TRUE, so the predicate holds for every row.
UPDATE t0 set vkey = 117
where (((t0.pkey) in (select t7.vkey from t7 where false
        union all
        select t3.pkey from t3 where false
        )))
  or TRUE;

-- Local table t0 is updated
SELECT vkey, pkey, c0 FROM t0;

-- MERGE command with redundant join can be planned locally
-- (only the plan is shown; the join condition is again "<IN subquery> or
-- TRUE", this time with the reference table t1 in the subquery)
EXPLAIN (costs off, timing off)
MERGE INTO t0 USING t7 ON
 (((t0.pkey) in (select t7.vkey from t7 where false
        union all
        select t1.pkey from t1 where false
        )))
  or TRUE
WHEN MATCHED THEN
   UPDATE SET vkey = 113;

-- UPDATE via MERGE with redundant join clause:
-- the same statement as above, now actually executed.
MERGE INTO t0 USING t7 ON
 (((t0.pkey) in (select t7.vkey from t7 where false
        union all
        select t1.pkey from t1 where false
        )))
  or TRUE
WHEN MATCHED THEN
   UPDATE SET vkey = 113;

-- Local table t0 is updated
SELECT vkey, pkey, c0 FROM t0;

-- Local table delete with a redundant WHERE clause: TRUE is ORed with a
-- comparison against a scalar subquery on the reference table t1.
DELETE FROM t0
where TRUE or (((t0.vkey) >= (select
          pg_catalog.regexp_count(ref_0.c5, ref_0.c6)
        from t1 as ref_0 where true)));

-- Local table t0 is now empty (0 rows)
SELECT vkey, pkey, c0 FROM t0;

-- Seed the distributed table t3 with a single row for the statements below.
INSERT INTO t3 (vkey, pkey, c9) values
(3, 13000, make_timestamp(2032, 9, 4, 13, 38, 0));

-- Distributed table update with redundant WHERE
-- (the IN subquery combines the reference table t1 with a join of the local
-- tables t0 and t7, both branches filtered by "where false", and is ORed
-- with TRUE)
UPDATE t3 set vkey = 117
where (((t3.pkey) in (select t1.vkey from t1 where false
        union all
        select t0.pkey from t0 join t7 on t0.pkey=t7.vkey where false
        )))
  or TRUE;

-- Show the row after the update.
SELECT vkey, pkey FROM t3;

-- Distributed table delete with redundant WHERE
-- (TRUE is ORed with a conjunction of two scalar-subquery comparisons, one
-- on the reference table t1 and one on the local table t0)
DELETE FROM t3
where TRUE or (((t3.vkey) >= (select
          pg_catalog.regexp_count(ref_0.c5, ref_0.c6)
        from  t1 as ref_0  where true)) and (select max(vkey) from t0) > 0);

-- Distributed table t3 is now empty
SELECT vkey, pkey FROM t3;

-- Redundant WHERE clause with a distributed partitioned table.
-- "a" is a single-row table that no create_* call is made for.
CREATE TABLE a (
    a int
);

INSERT INTO a (a)
VALUES
    (1);

-- populated distributed partitioned table: four range partitions covering
-- the values 1..160, distributed on column a
CREATE TABLE partitioned_table (
    a int UNIQUE
) PARTITION BY RANGE (a);

CREATE TABLE par_1 PARTITION OF partitioned_table
    FOR VALUES FROM (1) TO (41);

CREATE TABLE par_2 PARTITION OF partitioned_table
    FOR VALUES FROM (41) TO (81);

CREATE TABLE par_3 PARTITION OF partitioned_table
    FOR VALUES FROM (81) TO (121);

CREATE TABLE par_4 PARTITION OF partitioned_table
    FOR VALUES FROM (121) TO (161);

SELECT create_distributed_table('partitioned_table', 'a');

INSERT INTO partitioned_table (a)
SELECT i
FROM generate_series(1, 160) AS i;

-- test citus table in init plan
-- with redundant WHERE clause
SELECT
    CASE
        WHEN EXISTS (
            SELECT *
            FROM partitioned_table
        ) THEN 1
        ELSE 0
    END AS table_non_empty
FROM a
WHERE TRUE;

-- test citus table in init plan
-- with redundant WHERE clause involving
-- a citus table
SELECT
    CASE
        WHEN EXISTS (
            SELECT *
            FROM partitioned_table
        ) THEN 1
        ELSE 0
    END AS table_non_empty
FROM a
WHERE
    TRUE
    OR NOT EXISTS (
        SELECT 1
        FROM t1
    );

-- Clean up: drop the tables created in this file one by one, then the
-- schema itself (CASCADE also removes anything still left inside it).
-- The temporary view correlated_subquery_view is not dropped explicitly.
DROP TABLE local_table;
DROP TABLE t0;
DROP TABLE t1;
DROP TABLE t3;
DROP TABLE t7;
DROP TABLE a;
-- CASCADE here also drops the par_1 .. par_4 partitions.
DROP TABLE partitioned_table CASCADE;
DROP SCHEMA subquery_in_where CASCADE;
-- Point search_path back at public now that the schema is gone
-- (presumably for the benefit of whatever runs next in the same session).
SET search_path TO public;