citus/src/test/regress/expected/multi_outer_join_columns_1.out

--- Test for verifying that column references (Var nodes) in targets that cannot be pushed down
--- do not cause issues for the Postgres planner, in particular in Postgres versions 16+, where the
--- varnullingrels field of a Var node may contain relids of join relations that can make the Var
--- NULL; in a rewritten distributed query without a join such relids have no meaning.
-- This test has an alternative goldfile because of the following feature in Postgres 16:
-- https://github.com/postgres/postgres/commit/1349d2790bf48a4de072931c722f39337e72055e
--
-- Store the server version in the psql variable server_version (\gset), then
-- report whether its first run of digits is at least 16; this goldfile expects 'f'.
SHOW server_version \gset
SELECT substring(:'server_version', '\d+')::int >= 16 AS server_version_ge_16;
 server_version_ge_16
---------------------------------------------------------------------
 f
(1 row)

-- Work in a dedicated schema so that cleanup is a single DROP SCHEMA ... CASCADE.
CREATE SCHEMA outer_join_columns_testing;
SET search_path to 'outer_join_columns_testing';
-- Pin the next shard id; the shard names that appear in the expected plans
-- (t1_30070000, t2_30070004) start at this value.
SET citus.next_shard_id TO 30070000;
SET citus.shard_replication_factor TO 1;
SET citus.enable_local_execution TO ON;
-- t1 is the preserved (outer) side of every join in this file.
CREATE TABLE t1 (id INT PRIMARY KEY);
INSERT INTO t1 VALUES (1), (2);
-- t2 is the nullable side of every join; the row (5, 1, NULL) has a NULL a2.
CREATE TABLE t2 (id INT, account_id INT, a2 INT, PRIMARY KEY(id, account_id));
INSERT INTO t2 VALUES (3, 1, 10), (4, 2, 20), (5, 1, NULL);
-- Distribute t1 on id; the rows inserted above are copied (see the NOTICE output).
SELECT create_distributed_table('t1', 'id');
NOTICE:  Copying data from local table...
NOTICE:  copying the data has completed
DETAIL:  The local data in the table is no longer visible, but is still on disk.
HINT:  To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$outer_join_columns_testing.t1$$)
 create_distributed_table
---------------------------------------------------------------------

(1 row)

-- Distribute t2 on account_id, the column compared with t1.id in every join below.
SELECT create_distributed_table('t2', 'account_id');
NOTICE:  Copying data from local table...
NOTICE:  copying the data has completed
DETAIL:  The local data in the table is no longer visible, but is still on disk.
HINT:  To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$outer_join_columns_testing.t2$$)
 create_distributed_table
---------------------------------------------------------------------

(1 row)

-- Test the issue seen in #7705; a target expression with
-- a window function that cannot be pushed down because the
-- partition by is not on the distribution column also includes
-- a column from the inner side of a left outer join, which
-- produces a non-empty varnullingrels set in PG 16 (and higher)
-- The window partitions by t2.id, while t2 is distributed on account_id.
-- NOTE(review): there is no ORDER BY; the expected row order appears to rely on
-- the sort on the partition column shown in the plan below. Confirm it is stable.
SELECT  t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id;
 id | max
---------------------------------------------------------------------
  1 |  10
  2 |  20
  1 |
(3 rows)

-- Plan for the query above: each task returns plain columns from the shard join,
-- and the WindowAgg is evaluated above the Citus custom scan.
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT  t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id;
                                                                                                                                                                             QUERY PLAN
---------------------------------------------------------------------
 WindowAgg
   Output: remote_scan.id, max(remote_scan.max) OVER (?), remote_scan.worker_column_3
   ->  Sort
         Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max
         Sort Key: remote_scan.worker_column_3
         ->  Custom Scan (Citus Adaptive)
               Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max
               Task Count: 4
               Tasks Shown: One of 4
               ->  Task
                     Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery
                     Node: host=localhost port=xxxxx dbname=regression
                     ->  Hash Right Join
                           Output: t1.id, t2.a2, t2.id
                           Inner Unique: true
                           Hash Cond: (t2.account_id = t1.id)
                           ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                 Output: t2.id, t2.account_id, t2.a2
                           ->  Hash
                                 Output: t1.id
                                 ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                       Output: t1.id
(22 rows)

-- Same window query with the join written as t2 RIGHT OUTER JOIN t1;
-- t2 is still the nullable side.
SELECT  t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id)
FROM t2 RIGHT OUTER JOIN t1 ON t1.id = t2.account_id;
 id | max
---------------------------------------------------------------------
  1 |  10
  2 |  20
  1 |
(3 rows)

-- Plan for the RIGHT OUTER JOIN form; the task query keeps the RIGHT JOIN spelling.
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT  t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id)
FROM t2 RIGHT OUTER JOIN t1 ON t1.id = t2.account_id;
                                                                                                                                                                             QUERY PLAN
---------------------------------------------------------------------
 WindowAgg
   Output: remote_scan.id, max(remote_scan.max) OVER (?), remote_scan.worker_column_3
   ->  Sort
         Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max
         Sort Key: remote_scan.worker_column_3
         ->  Custom Scan (Citus Adaptive)
               Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max
               Task Count: 4
               Tasks Shown: One of 4
               ->  Task
                     Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t2_30070004 t2 RIGHT JOIN outer_join_columns_testing.t1_30070000 t1 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery
                     Node: host=localhost port=xxxxx dbname=regression
                     ->  Hash Right Join
                           Output: t1.id, t2.a2, t2.id
                           Inner Unique: true
                           Hash Cond: (t2.account_id = t1.id)
                           ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                 Output: t2.id, t2.account_id, t2.a2
                           ->  Hash
                                 Output: t1.id
                                 ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                       Output: t1.id
(22 rows)

-- DISTINCT applied to a target list that contains the window function.
-- NOTE(review): there is no ORDER BY, and the plan below deduplicates with a
-- HashAggregate. Confirm the expected row order is stable.
SELECT  DISTINCT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id;
 id | max
---------------------------------------------------------------------
  1 |
  1 |  10
  2 |  20
(3 rows)

-- Plan for the DISTINCT variant: a HashAggregate sits on top of the WindowAgg.
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT DISTINCT t1.id, MAX(t2.a2) OVER (PARTITION BY t2.id)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id;
                                                                                                                                                                                QUERY PLAN
---------------------------------------------------------------------
 HashAggregate
   Output: remote_scan.id, (max(remote_scan.max) OVER (?)), remote_scan.worker_column_3
   Group Key: remote_scan.id, max(remote_scan.max) OVER (?)
   ->  WindowAgg
         Output: remote_scan.id, max(remote_scan.max) OVER (?), remote_scan.worker_column_3
         ->  Sort
               Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max
               Sort Key: remote_scan.worker_column_3
               ->  Custom Scan (Citus Adaptive)
                     Output: remote_scan.worker_column_3, remote_scan.id, remote_scan.max
                     Task Count: 4
                     Tasks Shown: One of 4
                     ->  Task
                           Query: SELECT worker_column_1 AS id, worker_column_2 AS max, worker_column_3 FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2, t2.id AS worker_column_3 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery
                           Node: host=localhost port=xxxxx dbname=regression
                           ->  Hash Right Join
                                 Output: t1.id, t2.a2, t2.id
                                 Inner Unique: true
                                 Hash Cond: (t2.account_id = t1.id)
                                 ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                       Output: t2.id, t2.account_id, t2.a2
                                 ->  Hash
                                       Output: t1.id
                                       ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                             Output: t1.id
(25 rows)

-- Sequence used by the nextval() calls in the queries that follow; it starts at 101.
CREATE SEQUENCE test_seq START 101;
-- Stable SQL function returning its integer argument plus 42; the expected plan
-- for the query that calls it shows the body inlined as "+ 42".
CREATE OR REPLACE FUNCTION TEST_F(int) returns INT language sql stable as $$ select $1 + 42; $$ ;
-- Issue #7705 also occurs if a target expression includes a column
-- of a distributed table that is on the inner side of a left outer
-- join and a call to nextval(), because nextval() cannot be pushed
-- down, and must be run on the coordinator
-- The target passes t2.a2 + nextval() to TEST_F; the row with a NULL a2 yields NULL.
-- NOTE(review): ORDER BY t1.id leaves the two id = 1 rows tied, and the sequence
-- value each row receives depends on row order. Confirm the expected output is stable.
SELECT t1.id, TEST_F(t2.a2 + nextval('test_seq') :: int)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id
ORDER BY t1.id;
 id | test_f
---------------------------------------------------------------------
  1 |    153
  1 |
  2 |    165
(3 rows)

-- Plan: tasks return only t1.id and t2.a2; nextval() and the inlined "+ 42" are
-- evaluated in the Result node above the sort.
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT t1.id, TEST_F(t2.a2 + nextval('test_seq') :: int)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id
ORDER BY t1.id;
                                                                                                                                                         QUERY PLAN
---------------------------------------------------------------------
 Result
   Output: remote_scan.id, ((remote_scan.test_f + (nextval('test_seq'::regclass))::integer) + 42)
   ->  Sort
         Output: remote_scan.id, remote_scan.test_f
         Sort Key: remote_scan.id
         ->  Custom Scan (Citus Adaptive)
               Output: remote_scan.id, remote_scan.test_f
               Task Count: 4
               Tasks Shown: One of 4
               ->  Task
                     Query: SELECT worker_column_1 AS id, worker_column_2 AS test_f FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery
                     Node: host=localhost port=xxxxx dbname=regression
                     ->  Hash Right Join
                           Output: t1.id, t2.a2
                           Inner Unique: true
                           Hash Cond: (t2.account_id = t1.id)
                           ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                 Output: t2.id, t2.account_id, t2.a2
                           ->  Hash
                                 Output: t1.id
                                 ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                       Output: t1.id
(22 rows)

-- CASE target combining nextval() with t2.a2: a2 is returned when the sequence
-- value is even, otherwise the constant 1.
SELECT t1.id, CASE nextval('test_seq') % 2 = 0 WHEN true THEN t2.a2 ELSE 1 END
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id
ORDER BY t1.id;
 id | case
---------------------------------------------------------------------
  1 |   10
  1 |    1
  2 |   20
(3 rows)

-- Plan for the CASE query (spelled "%2" here rather than "% 2"): tasks return
-- t1.id and t2.a2, and the CASE is evaluated in the Result node above the sort.
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT t1.id, CASE nextval('test_seq') %2 = 0 WHEN true THEN t2.a2 ELSE 1 END
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id
ORDER BY t1.id;
                                                                                                                                                         QUERY PLAN
---------------------------------------------------------------------
 Result
   Output: remote_scan.id, CASE ((nextval('test_seq'::regclass) % '2'::bigint) = 0) WHEN CASE_TEST_EXPR THEN remote_scan."case" ELSE 1 END
   ->  Sort
         Output: remote_scan.id, remote_scan."case"
         Sort Key: remote_scan.id
         ->  Custom Scan (Citus Adaptive)
               Output: remote_scan.id, remote_scan."case"
               Task Count: 4
               Tasks Shown: One of 4
               ->  Task
                     Query: SELECT worker_column_1 AS id, worker_column_2 AS "case" FROM (SELECT t1.id AS worker_column_1, t2.a2 AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery
                     Node: host=localhost port=xxxxx dbname=regression
                     ->  Hash Right Join
                           Output: t1.id, t2.a2
                           Inner Unique: true
                           Hash Cond: (t2.account_id = t1.id)
                           ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                 Output: t2.id, t2.account_id, t2.a2
                           ->  Hash
                                 Output: t1.id
                                 ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                       Output: t1.id
(22 rows)

-- Issue #7787: count distinct of a column from the inner side of a
-- left outer join will have a non-empty varnullingrels in the query
-- tree returned by Postgres 16+, so ensure this is not reflected in
-- the worker subquery constructed by Citus; it has just one relation,
-- for the pushed down subquery.
-- a2 is unqualified; only t2 has such a column, so it is the nullable-side t2.a2.
SELECT COUNT(DISTINCT a2)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id;
 count
---------------------------------------------------------------------
     2
(1 row)

-- Plan: each task groups by a2, and the outer Aggregate computes count(DISTINCT ...)
-- over the values returned by the custom scan.
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT COUNT(DISTINCT a2)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id;
                                                                                                                                         QUERY PLAN
---------------------------------------------------------------------
 Aggregate
   Output: count(DISTINCT remote_scan.count)
   ->  Custom Scan (Citus Adaptive)
         Output: remote_scan.count
         Task Count: 4
         Tasks Shown: One of 4
         ->  Task
               Query: SELECT worker_column_1 AS count FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1
               Node: host=localhost port=xxxxx dbname=regression
               ->  HashAggregate
                     Output: t2.a2
                     Group Key: t2.a2
                     ->  Hash Right Join
                           Output: t2.a2
                           Inner Unique: true
                           Hash Cond: (t2.account_id = t1.id)
                           ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                 Output: t2.id, t2.account_id, t2.a2
                           ->  Hash
                                 Output: t1.id
                                 ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                       Output: t1.id
(22 rows)

-- Issue #7787 also occurs with a HAVING clause
-- HAVING without GROUP BY: the whole join result is one group, so at most one
-- row is returned; here COUNT(DISTINCT a2) appears only in the HAVING clause.
SELECT 1
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id
HAVING COUNT(DISTINCT a2) > 1;
 ?column?
---------------------------------------------------------------------
        1
(1 row)

-- Plan: a2 is returned by the tasks as the extra column worker_column_2, and the
-- HAVING condition becomes the Filter of the outer Aggregate.
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT 1
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id
HAVING COUNT(DISTINCT a2) > 1;
                                                                                                                                                QUERY PLAN
---------------------------------------------------------------------
 Aggregate
   Output: remote_scan."?column?"
   Filter: (count(DISTINCT remote_scan.worker_column_2) > 1)
   ->  Custom Scan (Citus Adaptive)
         Output: remote_scan."?column?", remote_scan.worker_column_2
         Task Count: 4
         Tasks Shown: One of 4
         ->  Task
               Query: SELECT 1, worker_column_1 AS worker_column_2 FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1
               Node: host=localhost port=xxxxx dbname=regression
               ->  HashAggregate
                     Output: 1, t2.a2
                     Group Key: t2.a2
                     ->  Hash Right Join
                           Output: t2.a2
                           Inner Unique: true
                           Hash Cond: (t2.account_id = t1.id)
                           ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                 Output: t2.id, t2.account_id, t2.a2
                           ->  Hash
                                 Output: t1.id
                                 ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                       Output: t1.id
(23 rows)

-- Check right outer join
-- Same COUNT(DISTINCT a2) with the join written as t2 RIGHT OUTER JOIN t1 and the
-- join condition operands swapped (t2.account_id = t1.id).
SELECT COUNT(DISTINCT a2)
FROM t2 RIGHT OUTER JOIN t1 ON t2.account_id = t1.id;
 count
---------------------------------------------------------------------
     2
(1 row)

-- Plan for the RIGHT OUTER JOIN form; the task query keeps the RIGHT JOIN spelling
-- and groups by a2 as in the LEFT JOIN case.
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT COUNT(DISTINCT a2)
FROM t2 RIGHT OUTER JOIN t1 ON t2.account_id = t1.id;
                                                                                                                                          QUERY PLAN
---------------------------------------------------------------------
 Aggregate
   Output: count(DISTINCT remote_scan.count)
   ->  Custom Scan (Citus Adaptive)
         Output: remote_scan.count
         Task Count: 4
         Tasks Shown: One of 4
         ->  Task
               Query: SELECT worker_column_1 AS count FROM (SELECT t2.a2 AS worker_column_1 FROM (outer_join_columns_testing.t2_30070004 t2 RIGHT JOIN outer_join_columns_testing.t1_30070000 t1 ON ((t2.account_id OPERATOR(pg_catalog.=) t1.id)))) worker_subquery GROUP BY worker_column_1
               Node: host=localhost port=xxxxx dbname=regression
               ->  HashAggregate
                     Output: t2.a2
                     Group Key: t2.a2
                     ->  Hash Right Join
                           Output: t2.a2
                           Inner Unique: true
                           Hash Cond: (t2.account_id = t1.id)
                           ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                 Output: t2.id, t2.account_id, t2.a2
                           ->  Hash
                                 Output: t1.id
                                 ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                       Output: t1.id
(22 rows)

-- Check both count distinct and having clause
-- COUNT(DISTINCT ...) on nullable-side columns in both places: a2 in the target
-- list and t2.id in the HAVING clause.
SELECT COUNT(DISTINCT a2)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id
HAVING COUNT(DISTINCT t2.id) > 1;
 count
---------------------------------------------------------------------
     2
(1 row)

-- Plan: tasks group by both a2 and t2.id; the outer Aggregate counts distinct a2
-- values and filters on the distinct count of worker_column_2 (t2.id).
EXPLAIN (VERBOSE, COSTS OFF, TIMING OFF)
SELECT COUNT(DISTINCT a2)
FROM t1 LEFT OUTER JOIN t2 ON t1.id = t2.account_id
HAVING COUNT(DISTINCT t2.id) > 1;
                                                                                                                                                                       QUERY PLAN
---------------------------------------------------------------------
 Aggregate
   Output: count(DISTINCT remote_scan.count)
   Filter: (count(DISTINCT remote_scan.worker_column_2) > 1)
   ->  Custom Scan (Citus Adaptive)
         Output: remote_scan.count, remote_scan.worker_column_2
         Task Count: 4
         Tasks Shown: One of 4
         ->  Task
               Query: SELECT worker_column_1 AS count, worker_column_2 FROM (SELECT t2.a2 AS worker_column_1, t2.id AS worker_column_2 FROM (outer_join_columns_testing.t1_30070000 t1 LEFT JOIN outer_join_columns_testing.t2_30070004 t2 ON ((t1.id OPERATOR(pg_catalog.=) t2.account_id)))) worker_subquery GROUP BY worker_column_1, worker_column_2
               Node: host=localhost port=xxxxx dbname=regression
               ->  HashAggregate
                     Output: t2.a2, t2.id
                     Group Key: t2.a2, t2.id
                     ->  Hash Right Join
                           Output: t2.a2, t2.id
                           Inner Unique: true
                           Hash Cond: (t2.account_id = t1.id)
                           ->  Seq Scan on outer_join_columns_testing.t2_30070004 t2
                                 Output: t2.id, t2.account_id, t2.a2
                           ->  Hash
                                 Output: t1.id
                                 ->  Seq Scan on outer_join_columns_testing.t1_30070000 t1
                                       Output: t1.id
(23 rows)

--- cleanup
-- With terse verbosity only the one-line NOTICE of the cascading drop is expected.
\set VERBOSITY TERSE
DROP SCHEMA outer_join_columns_testing CASCADE;
NOTICE:  drop cascades to 4 other objects
-- Return all session settings changed by this file to their defaults.
RESET all;