Compare commits

...

95 Commits

Author SHA1 Message Date
Hanefi Onaldi 11db00990f
Bump Citus version to 10.0.8 2023-04-26 16:05:20 +03:00
Hanefi Onaldi bd5544fdc5
Add missing entry for 10.0.8 2023-04-26 16:05:20 +03:00
Hanefi Onaldi e5e50570b3
Add changelog entries for 10.0.8 2023-04-26 13:31:50 +03:00
aykut-bozkurt 758cda1394 fix single tuple result memory leak (#6724)
We should not omit freeing the PGresult when we receive a single-tuple
result from an internal backend.
Single-tuple results are normally freed by our ReceiveResults in the
`tupleDescriptor != NULL` flow, but not in the `tupleDescriptor == NULL`
flow. See PR #6722 for details.

DESCRIPTION: Fixes a memory leak issue with query results that return a
single row.

(cherry picked from commit 9e69dd0e7f)
2023-02-17 14:41:30 +03:00
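A minimal sketch of the fixed pattern, assuming a simplified drain loop over libpq results; `DrainResults` and its flag are illustrative stand-ins for the actual ReceiveResults flow:

```c
#include <libpq-fe.h>

/* Sketch only: simplified stand-in for the ReceiveResults flow. */
static void
DrainResults(PGconn *conn, bool hasTupleDescriptor)
{
	PGresult *result = NULL;

	while ((result = PQgetResult(conn)) != NULL)
	{
		if (!hasTupleDescriptor)
		{
			/* the leaked path: this branch used to skip PQclear() */
			PQclear(result);
			continue;
		}

		/* ... copy tuples out of result into executor slots ... */
		PQclear(result);
	}
}
```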
Halil Ozan Akgul 8b4bab14a2 Fixes the bug where undistribute can drop the Citus extension
(cherry picked from commit b255706189)

 Conflicts:
	src/backend/distributed/commands/alter_table.c
	src/include/distributed/metadata/dependency.h
2022-06-13 16:36:59 +03:00
jeff-davis 115f2c124a Columnar: fix wraparound bug. (#5962)
columnar_vacuum_rel() now advances relfrozenxid.

Fixes #5958.

(cherry picked from commit 74ce210f8b)
2022-05-27 09:33:18 -07:00
Onur Tirtir 4b7af5aaaf Fix coordinator/worker query targetlists for agg. that we cannot push-down (#5679)
Previously, we were wrapping targetlist nodes with Vars that reference
the result of the worker query whenever the node itself was not a `Const`
or a `Param`. Instead, we should only do that when the node itself is a
`Var` node or contains a `Var` within it (e.g.: `OpExpr(Var(column_a) > 2)`).
Otherwise, when the worker query returns an empty result set, the combine
query execution would crash, since the `Var` would point to an empty
tuple slot, which the node-executor methods cannot handle.

(cherry picked from commit 79442df1b7)
2022-02-07 11:40:39 +03:00
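A hedged sketch of the corrected check; `ShouldWrapInWorkerReferenceVar` is an illustrative name (not the actual Citus helper), using PostgreSQL's `pull_var_clause()` to detect Vars buried inside expressions:

```c
#include "postgres.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/optimizer.h"    /* pull_var_clause() on PG >= 12 */

/* Illustrative helper: wrap only nodes that are, or contain, a Var. */
static bool
ShouldWrapInWorkerReferenceVar(Node *node)
{
	if (IsA(node, Var))
	{
		return true;
	}

	/* e.g. OpExpr(Var(column_a) > 2) contains a Var and must be wrapped */
	List *varList = pull_var_clause(node,
									PVC_RECURSE_AGGREGATES |
									PVC_RECURSE_WINDOWFUNCS |
									PVC_RECURSE_PLACEHOLDERS);
	return list_length(varList) > 0;
}
```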
Onur Tirtir 6b87b3ea27 Skip deleting options if columnar.options is already dropped (#5458)
Drop extension might cascade to columnar.options before dropping a
columnar table. In that case, we were getting the error below when opening
columnar.options to delete records for the columnar table that we were
about to drop: "ERROR:  could not open relation with OID 0".

I reproduced this bug easily when upgrading pg, which is why I added
the test to after_pg_upgrade_schedule.

(cherry picked from commit 25024b776e)

 Conflicts:
	src/test/regress/after_pg_upgrade_schedule
	src/test/regress/expected/upgrade_columnar_after.out
	src/test/regress/sql/upgrade_columnar_after.sql
2021-11-12 15:17:59 +03:00
Hanefi Onaldi d13b989cff
Bump Citus version to 10.0.6 2021-11-12 14:14:33 +03:00
Hanefi Onaldi 0f62f1a93a
Add changelog entries for 10.0.6
(cherry picked from commit 45549d20a6)
2021-11-12 14:12:16 +03:00
Sait Talha Nisanci 83585e32f9 Adjust tests for release-10.0 2021-11-08 12:51:23 +03:00
Sait Talha Nisanci 0f1f55c287 Fix missing from entry
(cherry picked from commit a0e0759f73)
2021-11-08 12:39:23 +03:00
Onder Kalaci 3e8348c29e Deparse/parse the local cached queries
With local query caching, we try to avoid the deparse/parse stages, as the
operation is too costly.

However, we can do the deparse/parse operations once per cached query, right
before we put the plan into the cache. With that, we avoid edge
cases like (4239) or (5038).

In a sense, we make local plan caching behave the same as non-cached
local/remote queries, by forcing the query to be deparsed once.

(cherry picked from commit 69ca943e58)
2021-11-08 12:38:11 +03:00
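A rough sketch of that ordering; every name below is illustrative rather than the actual Citus API:

```c
/* Sketch only; all three helpers below are hypothetical names. */
static PlannedStmt *
BuildCachedLocalPlanSketch(Query *jobQuery)
{
	/* pay the deparse/parse cost exactly once, right before caching ... */
	char *queryString = DeparseShardQuerySketch(jobQuery);       /* hypothetical */
	Query *reparsedQuery = ParseQueryStringSketch(queryString);  /* hypothetical */

	/* ... so the cached plan took the same deparse/parse path that
	 * non-cached local and remote execution always take */
	return StandardPlannerSketch(reparsedQuery);                 /* hypothetical */
}
```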
Nils Dijk 537618aaed
reinstate optimization that got unintentionally broken in 366461ccdb (#5418)
DESCRIPTION: Reinstate optimisation for uniform shard interval ranges

During a refactor introduced in #4132 the following change was made, which made the optimisation in `CalculateUniformHashRangeIndex` unreachable: 
366461ccdb (diff-565a339ed3c78bc5a0d4ffeb4e91032150b1dffbeeff59cd3e65981d20b998c7L319-R319)

This PR reinstates the path to the optimisation!
2021-11-05 13:09:25 +01:00
Onur Tirtir 6c989830d2 Add CheckCitusVersion() calls to columnarAM (#5308)
Considering all code paths through which we might interact with a columnar
table, add `CheckCitusVersion` calls to the tableAM callbacks:
- initializing table scan (`columnar_beginscan` & `columnar_index_fetch_begin`)
- setting a new filenode for a relation (storage initialization or a table rewrite)
- truncating the storage
- inserting tuple (single and multi)

Also add `CheckCitusVersion` call to:
- drop hook (`ColumnarTableDropHook`)
- `alter_columnar_table_set` & `alter_columnar_table_reset` UDFs
(cherry picked from commit f8b1ff7214)

 Conflicts:
	src/backend/columnar/cstore_tableam.c
	src/test/regress/expected/multi_extension.out
	src/test/regress/sql/multi_extension.sql

 Note: Not applying multi_extension.sql/out changes to 10.0 since
       the previous Citus version (9.5) doesn't have columnarAM.
2021-09-20 17:45:27 +03:00
Marco Slot 259511746e Small fix to PG12 compatibility 2021-09-10 14:21:03 +02:00
Marco Slot bd245b5fbb Avoid switch to superuser in worker_merge_files_into_table 2021-09-10 13:25:52 +02:00
Marco Slot 25c71fb3d0 Add worker_append_table_to_shard permissions tests 2021-09-10 13:25:52 +02:00
Marco Slot 28a503fad9 Perform copy command as regular user in worker_append_table_to_shard 2021-09-10 13:25:52 +02:00
Onur Tirtir 30b46975b8 Do not read heaptuple after closing pg_rewrite (#5255)
(cherry picked from commit cc49e63222)
2021-09-08 16:02:05 +03:00
Hanefi Onaldi 5f5e5ef471
Bump Citus version to 10.0.5 2021-08-17 07:45:37 +03:00
Hanefi Onaldi 5a1036e361
Add changelog entries for 10.0.5
(cherry picked from commit 167a023770)
2021-08-16 17:38:54 +03:00
Onder Kalaci 6de2a09d79 Guard against hard WaitEventSet errors
In short, add wrappers around Postgres' AddWaitEventToSet() and
ModifyWaitEvent().

AddWaitEventToSet()/ModifyWaitEvent*() may throw hard errors, for
example when the underlying socket for a connection has been closed by
the remote server and the OS already reflects that, but
Citus hasn't had a chance to learn about it. In that case,
if the replication factor is >1, Citus can fail over to other nodes
for executing the query. Even if the replication factor is 1, Citus
can give much nicer errors.

So CitusAddWaitEventSetToSet()/CitusModifyWaitEvent() simply put
AddWaitEventToSet()/ModifyWaitEvent() into a PG_TRY/PG_CATCH block
in order to catch any hard errors, and return this information to
the caller.
2021-08-10 09:38:09 +02:00
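A sketch of the wrapper idea against PostgreSQL's latch API; the real Citus function carries more bookkeeping and also wraps ModifyWaitEvent():

```c
#include "postgres.h"
#include "storage/latch.h"

/* Sketch: return -1 instead of throwing when the kernel rejects the fd. */
static int
CitusAddWaitEventSetToSetSketch(WaitEventSet *set, uint32 events, pgsocket fd,
								Latch *latch, void *user_data)
{
	int waitEventSetIndex = -1;

	PG_TRY();
	{
		waitEventSetIndex = AddWaitEventToSet(set, events, fd, latch, user_data);
	}
	PG_CATCH();
	{
		/* e.g. the remote end already closed the socket; swallow the hard
		 * error so the caller can fail over (replication factor > 1) or
		 * produce a friendlier error */
		FlushErrorState();
		waitEventSetIndex = -1;
	}
	PG_END_TRY();

	return waitEventSetIndex;
}
```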
Onder Kalaci d485003807 Adjust the tests to earlier versions
- Drop PRIMARY KEY for Citus 10 compatibility
- Drop columnar for PG 12
- Do not start/stop metadata sync as stop is not implemented in 10.1
- PG 11 parallel query changes explain outputs
2021-08-06 16:38:01 +02:00
Onder Kalaci 32124efd83 Dropped columns do not diverge distribution column for partitioned tables
Before this commit, creating a partition after a DROP column
on the parent (for a column positioned before the dist. key) was
leading the partition to have the wrong distribution column.
2021-08-06 13:42:06 +02:00
naisila c84d1d9e70 Fix master_update_table_statistics scripts for 9.5 2021-08-03 16:45:46 +03:00
naisila b46f8874d3 Fix master_update_table_statistics scripts for 9.4 2021-08-03 16:45:46 +03:00
Hanefi Onaldi 1492bd1e8b
Bump Citus to 10.0.4 2021-07-14 16:03:45 +03:00
Hanefi Onaldi 4082fab0c9
Add changelog entry for 10.0.4
(cherry picked from commit 45b72c204d)
2021-07-14 15:49:44 +03:00
Hanefi Onaldi 4ca544200c
Use ONLY keywords on PG11 deparser 2021-07-13 17:27:34 +03:00
Marco Slot e58b78f1e8
Fix FROM ONLY queries on partitioned tables
(cherry picked from commit 4b49cb112f)
2021-07-13 17:27:33 +03:00
jeff-davis f526eec6a8 Columnar: use clause Vars for chunk group filtering. (#4856)
* Columnar: use clause Vars for chunk group filtering.

This solves #4780 and also provides a cleaner separation between chunk
group filtering and projection pushdown.

* Columnar: sort and deduplicate Vars pulled from clauses.

* Columnar: cleanup variable names.

* Columnar: remove alternate test output.

* Columnar: do not recurse when looking for whereClauseVars.

Co-authored-by: Jeff Davis <jefdavi@microsoft.com>
(cherry picked from commit 063e673038)
2021-07-13 12:01:57 -07:00
SaitTalhaNisanci 5759233f15
Warm up connections params hash (#4872)
ConnParams (AuthInfo and PoolInfo) gets a snapshot, which will block the
remote connections to localhost, and the release of the snapshot is in
turn blocked by those connections. This leads to a deadlock.

We warm up the conn params hash before starting a new transaction, so
that the entries are already there when the new transaction starts.
Hence GetConnParams will not get a snapshot.

(cherry picked from commit b453563e88)
2021-07-13 11:30:15 +03:00
Hanefi Onaldi 6640c76bde
Switch to sequential mode on long partition names
This commit adds support for long partition names for distributed tables:
- ALTER TABLE dist_table ATTACH PARTITION ..
- CREATE TABLE .. PARTITION OF dist_table ..

Note: create_distributed_table UDF does not support long table and
partition names, and is not covered in this commit

(cherry picked from commit 9919fbe3f8)
2021-07-13 08:06:58 +03:00
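A hedged sketch of the check, borrowing the helper signature that appears in the alter_table.c diff further down; the length test and the mode-switch call are assumptions:

```c
#include "postgres.h"

/* Sketch: if the longest shard name reaches NAMEDATALEN (i.e. names would
 * be truncated at 63 bytes), fall back to sequential + local execution so
 * name truncation stays deterministic across placements. */
static void
SwitchIfShardNameTooLongSketch(char *relationName, char *longestShardName)
{
	(void) relationName;    /* used for the user-facing notice in real code */

	if (strlen(longestShardName) >= NAMEDATALEN)
	{
		SetLocalMultiShardModifyModeToSequential();   /* assumed helper */
	}
}
```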
Sait Talha Nisanci 11d5d21fd8
Call LockPlacementCleanup in RemoveOldShardPlacementForNodeGroup 2021-07-13 05:30:51 +03:00
SaitTalhaNisanci 4fbed90505
Fix data-race with concurrent calls of DropMarkedShards (#4909)
* Fix problems with concurrent calls of DropMarkedShards

When trying to enable `citus.defer_drop_after_shard_move` by default it
turned out that DropMarkedShards was not safe to call concurrently.
This could especially cause big problems when also moving shards at the
same time. During tests it was possible to trigger a state where a shard
that was moved would not be available on any of the nodes anymore after
the move.

Currently DropMarkedShards is only called in production by the
maintenance daemon. Since that is a single process, triggering such
a race is currently impossible in production settings. In future changes
we will want to call DropMarkedShards from other places too, though.

* Add some isolation tests

Co-authored-by: Jelte Fennema <github-tech@jeltef.nl>
(cherry picked from commit 93c2dcf3d2)
2021-07-13 05:30:51 +03:00
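A sketch of the serialization, tying in the `LockPlacementCleanup` commit above; the marked-shard scan is paraphrased:

```c
/* Sketch: serialize concurrent cleanup with a self-conflicting lock so two
 * DropMarkedShards calls (or a concurrent shard move) cannot both drop the
 * same placement. */
static void
DropMarkedShardsSketch(void)
{
	LockPlacementCleanup();   /* per the commit above; waits for peers */

	/*
	 * ... scan pg_dist_placement for placements marked for deferred drop
	 * and remove them, knowing no concurrent caller races with us ...
	 */
}
```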
Ahmet Gedemenli 7214673a9f Fix test output for cherry-picked commits for 10.0 2021-07-12 16:42:15 +03:00
Ahmet Gedemenli 79a274e226 Fix relname null bug when parallel execution
(cherry picked from commit 69d39c0e8b)
2021-07-12 16:42:15 +03:00
Ahmet Gedemenli dd2dfac198 Remove function GenerateNewTargetEntriesForSortClauses
(cherry picked from commit 9638933d9d)
2021-07-12 16:42:15 +03:00
Sait Talha Nisanci 3bcfadf2f1 update cluster test
(cherry picked from commit 3218e34be9)
2021-07-12 12:18:05 +03:00
Sait Talha Nisanci f5a7858ab9 Do not consider old placements when disabling or removing a node
(cherry picked from commit 73c58b6160)
2021-07-12 11:50:39 +03:00
Hanefi Onaldi d7b90e0804
Remove public schema dependency for 10.0 upgrades
This commit contains a subset of the changes that should be cherry
picked to 10.0 releases.

(cherry picked from commit 8e9cc229ff)
2021-07-09 11:55:32 +03:00
Nils Dijk 74985a0977 fix 9.5-2 upgrade script to adhere to idempotency 2021-07-08 12:25:26 +02:00
Nils Dijk 57a52b01a2 Add test for idempotency of citus_prepare_pg_upgrade 2021-07-08 12:25:26 +02:00
Onur Tirtir c24088e12f Fix lower boundary calculation when pruning range dist table shards (#5082)
This happens only when we have a "<" or "<=" filter on the distribution
column of a range distributed table and that filter falls in between
two shards.

When the filter falls in between two shards:

  If the filter is ">" or ">=", then UpperShardBoundary was
  returning "upperBoundIndex - 1", where upperBoundIndex is
  exclusive shard index used during binary seach.
  This is expected since upperBoundIndex is an exclusive
  index.

  If the filter is "<" or "<=", then LowerShardBoundary was
  returning "lowerBoundIndex + 1", where lowerBoundIndex is
  inclusive shard index used during binary seach.
  On the other hand, since lowerBoundIndex is an inclusive
  index, we should just return lowerBoundIndex instead of
  doing "+ 1". Before this commit, we were missing leftmost
  shard in such queries.

* Remove useless conditional branches

The branch that we delete from UpperShardBoundary was obviously useless.

The other one, in LowerShardBoundary, became useless after we removed the
"+ 1" there.

This is further evidence of what this PR fixes, and how.

* Improve comments and add more

* Add some tests for upper bound calculation too

(cherry picked from commit b118d4188e)
2021-07-07 13:13:50 +03:00
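The index convention, restated as a sketch with illustrative names:

```c
/* The binary search yields an inclusive lowerBoundIndex and an exclusive
 * upperBoundIndex over the sorted shard interval array. */
static int
LowerShardBoundaryIndexSketch(int lowerBoundIndex)
{
	/* inclusive already; the old "+ 1" skipped the leftmost matching shard
	 * for "<" / "<=" filters that fall between two shards */
	return lowerBoundIndex;
}

static int
UpperShardBoundaryIndexSketch(int upperBoundIndex)
{
	/* exclusive, so stepping back one is correct for ">" / ">=" filters */
	return upperBoundIndex - 1;
}
```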
Nils Dijk 9a2227c70d Bump use of new sql function 2021-07-05 16:14:52 +02:00
Marco Slot 826ac1b099 Fix PG upgrade scripts for 10.0 2021-07-05 16:14:52 +02:00
Marco Slot d9514fa697 Fix PG upgrade scripts for 9.5 2021-07-05 16:14:52 +02:00
Marco Slot 2f27325b15 Fix PG upgrade scripts for 9.4 2021-07-05 16:14:52 +02:00
Jelte Fennema f41b5060f0 Avoid two race conditions in the rebalance progress monitor (#5050)
The first and main issue was that we were putting absolute pointers into
shared memory for the `steps` field of the `ProgressMonitorData`. This
pointer was being overwritten every time a process requested the monitor
steps, which is the only reason why this even worked in the first place.

To quote a part of a relevant stack overflow answer:

> First of all, putting absolute pointers in shared memory segments is
> terrible terible idea - those pointers would only be valid in the
> process that filled in their values. Shared memory segments are not
> guaranteed to attach at the same virtual address in every process.
> On the contrary - they attach where the system deems it possible when
> `shmaddr == NULL` is specified on call to `shmat()`

Source: https://stackoverflow.com/a/10781921/2570866

In this case a race condition occurred when a second process overwrote
the pointer in between the first process's write and its read of the
steps field.

This issue is fixed by not storing the pointer in shared memory anymore.
Instead, we now calculate its position every time we need it.

I have not been able to trigger the second race condition, but I found
it while investigating this one. The issue was that we published the
handle of the shared memory segment before we initialized the data in
the steps. This means that during initialization of the data, a call to
`get_rebalance_progress()` could read partial data in an unsynchronized
manner.

(cherry picked from commit ca00b63272)
2021-06-21 16:42:10 +02:00
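A sketch of the fix under the constraint quoted above (segments may attach at different addresses in each process); the struct layout is an assumption:

```c
#include <stddef.h>

/* Assumed layout: a fixed-size header followed by its step array in the
 * same shared memory segment. */
typedef struct ProgressMonitorHeaderSketch
{
	int stepCount;
	/* step entries follow immediately after the header */
} ProgressMonitorHeaderSketch;

/* Recompute the steps address from this process's own mapping instead of
 * trusting an absolute pointer some other process wrote into the segment. */
static inline void *
MonitorStepsSketch(ProgressMonitorHeaderSketch *header)
{
	return (char *) header + sizeof(ProgressMonitorHeaderSketch);
}
```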
Nils Dijk 823ede78ab
Feature: localhost guc (#4836)
DESCRIPTION: introduce `citus.local_hostname` GUC for connections to the current node

Citus once in a while needs to connect to itself for some system operations. This used to be hardcoded to `localhost`. The hardcoded hostname causes some issues, for example in environments where `sslmode=verify-full` is required. It is not always desirable or even feasible to get `localhost` as an alt name on the certificate.

By introducing a GUC to use when connecting to the current instance, the user has more control over which network path is used and which hostname needs to be present in the server certificate.
2021-06-01 13:18:15 +02:00
Ahmet Gedemenli 2ea3618f22 Add test for public shard not found issue
(cherry picked from commit 48a6a5b128)
2021-06-01 10:50:26 +03:00
Ahmet Gedemenli 88825b89a1 Fix tests for public schema
(cherry picked from commit d530d79d73)
2021-06-01 10:50:26 +03:00
Ahmet Gedemenli a216c6b62c Remove redundant if statement for schema name
(cherry picked from commit 840c879572)
2021-06-01 10:50:26 +03:00
Sait Talha Nisanci fcb932268a Bump version to 10.0.3 2021-03-17 18:02:01 +03:00
Sait Talha Nisanci 1200c8fd1c Update CHANGELOG for 10.0.3
(cherry picked from commit 92130ae2a2)
2021-03-17 18:01:57 +03:00
Önder Kalacı 0237d826d5 Make sure that single task local executions start coordinated transaction (#4831)
With https://github.com/citusdata/citus/pull/4806 we enabled
2PC for any non-read-only local task. However, if the execution
is a single task, enabling 2PC (CoordinatedTransactionShouldUse2PC)
hits an assertion as we are not in a coordinated transaction.

There is no downside to using a coordinated transaction for single-task
local queries.
2021-03-17 14:56:28 +03:00
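A two-line sketch of the ordering; `CoordinatedTransactionShouldUse2PC` is named in the commit message above, while `UseCoordinatedTransaction` is an assumed helper name:

```c
/* Sketch: enter a coordinated transaction before flagging 2PC, so a single
 * non-read-only local task no longer trips the assertion. */
UseCoordinatedTransaction();            /* assumed helper; no-op if already coordinated */
CoordinatedTransactionShouldUse2PC();   /* named in the commit message above */
```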
Ahmet Gedemenli e54b253713 Add udf citus_get_active_worker_nodes
(cherry picked from commit 5e5db9eefa)
2021-03-17 14:56:28 +03:00
Marco Slot 61efc87c53 Replace MAX_PUT_COPY_DATA_BUFFER_SIZE by citus.remote_copy_flush_threshold GUC
(cherry picked from commit fbc2147e11)
2021-03-17 07:35:46 +03:00
Marco Slot f5608c2769 Add GUC to set maximum connection lifetime
(cherry picked from commit 1646fca445)
2021-03-17 07:35:46 +03:00
Marco Slot ecf0f2fdbf Remove unnecessary AtEOXact_Files call
(cherry picked from commit 6c5d263b7a)
2021-03-16 10:01:14 +03:00
Onder Kalaci 0a09551dab Rename use -> shouldUse
Because setting the flag doesn't necessarily mean that we'll
use 2PC. If connections are read-only, we will not use 2PC.
In other words, we'll use 2PC only for connections that modified
any placements.

(cherry picked from commit e65e72130d)
2021-03-16 10:01:14 +03:00
Onder Kalaci 0805ef9c79 Do not trigger 2PC for reads on local execution
Before this commit, Citus used 2PC no matter what kind of
local query execution happens.

For example, if the coordinator has shards (and the workers as well),
even a simple SELECT query could start 2PC:
```SQL

WITH cte_1 AS (SELECT * FROM test LIMIT 10) SELECT count(*) FROM cte_1;
```

In this query, the local execution of the shards (and also intermediate
result reads) triggers the 2PC.

To prevent that, Citus now distinguishes local reads and local writes.
And, Citus switches to 2PC only if a modification happens. This may
still lead to unnecessary 2PCs when there is a local modification
and remote SELECTs only. Though, we handle that separately
via #4587.

(cherry picked from commit 6a7ed7b309)
2021-03-16 10:01:14 +03:00
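A sketch of the read/write distinction; `ConnectionModifiedPlacement()` is referenced in the #4587 commit below, while the surrounding loop and headers are illustrative:

```c
#include "postgres.h"
#include "nodes/pg_list.h"
#include "distributed/connection_management.h"   /* MultiConnection */

/* Sketch: escalate to 2PC only if some connection modified a placement. */
static void
Decide2PCSketch(List *connectionList)
{
	ListCell *connectionCell = NULL;

	foreach(connectionCell, connectionList)
	{
		MultiConnection *connection = (MultiConnection *) lfirst(connectionCell);

		if (ConnectionModifiedPlacement(connection))
		{
			/* a local or remote write happened somewhere: use 2PC */
			CoordinatedTransactionShouldUse2PC();
			return;
		}
	}

	/* pure reads (like the CTE SELECT above) fall through: no 2PC */
}
```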
Naisila Puka a6435b7f6b Fix upgrade and downgrade paths for master/citus_update_table_statistics (#4805)
(cherry picked from commit 71a9f45513)
2021-03-16 10:01:09 +03:00
Marco Slot f13cf336f2 Add tests for modifying CTE and SELECT without FROM
(cherry picked from commit 9c0d7f5c26)
2021-03-16 09:44:00 +03:00
Marco Slot 46e316881b Fixes a crash in queries with a modifying CTE and a SELECT without FROM
(cherry picked from commit 58f85f55c0)
2021-03-16 09:43:24 +03:00
Onur Tirtir 18ab327c6c Add tests for concurrent index deadlock issue (#4775)
(cherry picked from commit 9728ce1167)
2021-03-16 09:42:21 +03:00
Hadi Moshayedi 61a89c69cd Populate DATABASEOID cache before CREATE INDEX CONCURRENTLY
(cherry picked from commit affe38eac6)
2021-03-16 09:41:19 +03:00
Marco Slot ad9469b351 Try to return earlier in idempotent master_add_node
(cherry picked from commit f25de6a0e3)
2021-03-16 09:40:43 +03:00
Onder Kalaci 4121788848 Pass pointer of AttributeEquivalenceClass instead of pointer of pointer
AttributeEquivalenceClass seems to be unnecessarily used with multiple
pointers. Just use a single pointer for readability.

(cherry picked from commit 54ee96470e)
2021-03-16 09:40:07 +03:00
Onder Kalaci e9bf5fa235 Prevent infinite recursion for queries that involve UNION ALL and JOIN
With this commit, we make sure to prevent infinite recursion for queries
in the format: [subquery with a UNION ALL] JOIN [table or subquery]

Also fixes a bug where we push down a UNION ALL below a JOIN even if the
UNION ALL is not safe to push down.

(cherry picked from commit d1cd198655)
2021-03-16 09:39:59 +03:00
Naisila Puka 18c7a3c188 Skip 2PC for readonly connections in a transaction (#4587)
* Skip 2PC for readonly connections in a transaction

* Use ConnectionModifiedPlacement() function

* Remove the second check of ConnectionModifiedPlacement()

* Add order by to prevent flaky output

* Test using pg_dist_transaction

(cherry picked from commit 196064836c)
2021-03-16 09:31:18 +03:00
Halil Ozan Akgül 85a87af11c Update CHANGELOG for 10.0.2
(cherry picked from commit c2a9706203)

 Conflicts:
	CHANGELOG.md
2021-03-03 17:26:26 +03:00
Hanefi Onaldi 115fa950d3 Do not use security flags by default (#4770)
(cherry picked from commit 697bbbd3c6)
2021-03-03 13:20:05 +03:00
Naisila Puka 445291d94b Reimplement citus_update_table_statistics to detect dist. deadlocks (#4752)
* Reimplement citus_update_table_statistics

* Update stats for the given table not colocation group

* Add tests for reimplemented citus_update_table_statistics

* Use coordinated transaction, merge with citus_shard_sizes functions

* Update the old master_update_table_statistics as well

(cherry picked from commit 2f30614fe3)
2021-03-03 11:41:31 +03:00
Hanefi Onaldi 28f1c2129d Add security flags in configure scripts (#4760)
(cherry picked from commit f87107eb6b)
2021-03-03 11:41:00 +03:00
Marco Slot 205b8ec70a Normalize the ConvertTable notices
(cherry picked from commit dca615c5aa)
2021-03-03 11:40:38 +03:00
Halil Ozan Akgul 6fa25d73be Bump version to 10.0.2 2021-03-01 17:04:24 +03:00
SaitTalhaNisanci bfb1ca6d0d Use translated vars in postgres 13 as well (#4746)
* Use translated vars in postgres 13 as well

Postgres 13 removed translated vars, so we had special logic
for PG 13. However, it had a bug, so now we copy the translated vars
before Postgres deletes them. This also simplifies the logic.

* fix rtoffset with pg >= 13

(cherry picked from commit feee25dfbd)
2021-03-01 15:18:32 +03:00
Halil Ozan Akgul b355f0d9a2 Adds GRANT for public to citus_tables
(cherry picked from commit 5c5cb200f7)
2021-03-01 15:15:34 +03:00
Önder Kalacı fdcb6ead43 Prevent cross join without any target list entries (#4750)
/*
 * The physical planner assumes that all worker queries would have
 * target list entries based on the fact that at least the column
 * on the JOINs have to be on the target list. However, there is
 * an exception to that if there is a cartesian product join and
 * there is no additional target list entries belong to one side
 * of the JOIN. Once we support cartesian product join, we should
 * remove this error.
 */

(cherry picked from commit 0fe26a216c)
2021-03-01 15:13:26 +03:00
Onur Tirtir 3fcb011b67 Grant read access for columnar metadata tables to unprivileged user
(cherry picked from commit 54ac924bef)
2021-03-01 15:02:57 +03:00
Halil Ozan Akgul 8228815b38 Add 10.0-2 schema version
(cherry-picked from dcc0207605)
2021-03-01 14:58:41 +03:00
Onur Tirtir 270234c7ff Ensure table owner when using alter_columnar_table_set/alter_columnar_table_reset (#4748)
(cherry picked from commit 5ed954844c)
2021-03-01 14:38:19 +03:00
Naisila Puka 3131d3e3c5 Preserve colocation with procedures in alter_distributed_table (#4743)
(cherry picked from commit 5ebd4eac7f)
2021-03-01 14:36:52 +03:00
Hanefi Onaldi a7f9dfc3f0 Fix flaky test
(cherry picked from commit 5aff18b573)
2021-03-01 13:18:22 +03:00
Hanefi Onaldi 049cd55346 Remove length limitations for table renames
(cherry picked from commit 9a792ef841)
2021-03-01 13:18:05 +03:00
Hanefi Onaldi 27ecb5cde2 Failing long table name tests
(cherry picked from commit 7bebeb872d)
2021-03-01 13:17:48 +03:00
Naisila Puka fc08ec203f Fix insert query with CTEs/sublinks/subqueries etc (#4700)
* Fix insert query with CTE

* Add more cases with deferred pruning but false fast path

* Add more tests

* Better readability with if statements

(cherry picked from commit dbb88f6f8b)
2021-03-01 12:16:40 +03:00
Hadi Moshayedi 495470d291 Fix alignment issue in DatumToBytea
(cherry picked from commit 2fca5ff3b5)
2021-03-01 12:07:46 +03:00
SaitTalhaNisanci 39a142b4d9 Use PROCESS_UTILITY_QUERY in utility calls
When we use PROCESS_UTILITY_TOPLEVEL it causes some problems when
combined with other extensions such as pg_audit. With this commit we use
PROCESS_UTILITY_QUERY in the codebase to fix those problems.

(cherry picked from commit dcf54eaf2a)
2021-03-01 11:49:44 +03:00
Onur Tirtir ca4b529751 Bump version to 10.0.1 2021-02-19 12:05:56 +03:00
Onur Tirtir e48f5d804d Update CHANGELOG for 10.0.1
(cherry picked from commit 9031a22e20)

 Conflicts:
	CHANGELOG.md
2021-02-19 12:05:49 +03:00
Marco Slot 85e2c6b523 Rewrite time_partitions join clause to avoid smallint[] operator
(cherry picked from commit 972a8bc0b7)
2021-02-19 11:25:00 +03:00
Onur Tirtir 2a390b4c1d Bump Citus to 10.0.0 2021-02-16 14:39:24 +03:00
221 changed files with 11229 additions and 1482 deletions


@ -365,7 +365,7 @@ jobs:
when: on_fail
- store_artifacts:
name: 'Save tap logs'
path: /home/circleci/project/src/test/recovery/tmp_check/log
path: /home/circleci/project/src/test/<< parameters.suite >>/tmp_check/log
when: on_fail
- store_artifacts:
name: 'Save core dumps'
@ -552,6 +552,12 @@ workflows:
image_tag: '12.4'
suite: recovery
requires: [build-12]
- tap-test-citus:
name: 'test-12_tap-columnar-freezing'
pg_major: 12
image_tag: '12.4'
suite: columnar_freezing
requires: [build-12]
- test-citus:
name: 'test-12_check-failure'
pg_major: 12
@ -620,6 +626,12 @@ workflows:
image_tag: '13.0'
suite: recovery
requires: [build-13]
- tap-test-citus:
name: 'test-13_tap-columnar-freezing'
pg_major: 13
image_tag: '13.0'
suite: columnar_freezing
requires: [build-13]
- test-citus:
name: 'test-13_check-failure'
pg_major: 13


@ -1,3 +1,129 @@
### citus v10.0.8 (April 20, 2023) ###

* Fixes a bug that could break `DROP SCHEMA/EXTENSION` commands when there is
  a columnar table (#5458)

* Fixes a crash that occurs when an aggregate that cannot be pushed down
  returns an empty result from a worker (#5679)

* Fixes a columnar freezing/wraparound bug (#5962)

* Fixes a memory leak issue with query results that return a single row
  (#6724)

* Prevents alter table functions from dropping extensions (#5974)

### citus v10.0.6 (November 12, 2021) ###

* Adds missing version checks for columnar tables

* Fixes a bug that caused `worker_append_table_to_shard` to write as superuser

* Fixes a bug with local cached plans on tables with dropped columns

* Fixes a missing `FROM` clause entry error

* Fixes a use-after-free issue that could happen when altering a distributed
  table

* Reinstates optimisation for uniform shard interval ranges

### citus v10.0.5 (August 16, 2021) ###

* Allows more graceful failovers when replication factor > 1

* Fixes a bug that causes partitions to have the wrong distribution key after
  `DROP COLUMN`

* Improves `citus_update_table_statistics` and provides distributed deadlock
  detection

### citus v10.0.4 (July 14, 2021) ###

* Introduces `citus.local_hostname` GUC for connections to the current node

* Removes dependencies on the existence of the public schema

* Removes limits around long partition names

* Fixes a bug that can cause a crash when DEBUG4 logging is enabled

* Fixes a bug that causes pruning the wrong shard of a range distributed table

* Fixes an issue that could cause `citus_finish_pg_upgrade` to fail

* Fixes `FROM ONLY` queries on partitioned tables

* Fixes issues caused by the public schema being omitted in queries

* Fixes problems with concurrent calls of `DropMarkedShards`

* Fixes a relname null bug when using parallel execution

* Fixes two race conditions in `get_rebalance_progress`

### citus v10.0.3 (March 16, 2021) ###

* Prevents infinite recursion for queries that involve `UNION ALL`
  below `JOIN`

* Fixes a crash in queries with a modifying `CTE` and a `SELECT`
  without `FROM`

* Fixes upgrade and downgrade paths for `citus_update_table_statistics`

* Fixes a bug that causes `SELECT` queries to use 2PC unnecessarily

* Fixes a bug that might cause self-deadlocks with
  `CREATE INDEX` / `REINDEX CONCURRENTLY` commands

* Adds `citus.max_cached_connection_lifetime` GUC to set maximum connection
  lifetime

* Adds `citus.remote_copy_flush_threshold` GUC that controls
  per-shard memory usage by `COPY`

* Adds `citus_get_active_worker_nodes` UDF to deprecate
  `master_get_active_worker_nodes`

* Skips 2PC for read-only connections in a transaction

* Makes sure that local execution starts a coordinated transaction

* Removes the open temporary file warning when cancelling a query with
  an open tuple store

* Relaxes the locks when adding an existing node

### citus v10.0.2 (March 3, 2021) ###

* Adds a configure flag to enforce security

* Fixes a bug due to cross join without a target list

* Fixes a bug with `UNION ALL` on PG 13

* Fixes a compatibility issue with pg_audit in utility calls

* Fixes insert queries with CTEs/sublinks/subqueries etc.

* Grants `SELECT` permission on the `citus_tables` view to `public`

* Grants `SELECT` permission on columnar metadata tables to `public`

* Improves `citus_update_table_statistics` and provides distributed deadlock
  detection

* Preserves colocation with procedures in `alter_distributed_table`

* Prevents using `alter_columnar_table_set` and `alter_columnar_table_reset`
  on a columnar table not owned by the user

* Removes limits around long table names

### citus v10.0.1 (February 19, 2021) ###

* Fixes an issue in the creation of the `pg_catalog.time_partitions` view

### citus v10.0.0 (February 16, 2021) ###

* Adds support for per-table option for columnar storage


@ -86,6 +86,7 @@ endif
# Add options passed to configure or computed therein, to CFLAGS/CPPFLAGS/...
override CFLAGS += @CFLAGS@ @CITUS_CFLAGS@
override BITCODE_CFLAGS := $(BITCODE_CFLAGS) @CITUS_BITCODE_CFLAGS@
ifneq ($(GIT_VERSION),)
override CFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\"
endif

configure vendored

@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for Citus 10.0devel.
# Generated by GNU Autoconf 2.69 for Citus 10.0.8.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@ -579,8 +579,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='Citus'
PACKAGE_TARNAME='citus'
PACKAGE_VERSION='10.0devel'
PACKAGE_STRING='Citus 10.0devel'
PACKAGE_VERSION='10.0.8'
PACKAGE_STRING='Citus 10.0.8'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@ -628,8 +628,10 @@ POSTGRES_BUILDDIR
POSTGRES_SRCDIR
CITUS_LDFLAGS
CITUS_CPPFLAGS
CITUS_BITCODE_CFLAGS
CITUS_CFLAGS
GIT_BIN
with_security_flags
with_zstd
with_lz4
EGREP
@ -696,6 +698,7 @@ with_libcurl
with_reports_hostname
with_lz4
with_zstd
with_security_flags
'
ac_precious_vars='build_alias
host_alias
@ -1258,7 +1261,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures Citus 10.0devel to adapt to many kinds of systems.
\`configure' configures Citus 10.0.8 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1320,7 +1323,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of Citus 10.0devel:";;
short | recursive ) echo "Configuration of Citus 10.0.8:";;
esac
cat <<\_ACEOF
@ -1342,6 +1345,7 @@ Optional Packages:
and update checks
--without-lz4 do not use lz4
--without-zstd do not use zstd
--with-security-flags use security flags
Some influential environment variables:
PG_CONFIG Location to find pg_config for target PostgreSQL instalation
@ -1422,7 +1426,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
Citus configure 10.0devel
Citus configure 10.0.8
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@ -1905,7 +1909,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by Citus $as_me 10.0devel, which was
It was created by Citus $as_me 10.0.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@ -4346,6 +4350,48 @@ if test x"$citusac_cv_prog_cc_cflags__Werror_return_type" = x"yes"; then
CITUS_CFLAGS="$CITUS_CFLAGS -Werror=return-type"
fi
# Security flags
# Flags taken from: https://liquid.microsoft.com/Web/Object/Read/ms.security/Requirements/Microsoft.Security.SystemsADM.10203#guide
# We do not enforce the following flag because it is only available on GCC>=8
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC supports -fstack-clash-protection" >&5
$as_echo_n "checking whether $CC supports -fstack-clash-protection... " >&6; }
if ${citusac_cv_prog_cc_cflags__fstack_clash_protection+:} false; then :
$as_echo_n "(cached) " >&6
else
citusac_save_CFLAGS=$CFLAGS
flag=-fstack-clash-protection
case $flag in -Wno*)
flag=-W$(echo $flag | cut -c 6-)
esac
CFLAGS="$citusac_save_CFLAGS $flag"
ac_save_c_werror_flag=$ac_c_werror_flag
ac_c_werror_flag=yes
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
citusac_cv_prog_cc_cflags__fstack_clash_protection=yes
else
citusac_cv_prog_cc_cflags__fstack_clash_protection=no
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
ac_c_werror_flag=$ac_save_c_werror_flag
CFLAGS="$citusac_save_CFLAGS"
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $citusac_cv_prog_cc_cflags__fstack_clash_protection" >&5
$as_echo "$citusac_cv_prog_cc_cflags__fstack_clash_protection" >&6; }
if test x"$citusac_cv_prog_cc_cflags__fstack_clash_protection" = x"yes"; then
CITUS_CFLAGS="$CITUS_CFLAGS -fstack-clash-protection"
fi
#
# --enable-coverage enables generation of code coverage metrics with gcov
@ -4493,8 +4539,8 @@ if test "$version_num" != '11'; then
$as_echo "#define HAS_TABLEAM 1" >>confdefs.h
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: postgres version does not support table access methodds" >&5
$as_echo "$as_me: postgres version does not support table access methodds" >&6;}
{ $as_echo "$as_me:${as_lineno-$LINENO}: postgres version does not support table access methods" >&5
$as_echo "$as_me: postgres version does not support table access methods" >&6;}
fi;
# Require lz4 & zstd only if we are compiling columnar
@ -4687,6 +4733,55 @@ fi
fi # test "$HAS_TABLEAM" == 'yes'
# Check whether --with-security-flags was given.
if test "${with_security_flags+set}" = set; then :
withval=$with_security_flags;
case $withval in
yes)
:
;;
no)
:
;;
*)
as_fn_error $? "no argument expected for --with-security-flags option" "$LINENO" 5
;;
esac
else
with_security_flags=no
fi
if test "$with_security_flags" = yes; then
# Flags taken from: https://liquid.microsoft.com/Web/Object/Read/ms.security/Requirements/Microsoft.Security.SystemsADM.10203#guide
# We always want to have some compiler flags for security concerns.
SECURITY_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -z noexecstack -fpic -shared -Wl,-z,relro -Wl,-z,now -Wformat -Wformat-security -Werror=format-security"
CITUS_CFLAGS="$CITUS_CFLAGS $SECURITY_CFLAGS"
{ $as_echo "$as_me:${as_lineno-$LINENO}: Blindly added security flags for linker: $SECURITY_CFLAGS" >&5
$as_echo "$as_me: Blindly added security flags for linker: $SECURITY_CFLAGS" >&6;}
# We always want to have some clang flags for security concerns.
# This doesn't include "-Wl,-z,relro -Wl,-z,now" on purpuse, because bitcode is not linked.
# This doesn't include -fsanitize=cfi because it breaks builds on many distros including
# Debian/Buster, Debian/Stretch, Ubuntu/Bionic, Ubuntu/Xenial and EL7.
SECURITY_BITCODE_CFLAGS="-fsanitize=safe-stack -fstack-protector-strong -flto -fPIC -Wformat -Wformat-security -Werror=format-security"
CITUS_BITCODE_CFLAGS="$CITUS_BITCODE_CFLAGS $SECURITY_BITCODE_CFLAGS"
{ $as_echo "$as_me:${as_lineno-$LINENO}: Blindly added security flags for llvm: $SECURITY_BITCODE_CFLAGS" >&5
$as_echo "$as_me: Blindly added security flags for llvm: $SECURITY_BITCODE_CFLAGS" >&6;}
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: If you run into issues during linking or bitcode compilation, you can use --without-security-flags." >&5
$as_echo "$as_me: WARNING: If you run into issues during linking or bitcode compilation, you can use --without-security-flags." >&2;}
fi
# Check if git is installed, when installed the gitref of the checkout will be baked in the application
# Extract the first word of "git", so it can be a program name with args.
set dummy git; ac_word=$2
@ -4752,6 +4847,8 @@ fi
CITUS_CFLAGS="$CITUS_CFLAGS"
CITUS_BITCODE_CFLAGS="$CITUS_BITCODE_CFLAGS"
CITUS_CPPFLAGS="$CITUS_CPPFLAGS"
CITUS_LDFLAGS="$LIBS $CITUS_LDFLAGS"
@ -5276,7 +5373,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by Citus $as_me 10.0devel, which was
This file was extended by Citus $as_me 10.0.8, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@ -5338,7 +5435,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
Citus config.status 10.0devel
Citus config.status 10.0.8
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"


@ -5,7 +5,7 @@
# everyone needing autoconf installed, the resulting files are checked
# into the SCM.
AC_INIT([Citus], [10.0devel])
AC_INIT([Citus], [10.0.8])
AC_COPYRIGHT([Copyright (c) Citus Data, Inc.])
# we'll need sed and awk for some of the version commands
@ -174,6 +174,10 @@ CITUSAC_PROG_CC_CFLAGS_OPT([-Werror=vla]) # visual studio does not support thes
CITUSAC_PROG_CC_CFLAGS_OPT([-Werror=implicit-int])
CITUSAC_PROG_CC_CFLAGS_OPT([-Werror=implicit-function-declaration])
CITUSAC_PROG_CC_CFLAGS_OPT([-Werror=return-type])
# Security flags
# Flags taken from: https://liquid.microsoft.com/Web/Object/Read/ms.security/Requirements/Microsoft.Security.SystemsADM.10203#guide
# We do not enforce the following flag because it is only available on GCC>=8
CITUSAC_PROG_CC_CFLAGS_OPT([-fstack-clash-protection])
#
# --enable-coverage enables generation of code coverage metrics with gcov
@ -216,7 +220,7 @@ if test "$version_num" != '11'; then
HAS_TABLEAM=yes
AC_DEFINE([HAS_TABLEAM], 1, [Define to 1 to build with table access method support, pg12 and up])
else
AC_MSG_NOTICE([postgres version does not support table access methodds])
AC_MSG_NOTICE([postgres version does not support table access methods])
fi;
# Require lz4 & zstd only if we are compiling columnar
@ -261,11 +265,36 @@ if test "$HAS_TABLEAM" == 'yes'; then
fi # test "$HAS_TABLEAM" == 'yes'
PGAC_ARG_BOOL(with, security-flags, no,
[use security flags])
AC_SUBST(with_security_flags)
if test "$with_security_flags" = yes; then
# Flags taken from: https://liquid.microsoft.com/Web/Object/Read/ms.security/Requirements/Microsoft.Security.SystemsADM.10203#guide
# We always want to have some compiler flags for security concerns.
SECURITY_CFLAGS="-fstack-protector-strong -D_FORTIFY_SOURCE=2 -O2 -z noexecstack -fpic -shared -Wl,-z,relro -Wl,-z,now -Wformat -Wformat-security -Werror=format-security"
CITUS_CFLAGS="$CITUS_CFLAGS $SECURITY_CFLAGS"
AC_MSG_NOTICE([Blindly added security flags for linker: $SECURITY_CFLAGS])
# We always want to have some clang flags for security concerns.
# This doesn't include "-Wl,-z,relro -Wl,-z,now" on purpuse, because bitcode is not linked.
# This doesn't include -fsanitize=cfi because it breaks builds on many distros including
# Debian/Buster, Debian/Stretch, Ubuntu/Bionic, Ubuntu/Xenial and EL7.
SECURITY_BITCODE_CFLAGS="-fsanitize=safe-stack -fstack-protector-strong -flto -fPIC -Wformat -Wformat-security -Werror=format-security"
CITUS_BITCODE_CFLAGS="$CITUS_BITCODE_CFLAGS $SECURITY_BITCODE_CFLAGS"
AC_MSG_NOTICE([Blindly added security flags for llvm: $SECURITY_BITCODE_CFLAGS])
AC_MSG_WARN([If you run into issues during linking or bitcode compilation, you can use --without-security-flags.])
fi
# Check if git is installed, when installed the gitref of the checkout will be baked in the application
AC_PATH_PROG(GIT_BIN, git)
AC_CHECK_FILE(.git,[HAS_DOTGIT=yes], [HAS_DOTGIT=])
AC_SUBST(CITUS_CFLAGS, "$CITUS_CFLAGS")
AC_SUBST(CITUS_BITCODE_CFLAGS, "$CITUS_BITCODE_CFLAGS")
AC_SUBST(CITUS_CPPFLAGS, "$CITUS_CPPFLAGS")
AC_SUBST(CITUS_LDFLAGS, "$LIBS $CITUS_LDFLAGS")
AC_SUBST(POSTGRES_SRCDIR, "$POSTGRES_SRCDIR")


@ -311,8 +311,13 @@ DeleteColumnarTableOptions(Oid regclass, bool missingOk)
*/
Assert(!IsBinaryUpgrade);
Relation columnarOptions = relation_open(ColumnarOptionsRelationId(),
RowExclusiveLock);
Relation columnarOptions = try_relation_open(ColumnarOptionsRelationId(),
RowExclusiveLock);
if (columnarOptions == NULL)
{
/* extension has been dropped */
return false;
}
/* find existing item to remove */
ScanKeyData scanKey[1] = { 0 };
@ -1087,7 +1092,11 @@ DatumToBytea(Datum value, Form_pg_attribute attrForm)
{
if (attrForm->attbyval)
{
store_att_byval(VARDATA(result), value, attrForm->attlen);
Datum tmp;
store_att_byval(&tmp, value, attrForm->attlen);
memcpy_s(VARDATA(result), datumLength + VARHDRSZ,
&tmp, attrForm->attlen);
}
else
{


@ -29,6 +29,7 @@
#else
#include "optimizer/clauses.h"
#include "optimizer/predtest.h"
#include "optimizer/var.h"
#endif
#include "optimizer/restrictinfo.h"
#include "storage/fd.h"
@ -62,6 +63,8 @@ struct TableReadState
List *projectedColumnList;
List *whereClauseList;
List *whereClauseVars;
MemoryContext stripeReadContext;
StripeBuffers *stripeBuffers;
uint32 readStripeCount;
@ -77,6 +80,7 @@ static StripeBuffers * LoadFilteredStripeBuffers(Relation relation,
TupleDesc tupleDescriptor,
List *projectedColumnList,
List *whereClauseList,
List *whereClauseVars,
int64 *chunkGroupsFiltered);
static void ReadStripeNextRow(StripeBuffers *stripeBuffers, List *projectedColumnList,
uint64 chunkIndex, uint64 chunkRowIndex,
@ -87,10 +91,11 @@ static ColumnBuffers * LoadColumnBuffers(Relation relation,
uint32 chunkCount, uint64 stripeOffset,
Form_pg_attribute attributeForm);
static bool * SelectedChunkMask(StripeSkipList *stripeSkipList,
List *projectedColumnList, List *whereClauseList,
List *whereClauseList, List *whereClauseVars,
int64 *chunkGroupsFiltered);
static List * BuildRestrictInfoList(List *whereClauseList);
static Node * BuildBaseConstraint(Var *variable);
static List * GetClauseVars(List *clauses, int natts);
static OpExpr * MakeOpExpression(Var *variable, int16 strategyNumber);
static Oid GetOperatorByType(Oid typeId, Oid accessMethodId, int16 strategyNumber);
static void UpdateConstraint(Node *baseConstraint, Datum minValue, Datum maxValue);
@ -142,6 +147,7 @@ ColumnarBeginRead(Relation relation, TupleDesc tupleDescriptor,
readState->stripeList = stripeList;
readState->projectedColumnList = projectedColumnList;
readState->whereClauseList = whereClauseList;
readState->whereClauseVars = GetClauseVars(whereClauseList, tupleDescriptor->natts);
readState->stripeBuffers = NULL;
readState->readStripeCount = 0;
readState->stripeReadRowCount = 0;
@ -218,6 +224,8 @@ ColumnarReadNextRow(TableReadState *readState, Datum *columnValues, bool *column
projectedColumnList,
readState->
whereClauseList,
readState->
whereClauseVars,
&readState->
chunkGroupsFiltered);
readState->readStripeCount++;
@ -400,7 +408,8 @@ ColumnarTableRowCount(Relation relation)
static StripeBuffers *
LoadFilteredStripeBuffers(Relation relation, StripeMetadata *stripeMetadata,
TupleDesc tupleDescriptor, List *projectedColumnList,
List *whereClauseList, int64 *chunkGroupsFiltered)
List *whereClauseList, List *whereClauseVars,
int64 *chunkGroupsFiltered)
{
uint32 columnIndex = 0;
uint32 columnCount = tupleDescriptor->natts;
@ -412,8 +421,8 @@ LoadFilteredStripeBuffers(Relation relation, StripeMetadata *stripeMetadata,
tupleDescriptor,
stripeMetadata->chunkCount);
bool *selectedChunkMask = SelectedChunkMask(stripeSkipList, projectedColumnList,
whereClauseList, chunkGroupsFiltered);
bool *selectedChunkMask = SelectedChunkMask(stripeSkipList, whereClauseList,
whereClauseVars, chunkGroupsFiltered);
StripeSkipList *selectedChunkSkipList =
SelectedChunkSkipList(stripeSkipList, projectedColumnMask,
@ -551,8 +560,8 @@ LoadColumnBuffers(Relation relation, ColumnChunkSkipNode *chunkSkipNodeArray,
* the chunk can be refuted by the given qualifier conditions.
*/
static bool *
SelectedChunkMask(StripeSkipList *stripeSkipList, List *projectedColumnList,
List *whereClauseList, int64 *chunkGroupsFiltered)
SelectedChunkMask(StripeSkipList *stripeSkipList, List *whereClauseList,
List *whereClauseVars, int64 *chunkGroupsFiltered)
{
ListCell *columnCell = NULL;
uint32 chunkIndex = 0;
@ -561,7 +570,7 @@ SelectedChunkMask(StripeSkipList *stripeSkipList, List *projectedColumnList,
bool *selectedChunkMask = palloc0(stripeSkipList->chunkCount * sizeof(bool));
memset(selectedChunkMask, true, stripeSkipList->chunkCount * sizeof(bool));
foreach(columnCell, projectedColumnList)
foreach(columnCell, whereClauseVars)
{
Var *column = lfirst(columnCell);
uint32 columnIndex = column->varattno - 1;
@ -693,6 +702,58 @@ BuildBaseConstraint(Var *variable)
}
/*
* GetClauseVars extracts the Vars from the given clauses for the purpose of
* building constraints that can be refuted by predicate_refuted_by(). It also
* deduplicates and sorts them.
*/
static List *
GetClauseVars(List *whereClauseList, int natts)
{
/*
* We don't recurse into or include aggregates, window functions, or
* PHVs. We don't expect any PHVs during execution; and Vars found inside
* an aggregate or window function aren't going to be useful in forming
* constraints that can be refuted.
*/
int flags = 0;
List *vars = pull_var_clause((Node *) whereClauseList, flags);
Var **deduplicate = palloc0(sizeof(Var *) * natts);
ListCell *lc;
foreach(lc, vars)
{
Node *node = lfirst(lc);
Assert(IsA(node, Var));
Var *var = (Var *) node;
int idx = var->varattno - 1;
if (deduplicate[idx] != NULL)
{
/* if they have the same varattno, the rest should be identical */
Assert(equal(var, deduplicate[idx]));
}
deduplicate[idx] = var;
}
List *whereClauseVars = NIL;
for (int i = 0; i < natts; i++)
{
Var *var = deduplicate[i];
if (var != NULL)
{
whereClauseVars = lappend(whereClauseVars, var);
}
}
pfree(deduplicate);
return whereClauseVars;
}
/*
* MakeOpExpression builds an operator expression node. This operator expression
* implements the operator clause as defined by the variable and the strategy


@ -160,6 +160,8 @@ columnar_beginscan(Relation relation, Snapshot snapshot,
ParallelTableScanDesc parallel_scan,
uint32 flags)
{
CheckCitusVersion(ERROR);
int natts = relation->rd_att->natts;
Bitmapset *attr_needed = NULL;
@ -419,6 +421,8 @@ static bool
columnar_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
Snapshot snapshot)
{
CheckCitusVersion(ERROR);
return true;
}
@ -436,6 +440,8 @@ static void
columnar_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
int options, BulkInsertState bistate)
{
CheckCitusVersion(ERROR);
/*
* columnar_init_write_state allocates the write state in a longer
* lasting context, so no need to worry about it.
@ -481,6 +487,8 @@ static void
columnar_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
CommandId cid, int options, BulkInsertState bistate)
{
CheckCitusVersion(ERROR);
TableWriteState *writeState = columnar_init_write_state(relation,
RelationGetDescr(relation),
GetCurrentSubTransactionId());
@ -552,6 +560,8 @@ columnar_relation_set_new_filenode(Relation rel,
TransactionId *freezeXid,
MultiXactId *minmulti)
{
CheckCitusVersion(ERROR);
if (persistence != RELPERSISTENCE_PERMANENT)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
@ -581,6 +591,8 @@ columnar_relation_set_new_filenode(Relation rel,
static void
columnar_relation_nontransactional_truncate(Relation rel)
{
CheckCitusVersion(ERROR);
RelFileNode relfilenode = rel->rd_node;
NonTransactionDropWriteState(relfilenode.relNode);
@ -625,6 +637,8 @@ columnar_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
double *tups_vacuumed,
double *tups_recently_dead)
{
CheckCitusVersion(ERROR);
TupleDesc sourceDesc = RelationGetDescr(OldHeap);
TupleDesc targetDesc = RelationGetDescr(NewHeap);
@ -670,6 +684,27 @@ columnar_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
}
/*
* ColumnarTableTupleCount returns the number of tuples that columnar
* table with relationId has by using stripe metadata.
*/
static uint64
ColumnarTableTupleCount(Relation relation)
{
List *stripeList = StripesForRelfilenode(relation->rd_node);
uint64 tupleCount = 0;
ListCell *lc = NULL;
foreach(lc, stripeList)
{
StripeMetadata *stripe = lfirst(lc);
tupleCount += stripe->rowCount;
}
return tupleCount;
}
/*
* columnar_vacuum_rel implements VACUUM without FULL option.
*/
@ -677,6 +712,18 @@ static void
columnar_vacuum_rel(Relation rel, VacuumParams *params,
BufferAccessStrategy bstrategy)
{
if (!CheckCitusVersion(WARNING))
{
/*
* Skip if the extension catalogs are not up-to-date, but avoid
* erroring during auto-vacuum.
*/
return;
}
pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
RelationGetRelid(rel));
int elevel = (params->options & VACOPT_VERBOSE) ? INFO : DEBUG2;
/* this should have been resolved by vacuum.c until now */
@ -692,6 +739,52 @@ columnar_vacuum_rel(Relation rel, VacuumParams *params,
{
TruncateColumnar(rel, elevel);
}
RelationOpenSmgr(rel);
BlockNumber new_rel_pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
/* get the number of indexes */
List *indexList = RelationGetIndexList(rel);
int nindexes = list_length(indexList);
TransactionId oldestXmin;
TransactionId freezeLimit;
MultiXactId multiXactCutoff;
/* initialize xids */
TransactionId xidFullScanLimit;
MultiXactId mxactFullScanLimit;
vacuum_set_xid_limits(rel,
params->freeze_min_age,
params->freeze_table_age,
params->multixact_freeze_min_age,
params->multixact_freeze_table_age,
&oldestXmin, &freezeLimit, &xidFullScanLimit,
&multiXactCutoff, &mxactFullScanLimit);
Assert(TransactionIdPrecedesOrEquals(freezeLimit, oldestXmin));
/*
* Columnar storage doesn't hold any transaction IDs, so we can always
* just advance to the most aggressive value.
*/
TransactionId newRelFrozenXid = oldestXmin;
MultiXactId newRelminMxid = multiXactCutoff;
double new_live_tuples = ColumnarTableTupleCount(rel);
/* all visible pages are always 0 */
BlockNumber new_rel_allvisible = 0;
vac_update_relstats(rel, new_rel_pages, new_live_tuples,
new_rel_allvisible, nindexes > 0,
newRelFrozenXid, newRelminMxid, false);
pgstat_report_vacuum(RelationGetRelid(rel),
rel->rd_rel->relisshared,
Max(new_live_tuples, 0),
0);
pgstat_progress_end_command();
}
@ -1006,6 +1099,8 @@ columnar_index_validate_scan(Relation heapRelation,
static uint64
columnar_relation_size(Relation rel, ForkNumber forkNumber)
{
CheckCitusVersion(ERROR);
uint64 nblocks = 0;
/* Open it at the smgr level if not already done */
@ -1031,6 +1126,8 @@ columnar_relation_size(Relation rel, ForkNumber forkNumber)
static bool
columnar_relation_needs_toast_table(Relation rel)
{
CheckCitusVersion(ERROR);
return false;
}
@ -1040,6 +1137,8 @@ columnar_estimate_rel_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples,
double *allvisfrac)
{
CheckCitusVersion(ERROR);
RelationOpenSmgr(rel);
*pages = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
*tuples = ColumnarTableRowCount(rel);
@ -1218,6 +1317,8 @@ ColumnarTableDropHook(Oid relid)
if (IsColumnarTableAmTable(relid))
{
CheckCitusVersion(ERROR);
/*
* Drop metadata. No need to drop storage here since for
* tableam tables storage is managed by postgres.
@ -1653,6 +1754,8 @@ PG_FUNCTION_INFO_V1(alter_columnar_table_set);
Datum
alter_columnar_table_set(PG_FUNCTION_ARGS)
{
CheckCitusVersion(ERROR);
Oid relationId = PG_GETARG_OID(0);
Relation rel = table_open(relationId, AccessExclusiveLock); /* ALTER TABLE LOCK */
@ -1662,6 +1765,8 @@ alter_columnar_table_set(PG_FUNCTION_ARGS)
quote_identifier(RelationGetRelationName(rel)))));
}
EnsureTableOwner(relationId);
ColumnarOptions options = { 0 };
if (!ReadColumnarOptions(relationId, &options))
{
@ -1760,6 +1865,8 @@ PG_FUNCTION_INFO_V1(alter_columnar_table_reset);
Datum
alter_columnar_table_reset(PG_FUNCTION_ARGS)
{
CheckCitusVersion(ERROR);
Oid relationId = PG_GETARG_OID(0);
Relation rel = table_open(relationId, AccessExclusiveLock); /* ALTER TABLE LOCK */
@ -1769,6 +1876,8 @@ alter_columnar_table_reset(PG_FUNCTION_ARGS)
quote_identifier(RelationGetRelationName(rel)))));
}
EnsureTableOwner(relationId);
ColumnarOptions options = { 0 };
if (!ReadColumnarOptions(relationId, &options))
{


@ -0,0 +1,5 @@
/* columnar--10.0-1--10.0-2.sql */
-- grant read access for columnar metadata tables to unprivileged user
GRANT USAGE ON SCHEMA columnar TO PUBLIC;
GRANT SELECT ON ALL tables IN SCHEMA columnar TO PUBLIC ;


@ -0,0 +1,5 @@
/* columnar--10.0-2--10.0-1.sql */
-- revoke read access for columnar metadata tables from unprivileged user
REVOKE USAGE ON SCHEMA columnar FROM PUBLIC;
REVOKE SELECT ON ALL tables IN SCHEMA columnar FROM PUBLIC;


@ -1,6 +1,6 @@
# Citus extension
comment = 'Citus distributed database'
default_version = '10.0-1'
default_version = '10.0-4'
module_pathname = '$libdir/citus'
relocatable = false
schema = pg_catalog


@ -29,9 +29,12 @@
#include "fmgr.h"
#include "access/hash.h"
#include "access/htup_details.h"
#include "access/xact.h"
#include "catalog/dependency.h"
#include "catalog/pg_am.h"
#include "catalog/pg_depend.h"
#include "catalog/pg_rewrite_d.h"
#include "columnar/columnar.h"
#include "columnar/columnar_tableam.h"
#include "distributed/colocation_utils.h"
@ -43,12 +46,15 @@
#include "distributed/listutils.h"
#include "distributed/local_executor.h"
#include "distributed/metadata/dependency.h"
#include "distributed/metadata/distobject.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_sync.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/reference_table_utils.h"
#include "distributed/relation_access_tracking.h"
#include "distributed/shard_utils.h"
#include "distributed/worker_protocol.h"
#include "distributed/worker_transaction.h"
#include "executor/spi.h"
@ -175,6 +181,8 @@ static TableConversionReturn * AlterDistributedTable(TableConversionParameters *
static TableConversionReturn * AlterTableSetAccessMethod(
TableConversionParameters *params);
static TableConversionReturn * ConvertTable(TableConversionState *con);
static bool SwitchToSequentialAndLocalExecutionIfShardNameTooLong(char *relationName,
char *longestShardName);
static void EnsureTableNotReferencing(Oid relationId, char conversionType);
static void EnsureTableNotReferenced(Oid relationId, char conversionType);
static void EnsureTableNotForeign(Oid relationId);
@ -198,6 +206,8 @@ static bool WillRecreateForeignKeyToReferenceTable(Oid relationId,
CascadeToColocatedOption cascadeOption);
static void WarningsForDroppingForeignKeysWithDistributedTables(Oid relationId);
static void ExecuteQueryViaSPI(char *query, int SPIOK);
static void ErrorIfUnsupportedCascadeObjects(Oid relationId);
static bool DoesCascadeDropUnsupportedObject(Oid classId, Oid id, HTAB *nodeMap);
PG_FUNCTION_INFO_V1(undistribute_table);
PG_FUNCTION_INFO_V1(alter_distributed_table);
@ -375,6 +385,8 @@ UndistributeTable(TableConversionParameters *params)
ErrorIfAnyPartitionRelationInvolvedInNonInheritedFKey(partitionList);
}
ErrorIfUnsupportedCascadeObjects(params->relationId);
params->conversionType = UNDISTRIBUTE_TABLE;
params->shardCountIsNull = true;
TableConversionState *con = CreateTableConversion(params);
@ -406,6 +418,8 @@ AlterDistributedTable(TableConversionParameters *params)
EnsureTableNotPartition(params->relationId);
EnsureHashDistributedTable(params->relationId);
ErrorIfUnsupportedCascadeObjects(params->relationId);
params->conversionType = ALTER_DISTRIBUTED_TABLE;
TableConversionState *con = CreateTableConversion(params);
CheckAlterDistributedTableConversionParameters(con);
@ -467,6 +481,8 @@ AlterTableSetAccessMethod(TableConversionParameters *params)
}
}
ErrorIfUnsupportedCascadeObjects(params->relationId);
params->conversionType = ALTER_TABLE_SET_ACCESS_METHOD;
params->shardCountIsNull = true;
TableConversionState *con = CreateTableConversion(params);
@ -511,6 +527,10 @@ ConvertTable(TableConversionState *con)
bool oldEnableLocalReferenceForeignKeys = EnableLocalReferenceForeignKeys;
SetLocalEnableLocalReferenceForeignKeys(false);
/* switch to sequential execution if shard names will be too long */
SwitchToSequentialAndLocalExecutionIfRelationNameTooLong(con->relationId,
con->relationName);
if (con->conversionType == UNDISTRIBUTE_TABLE && con->cascadeViaForeignKeys &&
(TableReferencing(con->relationId) || TableReferenced(con->relationId)))
{
@ -673,7 +693,7 @@ ConvertTable(TableConversionState *con)
Node *parseTree = ParseTreeNode(tableCreationSql);
RelayEventExtendNames(parseTree, con->schemaName, con->hashOfName);
ProcessUtilityParseTree(parseTree, tableCreationSql, PROCESS_UTILITY_TOPLEVEL,
ProcessUtilityParseTree(parseTree, tableCreationSql, PROCESS_UTILITY_QUERY,
NULL, None_Receiver, NULL);
}
@ -711,6 +731,32 @@ ConvertTable(TableConversionState *con)
CreateCitusTableLike(con);
}
/* preserve colocation with procedures/functions */
if (con->conversionType == ALTER_DISTRIBUTED_TABLE)
{
/*
* Updating the colocationId of functions is always desirable for
* the following scenario:
* we have a shardCount or colocateWith change
* AND the entire co-location group is altered.
* The reason for the second condition is that we currently don't
* remember the original table specified in colocateWith when
* distributing the function; we only remember the colocationId in
* the pg_dist_object table.
*/
if ((!con->shardCountIsNull || con->colocateWith != NULL) &&
(con->cascadeToColocated == CASCADE_TO_COLOCATED_YES || list_length(
con->colocatedTableList) == 1) && con->distributionColumn == NULL)
{
/*
* Update the colocationId from that of the old relation to that of
* the new relation for all tuples in citus.pg_dist_object.
*/
UpdateDistributedObjectColocationId(TableColocationId(con->relationId),
TableColocationId(con->newRelationId));
}
}
ReplaceTable(con->relationId, con->newRelationId, justBeforeDropCommands,
con->suppressNoticeMessages);
@ -728,7 +774,7 @@ ConvertTable(TableConversionState *con)
Node *parseTree = ParseTreeNode(attachPartitionCommand);
ProcessUtilityParseTree(parseTree, attachPartitionCommand,
PROCESS_UTILITY_TOPLEVEL,
PROCESS_UTILITY_QUERY,
NULL, None_Receiver, NULL);
}
@ -1042,6 +1088,30 @@ CreateDistributedTableLike(TableConversionState *con)
{
newShardCount = con->shardCount;
}
Oid originalRelationId = con->relationId;
if (con->originalDistributionKey != NULL && PartitionTable(originalRelationId))
{
/*
* Due to dropped columns, the partition tables might have different
* distribution keys than their parents; see issue #5123 for details.
*
* At this point, we get the partitioning information from the
* originalRelationId, but we get the distribution key for newRelationId.
*
* We have to do this, because the newRelationId is just a placeholder
* at this moment, but that's going to be the table in pg_dist_partition.
*/
Oid parentRelationId = PartitionParentOid(originalRelationId);
Var *parentDistKey = DistPartitionKey(parentRelationId);
char *parentDistKeyColumnName =
ColumnToColumnName(parentRelationId, nodeToString(parentDistKey));
newDistributionKey =
FindColumnWithNameOnTargetRelation(parentRelationId, parentDistKeyColumnName,
con->newRelationId);
}
char partitionMethod = PartitionMethod(con->relationId);
CreateDistributedTable(con->newRelationId, newDistributionKey, partitionMethod,
newShardCount, newColocateWith, false);
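To make the dropped-column scenario concrete, here is a hedged illustration; the table and column names below are made up, not from the patch:

/*
 * Suppose a column was dropped from the parent before the partition existed:
 *
 *   CREATE TABLE events (tenant_id int, junk int, payload text)
 *       PARTITION BY RANGE (tenant_id);
 *   ALTER TABLE events DROP COLUMN junk;
 *   CREATE TABLE events_p1 PARTITION OF events
 *       FOR VALUES FROM (0) TO (100);
 *
 * The dropped column leaves a hole in the parent, so a column like payload
 * sits at attnum 3 in the parent but attnum 2 in the fresh partition.
 * Matching the distribution key by name, as the code above does, avoids
 * carrying the parent's stale attribute number onto the new relation.
 */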
@ -1077,6 +1147,94 @@ CreateCitusTableLike(TableConversionState *con)
}
/*
* ErrorIfUnsupportedCascadeObjects gets the oid of a relation, finds the objects
* that dropping this relation cascades into, and errors out if any extensions
* would be dropped.
*/
static void
ErrorIfUnsupportedCascadeObjects(Oid relationId)
{
HASHCTL info;
memset(&info, 0, sizeof(info));
info.keysize = sizeof(Oid);
info.entrysize = sizeof(Oid);
info.hash = oid_hash;
uint32 hashFlags = (HASH_ELEM | HASH_FUNCTION);
HTAB *nodeMap = hash_create("object dependency map (oid)", 64, &info, hashFlags);
bool unsupportedObjectInDepGraph =
DoesCascadeDropUnsupportedObject(RelationRelationId, relationId, nodeMap);
if (unsupportedObjectInDepGraph)
{
ereport(ERROR, (errmsg("cannot alter table because an extension depends on it")));
}
}
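A hedged sketch of the scenario the new check guards against; the extension and table names are hypothetical:

/*
 * If any object in the table's DROP cascade is owned by an extension,
 * swapping the table out would drop that extension as well, e.g.:
 *
 *   CREATE EXTENSION some_ext;   -- ships a view defined over dist_table
 *   SELECT undistribute_table('dist_table');
 *   ERROR:  cannot alter table because an extension depends on it
 */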
/*
* DoesCascadeDropUnsupportedObject walks through the objects that depend on the
* object with the given object id and returns true if it finds any unsupported objects.
*
* This function only checks extensions as unsupported objects.
*
* Extension dependency is different from the rest: if an object depends on an extension,
* dropping the object would drop the extension too.
* That is why we check with the IsObjectAddressOwnedByExtension function.
*/
static bool
DoesCascadeDropUnsupportedObject(Oid classId, Oid objectId, HTAB *nodeMap)
{
bool found = false;
hash_search(nodeMap, &objectId, HASH_ENTER, &found);
if (found)
{
return false;
}
ObjectAddress objectAddress = { 0 };
ObjectAddressSet(objectAddress, classId, objectId);
if (IsObjectAddressOwnedByExtension(&objectAddress, NULL))
{
return true;
}
Oid targetObjectClassId = classId;
Oid targetObjectId = objectId;
List *dependencyTupleList = GetPgDependTuplesForDependingObjects(targetObjectClassId,
targetObjectId);
HeapTuple depTup = NULL;
foreach_ptr(depTup, dependencyTupleList)
{
Form_pg_depend pg_depend = (Form_pg_depend) GETSTRUCT(depTup);
Oid dependingOid = InvalidOid;
Oid dependingClassId = InvalidOid;
if (pg_depend->classid == RewriteRelationId)
{
dependingOid = GetDependingView(pg_depend);
dependingClassId = RelationRelationId;
}
else
{
dependingOid = pg_depend->objid;
dependingClassId = pg_depend->classid;
}
if (DoesCascadeDropUnsupportedObject(dependingClassId, dependingOid, nodeMap))
{
return true;
}
}
return false;
}
/*
* GetViewCreationCommandsOfTable takes a table oid and generates the CREATE VIEW
* commands for views that depend on the given table. This includes the views
@ -1134,7 +1292,7 @@ ReplaceTable(Oid sourceId, Oid targetId, List *justBeforeDropCommands,
{
if (!suppressNoticeMessages)
{
ereport(NOTICE, (errmsg("Moving the data of %s",
ereport(NOTICE, (errmsg("moving the data of %s",
quote_qualified_identifier(schemaName, sourceName))));
}
@ -1207,7 +1365,7 @@ ReplaceTable(Oid sourceId, Oid targetId, List *justBeforeDropCommands,
if (!suppressNoticeMessages)
{
ereport(NOTICE, (errmsg("Dropping the old %s",
ereport(NOTICE, (errmsg("dropping the old %s",
quote_qualified_identifier(schemaName, sourceName))));
}
@ -1218,7 +1376,7 @@ ReplaceTable(Oid sourceId, Oid targetId, List *justBeforeDropCommands,
if (!suppressNoticeMessages)
{
ereport(NOTICE, (errmsg("Renaming the new table to %s",
ereport(NOTICE, (errmsg("renaming the new table to %s",
quote_qualified_identifier(schemaName, sourceName))));
}
@ -1572,3 +1730,132 @@ ExecuteQueryViaSPI(char *query, int SPIOK)
ereport(ERROR, (errmsg("could not finish SPI connection")));
}
}
/*
* SwitchToSequentialAndLocalExecutionIfRelationNameTooLong generates the longest shard name
* for the shards of a distributed table and, if it exceeds the limit, switches to sequential
* and local execution to prevent self-deadlocks.
*
* In case of a RENAME, the relation name parameter should store the new table name, so
* that the function can generate the shard names of the renamed relations.
*/
void
SwitchToSequentialAndLocalExecutionIfRelationNameTooLong(Oid relationId,
char *finalRelationName)
{
if (!IsCitusTable(relationId))
{
return;
}
if (ShardIntervalCount(relationId) == 0)
{
/*
* The relation has no shards, so we cannot run into the "long shard
* relation name" issue.
*/
return;
}
char *longestShardName = GetLongestShardName(relationId, finalRelationName);
bool switchedToSequentialAndLocalExecution =
SwitchToSequentialAndLocalExecutionIfShardNameTooLong(finalRelationName,
longestShardName);
if (switchedToSequentialAndLocalExecution)
{
return;
}
if (PartitionedTable(relationId))
{
Oid longestNamePartitionId = PartitionWithLongestNameRelationId(relationId);
if (!OidIsValid(longestNamePartitionId))
{
/* no partitions have been created yet */
return;
}
char *longestPartitionName = get_rel_name(longestNamePartitionId);
char *longestPartitionShardName = NULL;
/*
* Use the shardId values of the partition if it is distributed, otherwise use
* hypothetical values
*/
if (IsCitusTable(longestNamePartitionId) &&
ShardIntervalCount(longestNamePartitionId) > 0)
{
longestPartitionShardName =
GetLongestShardName(longestNamePartitionId, longestPartitionName);
}
else
{
longestPartitionShardName =
GetLongestShardNameForLocalPartition(relationId, longestPartitionName);
}
SwitchToSequentialAndLocalExecutionIfShardNameTooLong(longestPartitionName,
longestPartitionShardName);
}
}
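For orientation, a sketch of what makes a shard name "too long"; the helper below is illustrative only (the patch itself relies on GetLongestShardName and friends):

/* a shard relation name is the table name plus an "_<shardId>" suffix */
static bool
WouldShardNameBeTruncated(const char *relationName, uint64 shardId)
{
	char shardName[NAMEDATALEN * 2];

	snprintf(shardName, sizeof(shardName), "%s_" UINT64_FORMAT,
			 relationName, shardId);

	/* PostgreSQL truncates identifiers at NAMEDATALEN - 1 (63) bytes */
	return strlen(shardName) >= NAMEDATALEN - 1;
}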
/*
* SwitchToSequentialAndLocalExecutionIfShardNameTooLong switches to sequential and local
* execution if the shard name is too long.
*
* Returns true if switched to sequential and local execution.
*/
static bool
SwitchToSequentialAndLocalExecutionIfShardNameTooLong(char *relationName,
char *longestShardName)
{
if (strlen(longestShardName) >= NAMEDATALEN - 1)
{
if (ParallelQueryExecutedInTransaction())
{
/*
* If a parallel query has already been executed, sequential mode would
* still use the already-opened parallel connections to the workers,
* defeating the purpose of switching to sequential mode.
*/
ereport(ERROR, (errmsg(
"Shard name (%s) for table (%s) is too long and could "
"lead to deadlocks when executed in a transaction "
"block after a parallel query", longestShardName,
relationName),
errhint("Try re-running the transaction with "
"\"SET LOCAL citus.multi_shard_modify_mode TO "
"\'sequential\';\"")));
}
else
{
elog(DEBUG1, "the name of the shard (%s) for relation (%s) is too long, "
"switching to sequential and local execution mode to prevent "
"self deadlocks",
longestShardName, relationName);
SetLocalMultiShardModifyModeToSequential();
SetLocalExecutionStatus(LOCAL_EXECUTION_REQUIRED);
return true;
}
}
return false;
}
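The hint in the error above mirrors what SetLocalMultiShardModifyModeToSequential presumably does internally, something along these lines (a sketch of the GUC call, not the actual helper):

/* equivalent of: SET LOCAL citus.multi_shard_modify_mode TO 'sequential'; */
set_config_option("citus.multi_shard_modify_mode", "sequential",
				  PGC_USERSET, PGC_S_SESSION,
				  GUC_ACTION_LOCAL, /* LOCAL: reverts at transaction end */
				  true, 0, false);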
/*
* SwitchToSequentialAndLocalExecutionIfPartitionNameTooLong is a wrapper for new
* partitions that will be distributed after being attached to a distributed partitioned table.
*/
void
SwitchToSequentialAndLocalExecutionIfPartitionNameTooLong(Oid parentRelationId,
Oid partitionRelationId)
{
SwitchToSequentialAndLocalExecutionIfRelationNameTooLong(
parentRelationId, get_rel_name(partitionRelationId));
}


@ -510,6 +510,6 @@ ExecuteForeignKeyCreateCommand(const char *commandString, bool skip_validation)
"command \"%s\"", commandString)));
}
ProcessUtilityParseTree(parseTree, commandString, PROCESS_UTILITY_TOPLEVEL,
ProcessUtilityParseTree(parseTree, commandString, PROCESS_UTILITY_QUERY,
NULL, None_Receiver, NULL);
}


@ -412,6 +412,24 @@ CreateDistributedTable(Oid relationId, Var *distributionColumn, char distributio
char replicationModel = DecideReplicationModel(distributionMethod,
viaDeprecatedAPI);
/*
* Due to dropping columns, the parent's distribution key may not match the
* partition's distribution key. The input distributionColumn belongs to
* the parent. That's why we override the distribution column of partitions
* here. See issue #5123 for details.
*/
if (PartitionTable(relationId))
{
Oid parentRelationId = PartitionParentOid(relationId);
char *distributionColumnName =
ColumnToColumnName(parentRelationId, nodeToString(distributionColumn));
distributionColumn =
FindColumnWithNameOnTargetRelation(parentRelationId, distributionColumnName,
relationId);
}
/*
* ColocationIdForNewTable assumes caller acquires lock on relationId. In our case,
* our caller already acquired lock on relationId.


@ -411,15 +411,16 @@ static char *
GenerateLongestShardPartitionIndexName(IndexStmt *createIndexStatement)
{
Oid relationId = CreateIndexStmtGetRelationId(createIndexStatement);
char *longestPartitionName = LongestPartitionName(relationId);
if (longestPartitionName == NULL)
Oid longestNamePartitionId = PartitionWithLongestNameRelationId(relationId);
if (!OidIsValid(longestNamePartitionId))
{
/* no partitions have been created yet */
return NULL;
}
char *longestPartitionShardName = pstrdup(longestPartitionName);
ShardInterval *shardInterval = LoadShardIntervalWithLongestShardName(relationId);
char *longestPartitionShardName = get_rel_name(longestNamePartitionId);
ShardInterval *shardInterval = LoadShardIntervalWithLongestShardName(
longestNamePartitionId);
AppendShardIdToName(&longestPartitionShardName, shardInterval->shardId);
IndexStmt *createLongestShardIndexStmt = copyObject(createIndexStatement);


@ -2244,7 +2244,7 @@ CitusCopyDestReceiverStartup(DestReceiver *dest, int operation,
if (cacheEntry->replicationModel == REPLICATION_MODEL_2PC ||
MultiShardCommitProtocol == COMMIT_PROTOCOL_2PC)
{
CoordinatedTransactionUse2PC();
CoordinatedTransactionShouldUse2PC();
}
/* define how tuples will be serialised */


@ -109,6 +109,13 @@ PreprocessRenameStmt(Node *node, const char *renameCommand,
*/
ErrorIfUnsupportedRenameStmt(renameStmt);
if (renameStmt->renameType == OBJECT_TABLE ||
renameStmt->renameType == OBJECT_FOREIGN_TABLE)
{
SwitchToSequentialAndLocalExecutionIfRelationNameTooLong(tableRelationId,
renameStmt->newname);
}
DDLJob *ddlJob = palloc0(sizeof(DDLJob));
ddlJob->targetRelationId = tableRelationId;
ddlJob->concurrentIndexCmd = false;


@ -26,6 +26,7 @@
#include "distributed/commands/utility_hook.h"
#include "distributed/deparser.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/distribution_column.h"
#include "distributed/listutils.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_sync.h"
@ -324,6 +325,9 @@ PostprocessCreateTableStmtPartitionOf(CreateStmt *createStatement, const
char *parentRelationName = generate_qualified_relation_name(parentRelationId);
bool viaDeprecatedAPI = false;
SwitchToSequentialAndLocalExecutionIfPartitionNameTooLong(parentRelationId,
relationId);
CreateDistributedTable(relationId, parentDistributionColumn,
parentDistributionMethod, ShardCount,
parentRelationName, viaDeprecatedAPI);
@ -398,6 +402,9 @@ PostprocessAlterTableStmtAttachPartition(AlterTableStmt *alterTableStatement,
char *parentRelationName = generate_qualified_relation_name(relationId);
bool viaDeprecatedAPI = false;
SwitchToSequentialAndLocalExecutionIfPartitionNameTooLong(
relationId, partitionRelationId);
CreateDistributedTable(partitionRelationId, distributionColumn,
distributionMethod, ShardCount,
parentRelationName, viaDeprecatedAPI);


@ -910,6 +910,26 @@ ExecuteDistributedDDLJob(DDLJob *ddlJob)
*/
if (ddlJob->startNewTransaction)
{
/*
* If the cache is not populated, system catalog lookups will cause
* the xmin of the current backend to change. Then the last phase
* of CREATE INDEX CONCURRENTLY, which runs in a separate backend,
* will hang waiting for our backend and result in a deadlock.
*
* We populate the cache before starting the next transaction to
* avoid this. Most of the metadata has already been resolved in
* the planning phase; we only need to look up the metadata needed
* for connection establishment.
*/
(void) CurrentDatabaseName();
/*
* Computing ConnParams (AuthInfo and PoolInfo) takes a snapshot, which
* would block the remote connections to localhost. Hence we warm up
* the cache here so that after we start a new transaction the entries
* are already in the hash table and we won't be holding any snapshots.
*/
WarmUpConnParamsHash();
CommitTransactionCommand();
StartTransactionCommand();
}


@ -21,6 +21,7 @@
/* stores the string representation of our node connection GUC */
char *NodeConninfo = "";
char *LocalHostName = "localhost";
/* represents a list of libpq parameter settings */
typedef struct ConnParamsInfo


@ -32,6 +32,7 @@
#include "distributed/shared_connection_stats.h"
#include "distributed/cancel_utils.h"
#include "distributed/remote_commands.h"
#include "distributed/time_constants.h"
#include "distributed/version_compat.h"
#include "distributed/worker_log_messages.h"
#include "mb/pg_wchar.h"
@ -43,6 +44,7 @@
int NodeConnectionTimeout = 30000;
int MaxCachedConnectionsPerWorker = 1;
int MaxCachedConnectionLifetime = 10 * MS_PER_MINUTE;
HTAB *ConnectionHash = NULL;
HTAB *ConnParamsHash = NULL;
@ -85,6 +87,7 @@ static WaitEventSet * WaitEventSetFromMultiConnectionStates(List *connections,
static void CloseNotReadyMultiConnectionStates(List *connectionStates);
static uint32 MultiConnectionStateEventMask(MultiConnectionPollState *connectionState);
static void CitusPQFinish(MultiConnection *connection);
static ConnParamsHashEntry * FindOrCreateConnParamsEntry(ConnectionHashKey *key);
/*
* Initialize per-backend connection management infrastructure.
@ -1127,9 +1130,62 @@ ConnectionHashCompare(const void *a, const void *b, Size keysize)
static void
StartConnectionEstablishment(MultiConnection *connection, ConnectionHashKey *key)
{
bool found = false;
static uint64 connectionId = 1;
ConnParamsHashEntry *entry = FindOrCreateConnParamsEntry(key);
strlcpy(connection->hostname, key->hostname, MAX_NODE_LENGTH);
connection->port = key->port;
strlcpy(connection->database, key->database, NAMEDATALEN);
strlcpy(connection->user, key->user, NAMEDATALEN);
connection->pgConn = PQconnectStartParams((const char **) entry->keywords,
(const char **) entry->values,
false);
connection->connectionStart = GetCurrentTimestamp();
connection->connectionId = connectionId++;
/*
* To avoid issues with interrupts not getting caught, all our connections
* are managed in a non-blocking manner. remote_commands.c provides
* wrappers emulating blocking behaviour.
*/
PQsetnonblocking(connection->pgConn, true);
SetCitusNoticeReceiver(connection);
}
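The non-blocking comment above leans on wrappers elsewhere; a minimal sketch of how a blocking flush is typically emulated over such a connection (assumed shape, not the actual remote_commands.c code):

static void
FlushBlocking(PGconn *pgConn)
{
	int rc = 0;

	/* PQflush: 0 = all data sent, 1 = would block, -1 = failure */
	while ((rc = PQflush(pgConn)) == 1)
	{
		int event = WaitLatchOrSocket(MyLatch,
									  WL_LATCH_SET | WL_SOCKET_WRITEABLE,
									  PQsocket(pgConn), -1L, PG_WAIT_EXTENSION);
		if (event & WL_LATCH_SET)
		{
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS(); /* the point of staying non-blocking */
		}
	}

	if (rc == -1)
	{
		ereport(ERROR, (errmsg("could not flush connection")));
	}
}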
/*
* WarmUpConnParamsHash warms up the ConnParamsHash by loading all the
* conn params for active primary nodes.
*/
void
WarmUpConnParamsHash(void)
{
List *workerNodeList = ActivePrimaryNodeList(AccessShareLock);
WorkerNode *workerNode = NULL;
foreach_ptr(workerNode, workerNodeList)
{
ConnectionHashKey key;
strlcpy(key.hostname, workerNode->workerName, MAX_NODE_LENGTH);
key.port = workerNode->workerPort;
strlcpy(key.database, CurrentDatabaseName(), NAMEDATALEN);
strlcpy(key.user, CurrentUserName(), NAMEDATALEN);
FindOrCreateConnParamsEntry(&key);
}
}
/*
* FindOrCreateConnParamsEntry searches ConnParamsHash for the given key;
* if it is not found, a new entry is created.
*/
static ConnParamsHashEntry *
FindOrCreateConnParamsEntry(ConnectionHashKey *key)
{
bool found = false;
/* search our cache for precomputed connection settings */
ConnParamsHashEntry *entry = hash_search(ConnParamsHash, key, HASH_ENTER, &found);
if (!found || !entry->isValid)
@ -1157,25 +1213,7 @@ StartConnectionEstablishment(MultiConnection *connection, ConnectionHashKey *key
entry->isValid = true;
}
strlcpy(connection->hostname, key->hostname, MAX_NODE_LENGTH);
connection->port = key->port;
strlcpy(connection->database, key->database, NAMEDATALEN);
strlcpy(connection->user, key->user, NAMEDATALEN);
connection->pgConn = PQconnectStartParams((const char **) entry->keywords,
(const char **) entry->values,
false);
connection->connectionStart = GetCurrentTimestamp();
connection->connectionId = connectionId++;
/*
* To avoid issues with interrupts not getting caught, all our connections
* are managed in a non-blocking manner. remote_commands.c provides
* wrappers emulating blocking behaviour.
*/
PQsetnonblocking(connection->pgConn, true);
SetCitusNoticeReceiver(connection);
return entry;
}
@ -1288,6 +1326,7 @@ AfterXactHostConnectionHandling(ConnectionHashEntry *entry, bool isCommit)
* - Connection is forced to close at the end of transaction
* - Connection is not in OK state
* - A transaction is still in progress (usually because we are cancelling a distributed transaction)
* - A connection reached its maximum lifetime
*/
static bool
ShouldShutdownConnection(MultiConnection *connection, const int cachedConnectionCount)
@ -1303,7 +1342,10 @@ ShouldShutdownConnection(MultiConnection *connection, const int cachedConnection
cachedConnectionCount >= MaxCachedConnectionsPerWorker ||
connection->forceCloseAtTransactionEnd ||
PQstatus(connection->pgConn) != CONNECTION_OK ||
!RemoteTransactionIdle(connection);
!RemoteTransactionIdle(connection) ||
(MaxCachedConnectionLifetime >= 0 &&
TimestampDifferenceExceeds(connection->connectionStart, GetCurrentTimestamp(),
MaxCachedConnectionLifetime));
}


@ -25,7 +25,11 @@
#include "utils/palloc.h"
#define MAX_PUT_COPY_DATA_BUFFER_SIZE (8 * 1024 * 1024)
/*
* Setting that controls how many bytes of COPY data libpq is allowed to buffer
* internally before we force a flush.
*/
int RemoteCopyFlushThreshold = 8 * 1024 * 1024;
/* GUC, determining whether statements sent to remote nodes are logged */
@ -620,7 +624,7 @@ PutRemoteCopyData(MultiConnection *connection, const char *buffer, int nbytes)
*/
connection->copyBytesWrittenSinceLastFlush += nbytes;
if (connection->copyBytesWrittenSinceLastFlush > MAX_PUT_COPY_DATA_BUFFER_SIZE)
if (connection->copyBytesWrittenSinceLastFlush > RemoteCopyFlushThreshold)
{
connection->copyBytesWrittenSinceLastFlush = 0;
return FinishConnectionIO(connection, allowInterrupts);


@ -7055,9 +7055,10 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context)
ExtractRangeTblExtraData(rte, NULL, &fragmentSchemaName, &fragmentTableName, NULL);
/* use schema and table name from the remote alias */
appendStringInfoString(buf,
generate_fragment_name(fragmentSchemaName,
fragmentTableName));
appendStringInfo(buf, "%s%s",
only_marker(rte),
generate_fragment_name(fragmentSchemaName,
fragmentTableName));
break;
}


@ -7057,9 +7057,10 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context)
ExtractRangeTblExtraData(rte, NULL, &fragmentSchemaName, &fragmentTableName, NULL);
/* use schema and table name from the remote alias */
appendStringInfoString(buf,
generate_fragment_name(fragmentSchemaName,
fragmentTableName));
appendStringInfo(buf, "%s%s",
only_marker(rte),
generate_fragment_name(fragmentSchemaName,
fragmentTableName));
break;
}


@ -7115,9 +7115,10 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context)
ExtractRangeTblExtraData(rte, NULL, &fragmentSchemaName, &fragmentTableName, NULL);
/* use schema and table name from the remote alias */
appendStringInfoString(buf,
generate_fragment_name(fragmentSchemaName,
fragmentTableName));
appendStringInfo(buf, "%s%s",
only_marker(rte),
generate_fragment_name(fragmentSchemaName,
fragmentTableName));
break;
}


@ -174,6 +174,8 @@
#include "utils/timestamp.h"
#define SLOW_START_DISABLED 0
#define WAIT_EVENT_SET_INDEX_NOT_INITIALIZED -1
#define WAIT_EVENT_SET_INDEX_FAILED -2
/*
@ -638,6 +640,10 @@ static int UsableConnectionCount(WorkerPool *workerPool);
static long NextEventTimeout(DistributedExecution *execution);
static WaitEventSet * BuildWaitEventSet(List *sessionList);
static void RebuildWaitEventSetFlags(WaitEventSet *waitEventSet, List *sessionList);
static int CitusAddWaitEventSetToSet(WaitEventSet *set, uint32 events, pgsocket fd,
Latch *latch, void *user_data);
static bool CitusModifyWaitEvent(WaitEventSet *set, int pos, uint32 events,
Latch *latch);
static TaskPlacementExecution * PopPlacementExecution(WorkerSession *session);
static TaskPlacementExecution * PopAssignedPlacementExecution(WorkerSession *session);
static TaskPlacementExecution * PopUnassignedPlacementExecution(WorkerPool *workerPool);
@ -671,6 +677,8 @@ static void ExtractParametersForRemoteExecution(ParamListInfo paramListInfo,
Oid **parameterTypes,
const char ***parameterValues);
static int GetEventSetSize(List *sessionList);
static bool ProcessSessionsWithFailedWaitEventSetOperations(
DistributedExecution *execution);
static int RebuildWaitEventSet(DistributedExecution *execution);
static void ProcessWaitEvents(DistributedExecution *execution, WaitEvent *events, int
eventCount, bool *cancellationReceived);
@ -1165,23 +1173,6 @@ DecideTransactionPropertiesForTaskList(RowModifyLevel modLevel, List *taskList,
return xactProperties;
}
if (GetCurrentLocalExecutionStatus() == LOCAL_EXECUTION_REQUIRED)
{
/*
* In case localExecutionHappened, we force the executor to use 2PC.
* The primary motivation is that at this point we're definitely expanding
* the nodes participated in the transaction. And, by re-generating the
* remote task lists during local query execution, we might prevent the adaptive
* executor to kick-in 2PC (or even start coordinated transaction, that's why
* we prefer adding this check here instead of
* Activate2PCIfModifyingTransactionExpandsToNewNode()).
*/
xactProperties.errorOnAnyFailure = true;
xactProperties.useRemoteTransactionBlocks = TRANSACTION_BLOCKS_REQUIRED;
xactProperties.requires2PC = true;
return xactProperties;
}
if (DistributedExecutionRequiresRollback(taskList))
{
/* transaction blocks are required if the task list needs to roll back */
@ -1240,7 +1231,7 @@ StartDistributedExecution(DistributedExecution *execution)
if (xactProperties->requires2PC)
{
CoordinatedTransactionUse2PC();
CoordinatedTransactionShouldUse2PC();
}
/*
@ -2092,6 +2083,7 @@ FindOrCreateWorkerSession(WorkerPool *workerPool, MultiConnection *connection)
session->connection = connection;
session->workerPool = workerPool;
session->commandsSent = 0;
session->waitEventSetIndex = WAIT_EVENT_SET_INDEX_NOT_INITIALIZED;
dlist_init(&session->pendingTaskQueue);
dlist_init(&session->readyTaskQueue);
@ -2236,6 +2228,7 @@ RunDistributedExecution(DistributedExecution *execution)
ManageWorkerPool(workerPool);
}
bool skipWaitEvents = false;
if (execution->remoteTaskList == NIL)
{
/*
@ -2257,11 +2250,28 @@ RunDistributedExecution(DistributedExecution *execution)
}
eventSetSize = RebuildWaitEventSet(execution);
events = palloc0(eventSetSize * sizeof(WaitEvent));
skipWaitEvents =
ProcessSessionsWithFailedWaitEventSetOperations(execution);
}
else if (execution->waitFlagsChanged)
{
RebuildWaitEventSetFlags(execution->waitEventSet, execution->sessionList);
execution->waitFlagsChanged = false;
skipWaitEvents =
ProcessSessionsWithFailedWaitEventSetOperations(execution);
}
if (skipWaitEvents)
{
/*
* Some operation on the wait event set failed; retry,
* as we have already removed the problematic connections.
*/
execution->rebuildWaitEventSet = true;
continue;
}
/* wait for I/O events */
@ -2310,6 +2320,51 @@ RunDistributedExecution(DistributedExecution *execution)
}
/*
* ProcessSessionsWithFailedWaitEventSetOperations goes over the session list and
* processes sessions with failed wait event set operations.
*
* Failed sessions are not going to generate any further events, so it is our
* only chance to process the failure by calling into `ConnectionStateMachine`.
*
* The function returns true if any session failed.
*/
static bool
ProcessSessionsWithFailedWaitEventSetOperations(DistributedExecution *execution)
{
bool foundFailedSession = false;
WorkerSession *session = NULL;
foreach_ptr(session, execution->sessionList)
{
if (session->waitEventSetIndex == WAIT_EVENT_SET_INDEX_FAILED)
{
/*
* We can only lose connections that were already connected;
* others are regular failures.
*/
MultiConnection *connection = session->connection;
if (connection->connectionState == MULTI_CONNECTION_CONNECTED)
{
connection->connectionState = MULTI_CONNECTION_LOST;
}
else
{
connection->connectionState = MULTI_CONNECTION_FAILED;
}
ConnectionStateMachine(session);
session->waitEventSetIndex = WAIT_EVENT_SET_INDEX_NOT_INITIALIZED;
foundFailedSession = true;
}
}
return foundFailedSession;
}
/*
* RebuildWaitEventSet updates the waitEventSet for the distributed execution.
* This happens when the connection set for the distributed execution is changed,
@ -3197,7 +3252,7 @@ Activate2PCIfModifyingTransactionExpandsToNewNode(WorkerSession *session)
* just opened, which means we're now going to make modifications
* over multiple connections. Activate 2PC!
*/
CoordinatedTransactionUse2PC();
CoordinatedTransactionShouldUse2PC();
}
}
@ -3875,6 +3930,7 @@ ReceiveResults(WorkerSession *session, bool storeRows)
TupleDesc tupleDescriptor = tupleDest->tupleDescForQuery(tupleDest, queryIndex);
if (tupleDescriptor == NULL)
{
PQclear(result);
continue;
}
@ -4693,18 +4749,79 @@ BuildWaitEventSet(List *sessionList)
continue;
}
int waitEventSetIndex = AddWaitEventToSet(waitEventSet, connection->waitFlags,
sock, NULL, (void *) session);
int waitEventSetIndex =
CitusAddWaitEventSetToSet(waitEventSet, connection->waitFlags, sock,
NULL, (void *) session);
session->waitEventSetIndex = waitEventSetIndex;
}
AddWaitEventToSet(waitEventSet, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
AddWaitEventToSet(waitEventSet, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
CitusAddWaitEventSetToSet(waitEventSet, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL,
NULL);
CitusAddWaitEventSetToSet(waitEventSet, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch,
NULL);
return waitEventSet;
}
/*
* CitusAddWaitEventSetToSet is a wrapper around Postgres' AddWaitEventToSet().
*
* AddWaitEventToSet() may throw hard errors. For example, when the
* underlying socket for a connection is closed by the remote server
* and that is already reflected by the OS, but Citus hasn't had a
* chance to get this information. In that case, if the replication
* factor is >1, Citus can failover to other nodes for executing the
* query. Even if the replication factor is 1, Citus can give much
* nicer errors.
*
* So CitusAddWaitEventSetToSet simply puts AddWaitEventToSet() into a
* PG_TRY/PG_CATCH block in order to catch any hard errors, and
* returns this information to the caller.
*/
static int
CitusAddWaitEventSetToSet(WaitEventSet *set, uint32 events, pgsocket fd,
Latch *latch, void *user_data)
{
volatile int waitEventSetIndex = WAIT_EVENT_SET_INDEX_NOT_INITIALIZED;
MemoryContext savedContext = CurrentMemoryContext;
PG_TRY();
{
waitEventSetIndex =
AddWaitEventToSet(set, events, fd, latch, (void *) user_data);
}
PG_CATCH();
{
/*
* We might be in an arbitrary memory context when the
* error is thrown, and we should get back to the one we had
* at PG_TRY() time, especially because we are not
* re-throwing the error.
*/
MemoryContextSwitchTo(savedContext);
FlushErrorState();
if (user_data != NULL)
{
WorkerSession *workerSession = (WorkerSession *) user_data;
ereport(DEBUG1, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("Adding wait event for node %s:%d failed. "
"The socket was: %d",
workerSession->workerPool->nodeName,
workerSession->workerPool->nodePort, fd)));
}
/* let the callers know about the failure */
waitEventSetIndex = WAIT_EVENT_SET_INDEX_FAILED;
}
PG_END_TRY();
return waitEventSetIndex;
}
/*
* GetEventSetSize returns the event set size for a list of sessions.
*/
@ -4748,11 +4865,68 @@ RebuildWaitEventSetFlags(WaitEventSet *waitEventSet, List *sessionList)
continue;
}
ModifyWaitEvent(waitEventSet, waitEventSetIndex, connection->waitFlags, NULL);
bool success =
CitusModifyWaitEvent(waitEventSet, waitEventSetIndex,
connection->waitFlags, NULL);
if (!success)
{
ereport(DEBUG1, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("Modifying wait event for node %s:%d failed. "
"The wait event index was: %d",
connection->hostname, connection->port,
waitEventSetIndex)));
session->waitEventSetIndex = WAIT_EVENT_SET_INDEX_FAILED;
}
}
}
/*
* CitusModifyWaitEvent is a wrapper around Postgres' ModifyWaitEvent().
*
* ModifyWaitEvent may throw hard errors. For example, when the underlying
* socket for a connection is closed by the remote server and that is
* already reflected by the OS, but Citus hasn't had a chance to get this
* information. In that case, if the replication factor is >1, Citus can
* failover to other nodes for executing the query. Even if the replication
* factor is 1, Citus can give much nicer errors.
*
* So CitusModifyWaitEvent simply puts ModifyWaitEvent into a PG_TRY/PG_CATCH
* block in order to catch any hard errors, and returns this information to the
* caller.
*/
static bool
CitusModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
{
volatile bool success = true;
MemoryContext savedContext = CurrentMemoryContext;
PG_TRY();
{
ModifyWaitEvent(set, pos, events, latch);
}
PG_CATCH();
{
/*
* We might be in an arbitrary memory context when the
* error is thrown, and we should get back to the one we had
* at PG_TRY() time, especially because we are not
* re-throwing the error.
*/
MemoryContextSwitchTo(savedContext);
FlushErrorState();
/* let the callers know about the failure */
success = false;
}
PG_END_TRY();
return success;
}
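Both wrappers deliberately discard the error via FlushErrorState. A variant of the same PG_TRY pattern that keeps the message for logging, in case that ever becomes useful (a sketch; SomeFragileCall is a placeholder):

MemoryContext savedContext = CurrentMemoryContext;
PG_TRY();
{
	SomeFragileCall(); /* placeholder for the call that may throw */
}
PG_CATCH();
{
	MemoryContextSwitchTo(savedContext); /* leave ErrorContext first */
	ErrorData *edata = CopyErrorData();  /* copy while the error state is live */
	FlushErrorState();

	ereport(DEBUG1, (errmsg("suppressed error: %s", edata->message)));
	FreeErrorData(edata);
}
PG_END_TRY();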
/*
* SetLocalForceMaxQueryParallelization is simply a C interface for setting
* the following:


@ -300,7 +300,8 @@ CitusBeginReadOnlyScan(CustomScanState *node, EState *estate, int eflags)
* The plan will be cached across executions when originalDistributedPlan
* represents a prepared statement.
*/
CacheLocalPlanForShardQuery(task, originalDistributedPlan);
CacheLocalPlanForShardQuery(task, originalDistributedPlan,
estate->es_param_list_info);
}
}
@ -342,9 +343,12 @@ CitusBeginModifyScan(CustomScanState *node, EState *estate, int eflags)
/*
* At this point, we're about to do the shard pruning for fast-path queries.
* Given that pruning is deferred always for INSERTs, we get here
* !EnableFastPathRouterPlanner as well.
* !EnableFastPathRouterPlanner as well. Given that INSERT statements with
* CTEs/sublinks etc. are not eligible for the fast-path router plan, we can
* get here with jobQuery->commandType == CMD_INSERT as well.
*/
Assert(currentPlan->fastPathRouterPlan || !EnableFastPathRouterPlanner);
Assert(currentPlan->fastPathRouterPlan || !EnableFastPathRouterPlanner ||
jobQuery->commandType == CMD_INSERT);
/*
* We can only now decide which shard to use, so we need to build a new task
@ -406,7 +410,8 @@ CitusBeginModifyScan(CustomScanState *node, EState *estate, int eflags)
* The plan will be cached across executions when originalDistributedPlan
* represents a prepared statement.
*/
CacheLocalPlanForShardQuery(task, originalDistributedPlan);
CacheLocalPlanForShardQuery(task, originalDistributedPlan,
estate->es_param_list_info);
}
}


@ -125,9 +125,6 @@ static void LogLocalCommand(Task *task);
static uint64 LocallyPlanAndExecuteMultipleQueries(List *queryStrings,
TupleDestination *tupleDest,
Task *task);
static void ExtractParametersForLocalExecution(ParamListInfo paramListInfo,
Oid **parameterTypes,
const char ***parameterValues);
static void ExecuteUdfTaskQuery(Query *localUdfCommandQuery);
static void EnsureTransitionPossible(LocalExecutionStatus from,
LocalExecutionStatus to);
@ -209,6 +206,19 @@ ExecuteLocalTaskListExtended(List *taskList,
Oid *parameterTypes = NULL;
uint64 totalRowsProcessed = 0;
/*
* Even if we are executing local tasks, we still enable the
* coordinated transaction. This is because
* (a) we might be in a transaction, and the next commands may
* require a coordinated transaction;
* (b) we might be executing some tasks locally and the others
* via remote execution.
*
* Also, there is no harm in enabling a coordinated transaction even if
* we only deal with local tasks in the transaction.
*/
UseCoordinatedTransaction();
if (paramListInfo != NULL)
{
/* not used anywhere, so declare here */
@ -236,6 +246,17 @@ ExecuteLocalTaskListExtended(List *taskList,
{
SetLocalExecutionStatus(LOCAL_EXECUTION_REQUIRED);
}
if (!ReadOnlyTask(task->taskType))
{
/*
* Any modification via local execution should enable 2PC. If the remote
* queries are read-only, our 2PC logic is smart enough to skip sending
* PREPARE to those connections.
*/
CoordinatedTransactionShouldUse2PC();
}
LogLocalCommand(task);
if (isUtilityCommand)
@ -362,7 +383,7 @@ LocallyPlanAndExecuteMultipleQueries(List *queryStrings, TupleDestination *tuple
* value arrays. It does not change the oid of custom types, because the
* query will be run locally.
*/
static void
void
ExtractParametersForLocalExecution(ParamListInfo paramListInfo, Oid **parameterTypes,
const char ***parameterValues)
{
@ -406,7 +427,7 @@ ExecuteUtilityCommand(const char *taskQueryCommand)
* process utility.
*/
ProcessUtilityParseTree(taskRawParseTree, taskQueryCommand,
PROCESS_UTILITY_TOPLEVEL, NULL, None_Receiver,
PROCESS_UTILITY_QUERY, NULL, None_Receiver,
NULL);
}
}
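A hedged sketch of the commit-time behavior the 2PC comment above alludes to; the helper names here are assumptions, not taken from the patch:

MultiConnection *connection = NULL;
foreach_ptr(connection, openConnectionList) /* hypothetical connection list */
{
	if (RemoteTransactionModifiedData(connection)) /* assumed predicate */
	{
		/* a writer goes through PREPARE TRANSACTION / COMMIT PREPARED */
		PrepareRemoteTransaction(connection);
	}
	else
	{
		/* a read-only participant gets a plain COMMIT, no PREPARE */
		CommitRemoteTransaction(connection);
	}
}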


@ -156,7 +156,6 @@ static void ApplyAddToDependencyList(ObjectAddressCollector *collector,
static List * ExpandCitusSupportedTypes(ObjectAddressCollector *collector,
ObjectAddress target);
static ViewDependencyNode * BuildViewDependencyGraph(Oid relationId, HTAB *nodeMap);
static Oid GetDependingView(Form_pg_depend pg_depend);
/*
@ -1204,18 +1203,31 @@ GetDependingView(Form_pg_depend pg_depend)
true, NULL, 1, rkey);
HeapTuple rewriteTup = systable_getnext(rscan);
if (!HeapTupleIsValid(rewriteTup))
{
/*
* This function already verified that objid's classid is
* RewriteRelationId, so it should exist. But be on the
* safe side.
*/
ereport(ERROR, (errmsg("catalog lookup failed for view %u",
pg_depend->objid)));
}
Form_pg_rewrite pg_rewrite = (Form_pg_rewrite) GETSTRUCT(rewriteTup);
bool isView = get_rel_relkind(pg_rewrite->ev_class) == RELKIND_VIEW;
bool isMatView = get_rel_relkind(pg_rewrite->ev_class) == RELKIND_MATVIEW;
bool isDifferentThanRef = pg_rewrite->ev_class != pg_depend->refobjid;
Oid dependingView = InvalidOid;
if ((isView || isMatView) && isDifferentThanRef)
{
dependingView = pg_rewrite->ev_class;
}
systable_endscan(rscan);
relation_close(rewriteRel, AccessShareLock);
if ((isView || isMatView) && isDifferentThanRef)
{
return pg_rewrite->ev_class;
}
return InvalidOid;
return dependingView;
}


@ -373,3 +373,56 @@ GetDistributedObjectAddressList(void)
return objectAddressList;
}
/*
* UpdateDistributedObjectColocationId gets an old and a new colocationId
* and updates the colocationId of all tuples in citus.pg_dist_object that
* have the old colocationId to the new one.
*/
void
UpdateDistributedObjectColocationId(uint32 oldColocationId,
uint32 newColocationId)
{
const bool indexOK = false;
ScanKeyData scanKey[1];
Relation pgDistObjectRel = table_open(DistObjectRelationId(),
RowExclusiveLock);
TupleDesc tupleDescriptor = RelationGetDescr(pgDistObjectRel);
/* scan pg_dist_object for colocationId equal to old colocationId */
ScanKeyInit(&scanKey[0], Anum_pg_dist_object_colocationid,
BTEqualStrategyNumber,
F_INT4EQ, UInt32GetDatum(oldColocationId));
SysScanDesc scanDescriptor = systable_beginscan(pgDistObjectRel,
InvalidOid,
indexOK,
NULL, 1, scanKey);
HeapTuple heapTuple;
while (HeapTupleIsValid(heapTuple = systable_getnext(scanDescriptor)))
{
Datum values[Natts_pg_dist_object];
bool isnull[Natts_pg_dist_object];
bool replace[Natts_pg_dist_object];
memset(replace, 0, sizeof(replace));
replace[Anum_pg_dist_object_colocationid - 1] = true;
/* update the colocationId to the new one */
values[Anum_pg_dist_object_colocationid - 1] = UInt32GetDatum(newColocationId);
isnull[Anum_pg_dist_object_colocationid - 1] = false;
heapTuple = heap_modify_tuple(heapTuple, tupleDescriptor, values, isnull,
replace);
CatalogTupleUpdate(pgDistObjectRel, &heapTuple->t_self, heapTuple);
CitusInvalidateRelcacheByRelid(DistObjectRelationId());
}
systable_endscan(scanDescriptor);
table_close(pgDistObjectRel, NoLock);
CommandCounterIncrement();
}


@ -79,14 +79,24 @@ static bool DistributedTableSizeOnWorker(WorkerNode *workerNode, Oid relationId,
char *sizeQuery, bool failOnError,
uint64 *tableSize);
static List * ShardIntervalsOnWorkerGroup(WorkerNode *workerNode, Oid relationId);
static char * GenerateShardNameAndSizeQueryForShardList(List *shardIntervalList);
static char * GenerateAllShardNameAndSizeQueryForNode(WorkerNode *workerNode);
static List * GenerateShardSizesQueryList(List *workerNodeList);
static char * GenerateShardStatisticsQueryForShardList(List *shardIntervalList, bool
useShardMinMaxQuery);
static char * GenerateAllShardStatisticsQueryForNode(WorkerNode *workerNode,
List *citusTableIds, bool
useShardMinMaxQuery);
static List * GenerateShardStatisticsQueryList(List *workerNodeList, List *citusTableIds,
bool useShardMinMaxQuery);
static void ErrorIfNotSuitableToGetSize(Oid relationId);
static List * OpenConnectionToNodes(List *workerNodeList);
static void ReceiveShardNameAndSizeResults(List *connectionList,
Tuplestorestate *tupleStore,
TupleDesc tupleDescriptor);
static void AppendShardSizeMinMaxQuery(StringInfo selectQuery, uint64 shardId,
ShardInterval *
shardInterval, char *shardName,
char *quotedShardName);
static void AppendShardSizeQuery(StringInfo selectQuery, ShardInterval *shardInterval,
char *quotedShardName);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(citus_table_size);
@ -102,25 +112,16 @@ citus_shard_sizes(PG_FUNCTION_ARGS)
{
CheckCitusVersion(ERROR);
List *workerNodeList = ActivePrimaryNodeList(NoLock);
List *allCitusTableIds = AllCitusTableIds();
List *shardSizesQueryList = GenerateShardSizesQueryList(workerNodeList);
/* we don't need a distributed transaction here */
bool useDistributedTransaction = false;
List *connectionList = OpenConnectionToNodes(workerNodeList);
FinishConnectionListEstablishment(connectionList);
/* send commands in parallel */
for (int i = 0; i < list_length(connectionList); i++)
{
MultiConnection *connection = (MultiConnection *) list_nth(connectionList, i);
char *shardSizesQuery = (char *) list_nth(shardSizesQueryList, i);
int querySent = SendRemoteCommand(connection, shardSizesQuery);
if (querySent == 0)
{
ReportConnectionError(connection, WARNING);
}
}
/* we only want the shard sizes here so useShardMinMaxQuery parameter is false */
bool useShardMinMaxQuery = false;
List *connectionList = SendShardStatisticsQueriesInParallel(allCitusTableIds,
useDistributedTransaction,
useShardMinMaxQuery);
TupleDesc tupleDescriptor = NULL;
Tuplestorestate *tupleStore = SetupTuplestore(fcinfo, &tupleDescriptor);
@ -225,6 +226,59 @@ citus_relation_size(PG_FUNCTION_ARGS)
}
/*
* SendShardStatisticsQueriesInParallel generates query lists for obtaining shard
* statistics and then sends the commands in parallel by opening connections
* to available nodes. It returns the connection list.
*/
List *
SendShardStatisticsQueriesInParallel(List *citusTableIds, bool useDistributedTransaction,
bool
useShardMinMaxQuery)
{
List *workerNodeList = ActivePrimaryNodeList(NoLock);
List *shardSizesQueryList = GenerateShardStatisticsQueryList(workerNodeList,
citusTableIds,
useShardMinMaxQuery);
List *connectionList = OpenConnectionToNodes(workerNodeList);
FinishConnectionListEstablishment(connectionList);
if (useDistributedTransaction)
{
/*
* For now, in the case where we want to include shard min and max values, we also
* want to update the entries in pg_dist_placement and pg_dist_shard with the
* latest statistics. In order to detect distributed deadlocks, we assign a
* distributed transaction ID to the current transaction.
*/
UseCoordinatedTransaction();
}
/* send commands in parallel */
for (int i = 0; i < list_length(connectionList); i++)
{
MultiConnection *connection = (MultiConnection *) list_nth(connectionList, i);
char *shardSizesQuery = (char *) list_nth(shardSizesQueryList, i);
if (useDistributedTransaction)
{
/* run the size query in a distributed transaction */
RemoteTransactionBeginIfNecessary(connection);
}
int querySent = SendRemoteCommand(connection, shardSizesQuery);
if (querySent == 0)
{
ReportConnectionError(connection, WARNING);
}
}
return connectionList;
}
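A plausible pair of call sites for the new helper; the min/max variant is presumably the table-statistics update path, and the flag values below are assumptions apart from the citus_shard_sizes case shown earlier:

List *citusTableIds = AllCitusTableIds();

/* shard sizes only, outside a distributed transaction (citus_shard_sizes) */
List *sizeConnections =
	SendShardStatisticsQueriesInParallel(citusTableIds,
										 false,  /* useDistributedTransaction */
										 false); /* useShardMinMaxQuery */

/* sizes plus shard min/max, inside a distributed transaction */
List *statConnections =
	SendShardStatisticsQueriesInParallel(citusTableIds, true, true);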
/*
* OpenConnectionToNodes opens a single connection per node
* for the given workerNodeList.
@ -250,20 +304,25 @@ OpenConnectionToNodes(List *workerNodeList)
/*
* GenerateShardSizesQueryList generates a query per node that
* will return all shard_name, shard_size pairs from the node.
* GenerateShardStatisticsQueryList generates a query per node that will return:
* - all shard_name, shard_size pairs from the node (if useShardMinMaxQuery is false)
* - all shard_id, shard_minvalue, shard_maxvalue, shard_size quadruples from the node (if true)
*/
static List *
GenerateShardSizesQueryList(List *workerNodeList)
GenerateShardStatisticsQueryList(List *workerNodeList, List *citusTableIds, bool
useShardMinMaxQuery)
{
List *shardSizesQueryList = NIL;
List *shardStatisticsQueryList = NIL;
WorkerNode *workerNode = NULL;
foreach_ptr(workerNode, workerNodeList)
{
char *shardSizesQuery = GenerateAllShardNameAndSizeQueryForNode(workerNode);
shardSizesQueryList = lappend(shardSizesQueryList, shardSizesQuery);
char *shardStatisticsQuery = GenerateAllShardStatisticsQueryForNode(workerNode,
citusTableIds,
useShardMinMaxQuery);
shardStatisticsQueryList = lappend(shardStatisticsQueryList,
shardStatisticsQuery);
}
return shardSizesQueryList;
return shardStatisticsQueryList;
}
@ -572,37 +631,50 @@ GenerateSizeQueryOnMultiplePlacements(List *shardIntervalList, char *sizeQuery)
/*
* GenerateAllShardNameAndSizeQueryForNode generates a query that returns all
* shard_name, shard_size pairs for the given node.
* GenerateAllShardStatisticsQueryForNode generates a query that returns:
* - all shard_name, shard_size pairs for the given node (if useShardMinMaxQuery is false)
* - all shard_id, shard_minvalue, shard_maxvalue, shard_size quadruples (if true)
*/
static char *
GenerateAllShardNameAndSizeQueryForNode(WorkerNode *workerNode)
GenerateAllShardStatisticsQueryForNode(WorkerNode *workerNode, List *citusTableIds, bool
useShardMinMaxQuery)
{
List *allCitusTableIds = AllCitusTableIds();
StringInfo allShardNameAndSizeQuery = makeStringInfo();
StringInfo allShardStatisticsQuery = makeStringInfo();
Oid relationId = InvalidOid;
foreach_oid(relationId, allCitusTableIds)
foreach_oid(relationId, citusTableIds)
{
List *shardIntervalsOnNode = ShardIntervalsOnWorkerGroup(workerNode, relationId);
char *shardNameAndSizeQuery =
GenerateShardNameAndSizeQueryForShardList(shardIntervalsOnNode);
appendStringInfoString(allShardNameAndSizeQuery, shardNameAndSizeQuery);
char *shardStatisticsQuery =
GenerateShardStatisticsQueryForShardList(shardIntervalsOnNode,
useShardMinMaxQuery);
appendStringInfoString(allShardStatisticsQuery, shardStatisticsQuery);
}
/* Add a dummy entry so that UNION ALL doesn't complain */
appendStringInfo(allShardNameAndSizeQuery, "SELECT NULL::text, 0::bigint;");
return allShardNameAndSizeQuery->data;
if (useShardMinMaxQuery)
{
/* 0 for shard_id, NULL for min, NULL for max, 0 for shard_size */
appendStringInfo(allShardStatisticsQuery,
"SELECT 0::bigint, NULL::text, NULL::text, 0::bigint;");
}
else
{
/* NULL for shard_name, 0 for shard_size */
appendStringInfo(allShardStatisticsQuery, "SELECT NULL::text, 0::bigint;");
}
return allShardStatisticsQuery->data;
}
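For concreteness, the per-node query assembled above ends up with this shape; the schema, table, and shard ids are made up:

/*
 *   SELECT 'public.events_102008' AS shard_name,
 *          pg_relation_size('public.events_102008')
 *   UNION ALL
 *   SELECT 'public.events_102010' AS shard_name,
 *          pg_relation_size('public.events_102010')
 *   UNION ALL
 *   SELECT NULL::text, 0::bigint;  -- dummy entry keeping UNION ALL valid
 */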
/*
* GenerateShardNameAndSizeQueryForShardList generates a SELECT shard_name - shard_size query to get
* size of multiple tables.
* GenerateShardStatisticsQueryForShardList generates one of the two types of queries:
* - SELECT shard_name - shard_size (if useShardMinMaxQuery is false)
* - SELECT shard_id, shard_minvalue, shard_maxvalue, shard_size (if true)
*/
static char *
GenerateShardNameAndSizeQueryForShardList(List *shardIntervalList)
GenerateShardStatisticsQueryForShardList(List *shardIntervalList, bool
useShardMinMaxQuery)
{
StringInfo selectQuery = makeStringInfo();
@ -618,8 +690,15 @@ GenerateShardNameAndSizeQueryForShardList(List *shardIntervalList)
char *shardQualifiedName = quote_qualified_identifier(schemaName, shardName);
char *quotedShardName = quote_literal_cstr(shardQualifiedName);
appendStringInfo(selectQuery, "SELECT %s AS shard_name, ", quotedShardName);
appendStringInfo(selectQuery, PG_RELATION_SIZE_FUNCTION, quotedShardName);
if (useShardMinMaxQuery)
{
AppendShardSizeMinMaxQuery(selectQuery, shardId, shardInterval, shardName,
quotedShardName);
}
else
{
AppendShardSizeQuery(selectQuery, shardInterval, quotedShardName);
}
appendStringInfo(selectQuery, " UNION ALL ");
}
@ -627,6 +706,54 @@ GenerateShardNameAndSizeQueryForShardList(List *shardIntervalList)
}
/*
* AppendShardSizeMinMaxQuery appends a query of the following form to selectQuery:
* SELECT shard_id, shard_minvalue, shard_maxvalue, shard_size
*/
static void
AppendShardSizeMinMaxQuery(StringInfo selectQuery, uint64 shardId,
ShardInterval *shardInterval, char *shardName,
char *quotedShardName)
{
if (IsCitusTableType(shardInterval->relationId, APPEND_DISTRIBUTED))
{
/* fill in the partition column name */
const uint32 unusedTableId = 1;
Var *partitionColumn = PartitionColumn(shardInterval->relationId,
unusedTableId);
char *partitionColumnName = get_attname(shardInterval->relationId,
partitionColumn->varattno, false);
appendStringInfo(selectQuery,
"SELECT " UINT64_FORMAT
" AS shard_id, min(%s)::text AS shard_minvalue, max(%s)::text AS shard_maxvalue, pg_relation_size(%s) AS shard_size FROM %s ",
shardId, partitionColumnName,
partitionColumnName,
quotedShardName, shardName);
}
else
{
/* we don't need to update min/max for non-append distributed tables because they don't change */
appendStringInfo(selectQuery,
"SELECT " UINT64_FORMAT
" AS shard_id, NULL::text AS shard_minvalue, NULL::text AS shard_maxvalue, pg_relation_size(%s) AS shard_size ",
shardId, quotedShardName);
}
}
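An illustrative expansion of the append-distributed branch above; the table, column, and shard id are made up:

/*
 *   SELECT 102008 AS shard_id,
 *          min(created_at)::text AS shard_minvalue,
 *          max(created_at)::text AS shard_maxvalue,
 *          pg_relation_size('public.events_102008') AS shard_size
 *   FROM events_102008
 */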
/*
* AppendShardSizeQuery appends a query of the following form to selectQuery:
* SELECT shard_name, shard_size
*/
static void
AppendShardSizeQuery(StringInfo selectQuery, ShardInterval *shardInterval,
char *quotedShardName)
{
appendStringInfo(selectQuery, "SELECT %s AS shard_name, ", quotedShardName);
appendStringInfo(selectQuery, PG_RELATION_SIZE_FUNCTION, quotedShardName);
}
/*
* ErrorIfNotSuitableToGetSize determines whether the table is suitable to find
* its size with internal functions.
@ -924,6 +1051,26 @@ ShardLength(uint64 shardId)
}
/*
* NodeGroupHasLivePlacements returns true if there is any placement
* on the given node group that is not a SHARD_STATE_TO_DELETE placement.
*/
bool
NodeGroupHasLivePlacements(int32 groupId)
{
List *shardPlacements = AllShardPlacementsOnNodeGroup(groupId);
GroupShardPlacement *placement = NULL;
foreach_ptr(placement, shardPlacements)
{
if (placement->shardState != SHARD_STATE_TO_DELETE)
{
return true;
}
}
return false;
}
/*
* NodeGroupHasShardPlacements returns whether any active shards are placed on the group
*/


@ -112,7 +112,7 @@ static bool UnsetMetadataSyncedForAll(void);
static void ErrorIfCoordinatorMetadataSetFalse(WorkerNode *workerNode, Datum value,
char *field);
static WorkerNode * SetShouldHaveShards(WorkerNode *workerNode, bool shouldHaveShards);
static void RemoveOldShardPlacementForNodeGroup(int groupId);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(citus_set_coordinator_host);
@ -1291,9 +1291,7 @@ RemoveNodeFromCluster(char *nodeName, int32 nodePort)
*/
DeleteAllReferenceTablePlacementsFromNodeGroup(workerNode->groupId);
}
bool onlyConsiderActivePlacements = false;
if (NodeGroupHasShardPlacements(workerNode->groupId,
onlyConsiderActivePlacements))
if (NodeGroupHasLivePlacements(workerNode->groupId))
{
if (ClusterHasReferenceTable())
{
@ -1320,6 +1318,8 @@ RemoveNodeFromCluster(char *nodeName, int32 nodePort)
DeleteNodeRow(workerNode->workerName, nodePort);
RemoveOldShardPlacementForNodeGroup(workerNode->groupId);
char *nodeDeleteCommand = NodeDeleteCommand(workerNode->nodeId);
/* make sure we don't have any lingering session lifespan connections */
@ -1329,6 +1329,29 @@ RemoveNodeFromCluster(char *nodeName, int32 nodePort)
}
/*
* RemoveOldShardPlacementForNodeGroup removes all old shard placements
* for the given node group from pg_dist_placement.
*/
static void
RemoveOldShardPlacementForNodeGroup(int groupId)
{
/*
* Prevent concurrent deferred drop
*/
LockPlacementCleanup();
List *shardPlacementsOnNode = AllShardPlacementsOnNodeGroup(groupId);
GroupShardPlacement *placement = NULL;
foreach_ptr(placement, shardPlacementsOnNode)
{
if (placement->shardState == SHARD_STATE_TO_DELETE)
{
DeleteShardPlacementRow(placement->placementId);
}
}
}
/*
* CanRemoveReferenceTablePlacements returns true if active primary
* node count is more than 1, which means that even if we remove a node
@ -1384,16 +1407,34 @@ AddNodeMetadata(char *nodeName, int32 nodePort,
*nodeAlreadyExists = false;
/*
* Take an exclusive lock on pg_dist_node to serialize node changes.
* Prevent / wait for concurrent modification before checking whether
* the worker already exists in pg_dist_node.
*/
LockRelationOid(DistNodeRelationId(), RowShareLock);
WorkerNode *workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);
if (workerNode != NULL)
{
/* return early without holding locks when the node already exists */
*nodeAlreadyExists = true;
return workerNode->nodeId;
}
/*
* We are going to change pg_dist_node, so prevent any concurrent reads that
* are not tolerant to concurrent node addition by taking an exclusive
* lock (conflicts with all but AccessShareLock).
*
* We may want to relax or have more fine-grained locking in the future
* to allow users to add multiple nodes concurrently.
*/
LockRelationOid(DistNodeRelationId(), ExclusiveLock);
WorkerNode *workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);
/* recheck in case two node additions pass the first check concurrently */
workerNode = FindWorkerNodeAnyCluster(nodeName, nodePort);
if (workerNode != NULL)
{
/* fill return data and return */
*nodeAlreadyExists = true;
return workerNode->nodeId;
@ -1800,7 +1841,7 @@ InsertPlaceholderCoordinatorRecord(void)
bool nodeAlreadyExists = false;
/* as long as there is a single node, localhost should be ok */
AddNodeMetadata("localhost", PostPortNumber, &nodeMetadata, &nodeAlreadyExists);
AddNodeMetadata(LocalHostName, PostPortNumber, &nodeMetadata, &nodeAlreadyExists);
}


@ -332,7 +332,7 @@ DropShards(Oid relationId, char *schemaName, char *relationName,
*/
if (MultiShardCommitProtocol == COMMIT_PROTOCOL_2PC)
{
CoordinatedTransactionUse2PC();
CoordinatedTransactionShouldUse2PC();
}
List *dropTaskList = DropTaskList(relationId, schemaName, relationName,


@ -85,6 +85,7 @@ PG_FUNCTION_INFO_V1(master_get_table_ddl_events);
PG_FUNCTION_INFO_V1(master_get_new_shardid);
PG_FUNCTION_INFO_V1(master_get_new_placementid);
PG_FUNCTION_INFO_V1(master_get_active_worker_nodes);
PG_FUNCTION_INFO_V1(citus_get_active_worker_nodes);
PG_FUNCTION_INFO_V1(master_get_round_robin_candidate_nodes);
PG_FUNCTION_INFO_V1(master_stage_shard_row);
PG_FUNCTION_INFO_V1(master_stage_shard_placement_row);
@ -442,12 +443,12 @@ master_stage_shard_placement_row(PG_FUNCTION_ARGS)
/*
* master_get_active_worker_nodes returns a set of active worker host names and
* citus_get_active_worker_nodes returns a set of active worker host names and
* port numbers in deterministic order. Currently we assume that all worker
* nodes in pg_dist_node are active.
*/
Datum
master_get_active_worker_nodes(PG_FUNCTION_ARGS)
citus_get_active_worker_nodes(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
uint32 workerNodeIndex = 0;
@ -512,6 +513,16 @@ master_get_active_worker_nodes(PG_FUNCTION_ARGS)
}
/*
* master_get_active_worker_nodes is a wrapper function for the old UDF name.
*/
Datum
master_get_active_worker_nodes(PG_FUNCTION_ARGS)
{
return citus_get_active_worker_nodes(fcinfo);
}
/* Finds the relationId from a potentially qualified relation name. */
Oid
ResolveRelationId(text *relationName, bool missingOk)


@ -16,6 +16,7 @@
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/shard_cleaner.h"
#include "distributed/resource_lock.h"
#include "distributed/worker_transaction.h"
@ -23,7 +24,7 @@
PG_FUNCTION_INFO_V1(master_defer_delete_shards);
static int DropMarkedShards(void);
static int DropMarkedShards(bool waitForCleanupLock);
/*
@ -44,7 +45,8 @@ master_defer_delete_shards(PG_FUNCTION_ARGS)
CheckCitusVersion(ERROR);
EnsureCoordinator();
int droppedShardCount = DropMarkedShards();
bool waitForCleanupLock = true;
int droppedShardCount = DropMarkedShards(waitForCleanupLock);
PG_RETURN_INT32(droppedShardCount);
}
@ -55,14 +57,14 @@ master_defer_delete_shards(PG_FUNCTION_ARGS)
* any errors to make it safe to use in the maintenance daemon.
*/
int
TryDropMarkedShards(void)
TryDropMarkedShards(bool waitForCleanupLock)
{
int droppedShardCount = 0;
MemoryContext savedContext = CurrentMemoryContext;
PG_TRY();
{
droppedShardCount = DropMarkedShards();
droppedShardCount = DropMarkedShards(waitForCleanupLock);
}
PG_CATCH();
{
@ -88,9 +90,15 @@ TryDropMarkedShards(void)
* group and continues with others. The group that has been skipped will be
* removed at a later time when there are no locks held anymore on those
* placements.
*
* Before doing any of this, it takes an exclusive PlacementCleanup lock.
* This is to ensure that this function is not run concurrently;
* otherwise really bad race conditions are possible, such as removing all
* placements of a shard. waitForCleanupLock indicates whether this function
* should wait for the lock or return with a warning.
*/
static int
DropMarkedShards(void)
DropMarkedShards(bool waitForCleanupLock)
{
int removedShardCount = 0;
ListCell *shardPlacementCell = NULL;
@ -100,6 +108,16 @@ DropMarkedShards(void)
return removedShardCount;
}
if (waitForCleanupLock)
{
LockPlacementCleanup();
}
else if (!TryLockPlacementCleanup())
{
ereport(WARNING, (errmsg("could not acquire lock to cleanup placements")));
return 0;
}
List *shardPlacementList = AllShardPlacementsWithShardPlacementState(
SHARD_STATE_TO_DELETE);
foreach(shardPlacementCell, shardPlacementList)


@ -30,7 +30,6 @@
#include "distributed/connection_management.h"
#include "distributed/enterprise.h"
#include "distributed/hash_helpers.h"
#include "distributed/intermediate_result_pruning.h"
#include "distributed/listutils.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_cache.h"
@ -647,12 +646,12 @@ SetupRebalanceMonitor(List *placementUpdateList, Oid relationId)
List *colocatedUpdateList = GetColocatedRebalanceSteps(placementUpdateList);
ListCell *colocatedUpdateCell = NULL;
ProgressMonitorData *monitor = CreateProgressMonitor(REBALANCE_ACTIVITY_MAGIC_NUMBER,
list_length(colocatedUpdateList),
sizeof(
PlacementUpdateEventProgress),
relationId);
PlacementUpdateEventProgress *rebalanceSteps = monitor->steps;
dsm_handle dsmHandle;
ProgressMonitorData *monitor = CreateProgressMonitor(
list_length(colocatedUpdateList),
sizeof(PlacementUpdateEventProgress),
&dsmHandle);
PlacementUpdateEventProgress *rebalanceSteps = ProgressMonitorSteps(monitor);
int32 eventIndex = 0;
foreach(colocatedUpdateCell, colocatedUpdateList)
@ -670,6 +669,7 @@ SetupRebalanceMonitor(List *placementUpdateList, Oid relationId)
eventIndex++;
}
RegisterProgressMonitor(REBALANCE_ACTIVITY_MAGIC_NUMBER, relationId, dsmHandle);
}
@ -814,7 +814,7 @@ citus_drain_node(PG_FUNCTION_ARGS)
char *nodeName = text_to_cstring(nodeNameText);
int connectionFlag = FORCE_NEW_CONNECTION;
MultiConnection *connection = GetNodeConnection(connectionFlag, LOCAL_HOST_NAME,
MultiConnection *connection = GetNodeConnection(connectionFlag, LocalHostName,
PostPortNumber);
/*
@ -972,7 +972,6 @@ Datum
get_rebalance_progress(PG_FUNCTION_ARGS)
{
List *segmentList = NIL;
ListCell *rebalanceMonitorCell = NULL;
TupleDesc tupdesc;
Tuplestorestate *tupstore = SetupTuplestore(fcinfo, &tupdesc);
@ -980,11 +979,11 @@ get_rebalance_progress(PG_FUNCTION_ARGS)
List *rebalanceMonitorList = ProgressMonitorList(REBALANCE_ACTIVITY_MAGIC_NUMBER,
&segmentList);
foreach(rebalanceMonitorCell, rebalanceMonitorList)
ProgressMonitorData *monitor = NULL;
foreach_ptr(monitor, rebalanceMonitorList)
{
ProgressMonitorData *monitor = lfirst(rebalanceMonitorCell);
PlacementUpdateEventProgress *placementUpdateEvents = monitor->steps;
PlacementUpdateEventProgress *placementUpdateEvents = ProgressMonitorSteps(
monitor);
for (int eventIndex = 0; eventIndex < monitor->stepCount; eventIndex++)
{
PlacementUpdateEventProgress *step = placementUpdateEvents + eventIndex;
@ -1201,7 +1200,7 @@ UpdateShardPlacement(PlacementUpdateEvent *placementUpdateEvent,
sourceNode->workerPort,
REBALANCE_PROGRESS_MOVING);
MultiConnection *connection = GetNodeConnection(connectionFlag, LOCAL_HOST_NAME,
MultiConnection *connection = GetNodeConnection(connectionFlag, LocalHostName,
PostPortNumber);
/*
@ -2141,9 +2140,9 @@ UpdateColocatedShardPlacementProgress(uint64 shardId, char *sourceName, int sour
{
ProgressMonitorData *header = GetCurrentProgressMonitor();
if (header != NULL && header->steps != NULL)
if (header != NULL)
{
PlacementUpdateEventProgress *steps = header->steps;
PlacementUpdateEventProgress *steps = ProgressMonitorSteps(header);
ListCell *colocatedShardIntervalCell = NULL;
ShardInterval *shardInterval = LoadShardInterval(shardId);

View File

@ -32,7 +32,9 @@
#include "distributed/connection_management.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/distributed_planner.h"
#include "distributed/foreign_key_relationship.h"
#include "distributed/listutils.h"
#include "distributed/lock_graph.h"
#include "distributed/multi_client_executor.h"
#include "distributed/multi_executor.h"
#include "distributed/metadata_utility.h"
@ -65,12 +67,22 @@ static List * RelationShardListForShardCreate(ShardInterval *shardInterval);
static bool WorkerShardStats(ShardPlacement *placement, Oid relationId,
const char *shardName, uint64 *shardSize,
text **shardMinValue, text **shardMaxValue);
static void UpdateTableStatistics(Oid relationId);
static void ReceiveAndUpdateShardsSizeAndMinMax(List *connectionList);
static void UpdateShardSizeAndMinMax(uint64 shardId, ShardInterval *shardInterval, Oid
relationId, List *shardPlacementList, uint64
shardSize, text *shardMinValue,
text *shardMaxValue);
static bool ProcessShardStatisticsRow(PGresult *result, int64 rowIndex, uint64 *shardId,
text **shardMinValue, text **shardMaxValue,
uint64 *shardSize);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_create_empty_shard);
PG_FUNCTION_INFO_V1(master_append_table_to_shard);
PG_FUNCTION_INFO_V1(citus_update_shard_statistics);
PG_FUNCTION_INFO_V1(master_update_shard_statistics);
PG_FUNCTION_INFO_V1(citus_update_table_statistics);
/*
@ -361,6 +373,23 @@ citus_update_shard_statistics(PG_FUNCTION_ARGS)
}
/*
* citus_update_table_statistics updates metadata (shard size and shard min/max
* values) of the shards of the given table.
*/
Datum
citus_update_table_statistics(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
CheckCitusVersion(ERROR);
UpdateTableStatistics(distributedTableId);
PG_RETURN_VOID();
}
/*
* master_update_shard_statistics is a wrapper function for the old UDF name.
*/
@ -782,7 +811,6 @@ UpdateShardStatistics(int64 shardId)
{
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid relationId = shardInterval->relationId;
char storageType = shardInterval->storageType;
bool statsOK = false;
uint64 shardSize = 0;
text *minValue = NULL;
@ -825,17 +853,166 @@ UpdateShardStatistics(int64 shardId)
errdetail("Setting shard statistics to NULL")));
}
/* make sure we don't process cancel signals */
HOLD_INTERRUPTS();
UpdateShardSizeAndMinMax(shardId, shardInterval, relationId, shardPlacementList,
shardSize, minValue, maxValue);
return shardSize;
}
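The HOLD_INTERRUPTS() above defers cancel requests while the shard metadata rows are deleted and re-inserted, so the update cannot be torn in half. A rough standalone analogue using POSIX signal masking, with SIGINT standing in for a cancel request (update_metadata is an illustrative stub, not the Citus function):

#include <signal.h>

/* illustrative stand-in for the delete/insert pair on the metadata rows */
static void
update_metadata(void)
{
}

static void
update_with_interrupts_held(void)
{
	sigset_t blockSet;
	sigset_t savedSet;

	sigemptyset(&blockSet);
	sigaddset(&blockSet, SIGINT);

	/* like HOLD_INTERRUPTS(): cancel requests are deferred for now */
	sigprocmask(SIG_BLOCK, &blockSet, &savedSet);

	update_metadata();

	/* like RESUME_INTERRUPTS(): deferred signals are delivered afterwards */
	sigprocmask(SIG_SETMASK, &savedSet, NULL);
}

int
main(void)
{
	update_with_interrupts_held();
	return 0;
}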
/* update metadata for each shard placement we appended to */
/*
* UpdateTableStatistics updates metadata (shard size and shard min/max values)
* of the shards of the given table. Follows similar logic to the citus_shard_sizes() function.
*/
static void
UpdateTableStatistics(Oid relationId)
{
List *citusTableIds = NIL;
citusTableIds = lappend_oid(citusTableIds, relationId);
/* we want to use a distributed transaction here to detect distributed deadlocks */
bool useDistributedTransaction = true;
/* we also want shard min/max values for append distributed tables */
bool useShardMinMaxQuery = true;
List *connectionList = SendShardStatisticsQueriesInParallel(citusTableIds,
useDistributedTransaction,
useShardMinMaxQuery);
ReceiveAndUpdateShardsSizeAndMinMax(connectionList);
}
/*
* ReceiveAndUpdateShardsSizeAndMinMax receives shard id, size, and
* min/max results from the given connection list, and updates the
* respective entries in pg_dist_placement and pg_dist_shard.
*/
static void
ReceiveAndUpdateShardsSizeAndMinMax(List *connectionList)
{
/*
* The results in the connection list arrive per placement, not per
* shard. We use a hash table to remember already-visited shard ids,
* since we update all the placements of a shard at once.
*/
HTAB *alreadyVisitedShardPlacements = CreateOidVisitedHashSet();
MultiConnection *connection = NULL;
foreach_ptr(connection, connectionList)
{
if (PQstatus(connection->pgConn) != CONNECTION_OK)
{
continue;
}
bool raiseInterrupts = true;
PGresult *result = GetRemoteCommandResult(connection, raiseInterrupts);
if (!IsResponseOK(result))
{
ReportResultError(connection, result, WARNING);
continue;
}
int64 rowCount = PQntuples(result);
int64 colCount = PQnfields(result);
/* this is not expected, but guard against it anyway */
if (colCount != UPDATE_SHARD_STATISTICS_COLUMN_COUNT)
{
ereport(WARNING, (errmsg("unexpected number of columns from "
"citus_update_table_statistics")));
continue;
}
for (int64 rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
uint64 shardId = 0;
text *shardMinValue = NULL;
text *shardMaxValue = NULL;
uint64 shardSize = 0;
if (!ProcessShardStatisticsRow(result, rowIndex, &shardId, &shardMinValue,
&shardMaxValue, &shardSize))
{
/* this row has no valid shard statistics */
continue;
}
if (OidVisited(alreadyVisitedShardPlacements, shardId))
{
/* We have already updated this placement list */
continue;
}
VisitOid(alreadyVisitedShardPlacements, shardId);
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid relationId = shardInterval->relationId;
List *shardPlacementList = ActiveShardPlacementList(shardId);
UpdateShardSizeAndMinMax(shardId, shardInterval, relationId,
shardPlacementList, shardSize, shardMinValue,
shardMaxValue);
}
PQclear(result);
ForgetResults(connection);
}
hash_destroy(alreadyVisitedShardPlacements);
}
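Since the result rows above come back one per placement, the visited hash set is what keeps a shard from being updated once per replica. A dependency-free sketch of the same dedup idea (Citus uses a real hash table via CreateOidVisitedHashSet; the linear array here just keeps the example self-contained):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_VISITED 1024

static uint64_t visited[MAX_VISITED];
static int visitedCount = 0;

static bool
oid_visited(uint64_t shardId)
{
	for (int i = 0; i < visitedCount; i++)
	{
		if (visited[i] == shardId)
		{
			return true;
		}
	}
	return false;
}

static void
visit_oid(uint64_t shardId)
{
	if (visitedCount < MAX_VISITED)
	{
		visited[visitedCount++] = shardId;
	}
}

int
main(void)
{
	/* one row per placement; shard 102008 appears twice */
	uint64_t placementRows[] = { 102008, 102009, 102008 };

	for (int i = 0; i < 3; i++)
	{
		if (oid_visited(placementRows[i]))
		{
			continue; /* all placements of this shard already updated */
		}
		visit_oid(placementRows[i]);
		printf("updating shard %llu\n", (unsigned long long) placementRows[i]);
	}
	return 0;
}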
/*
* ProcessShardStatisticsRow processes a row of shard statistics of the input PGresult
* - it returns true if this row belongs to a valid shard
* - it returns false if this row has no valid shard statistics (shardId = INVALID_SHARD_ID)
*/
static bool
ProcessShardStatisticsRow(PGresult *result, int64 rowIndex, uint64 *shardId,
text **shardMinValue, text **shardMaxValue, uint64 *shardSize)
{
*shardId = ParseIntField(result, rowIndex, 0);
/* check for the dummy entries we put so that UNION ALL wouldn't complain */
if (*shardId == INVALID_SHARD_ID)
{
/* this row has no valid shard statistics */
return false;
}
char *minValueResult = PQgetvalue(result, rowIndex, 1);
char *maxValueResult = PQgetvalue(result, rowIndex, 2);
*shardMinValue = cstring_to_text(minValueResult);
*shardMaxValue = cstring_to_text(maxValueResult);
*shardSize = ParseIntField(result, rowIndex, 3);
return true;
}
/*
* UpdateShardSizeAndMinMax updates the shardlength (shard size) of the given
* shard and its placements in pg_dist_placement, and updates the shard min value
* and shard max value of the given shard in pg_dist_shard if the relationId belongs
* to an append-distributed table
*/
static void
UpdateShardSizeAndMinMax(uint64 shardId, ShardInterval *shardInterval, Oid relationId,
List *shardPlacementList, uint64 shardSize, text *shardMinValue,
text *shardMaxValue)
{
char storageType = shardInterval->storageType;
ShardPlacement *placement = NULL;
/* update metadata for each shard placement */
foreach_ptr(placement, shardPlacementList)
{
uint64 placementId = placement->placementId;
int32 groupId = placement->groupId;
DeleteShardPlacementRow(placementId);
InsertShardPlacementRow(shardId, placementId, SHARD_STATE_ACTIVE, shardSize,
InsertShardPlacementRow(shardId, placementId, SHARD_STATE_ACTIVE,
shardSize,
groupId);
}
@ -843,18 +1020,9 @@ UpdateShardStatistics(int64 shardId)
if (IsCitusTableType(relationId, APPEND_DISTRIBUTED))
{
DeleteShardRow(shardId);
InsertShardRow(relationId, shardId, storageType, minValue, maxValue);
InsertShardRow(relationId, shardId, storageType, shardMinValue,
shardMaxValue);
}
if (QueryCancelPending)
{
ereport(WARNING, (errmsg("cancel requests are ignored during metadata update")));
QueryCancelPending = false;
}
RESUME_INTERRUPTS();
return shardSize;
}

View File

@ -38,10 +38,8 @@
#include "utils/rel.h"
#include "utils/syscache.h"
static void UpdateTaskQueryString(Query *query, Oid distributedTableId,
RangeTblEntry *valuesRTE, Task *task);
static bool ReplaceRelationConstraintByShardConstraint(List *relationShardList,
OnConflictExpr *onConflict);
static void UpdateTaskQueryString(Query *query, Task *task);
static RelationShard * FindRelationShard(Oid inputRelationId, List *relationShardList);
static void ConvertRteToSubqueryWithEmptyResult(RangeTblEntry *rte);
static bool ShouldLazyDeparseQuery(Task *task);
@ -57,27 +55,43 @@ RebuildQueryStrings(Job *workerJob)
{
Query *originalQuery = workerJob->jobQuery;
List *taskList = workerJob->taskList;
Oid relationId = ((RangeTblEntry *) linitial(originalQuery->rtable))->relid;
RangeTblEntry *valuesRTE = ExtractDistributedInsertValuesRTE(originalQuery);
Task *task = NULL;
if (originalQuery->commandType == CMD_INSERT)
{
AddInsertAliasIfNeeded(originalQuery);
}
foreach_ptr(task, taskList)
{
Query *query = originalQuery;
if (UpdateOrDeleteQuery(query) && list_length(taskList) > 1)
/*
* Copy the query if there are multiple tasks. If there is a single
* task, we scribble on the original query to avoid the copying
* overhead.
*/
if (list_length(taskList) > 1)
{
query = copyObject(originalQuery);
}
if (UpdateOrDeleteQuery(query))
{
/*
* For UPDATE and DELETE queries, we may have subqueries and joins, so
* we use relation shard list to update shard names and call
* pg_get_query_def() directly.
*/
List *relationShardList = task->relationShardList;
UpdateRelationToShardNames((Node *) query, relationShardList);
}
else if (query->commandType == CMD_INSERT && task->modifyWithSubquery)
{
/* for INSERT..SELECT, adjust shard names in SELECT part */
List *relationShardList = task->relationShardList;
ShardInterval *shardInterval = LoadShardInterval(task->anchorShardId);
query = copyObject(originalQuery);
RangeTblEntry *copiedInsertRte = ExtractResultRelationRTEOrError(query);
RangeTblEntry *copiedSubqueryRte = ExtractSelectRangeTableEntry(query);
Query *copiedSubquery = copiedSubqueryRte->subquery;
@ -90,29 +104,18 @@ RebuildQueryStrings(Job *workerJob)
ReorderInsertSelectTargetLists(query, copiedInsertRte, copiedSubqueryRte);
/* setting an alias simplifies deparsing of RETURNING */
if (copiedInsertRte->alias == NULL)
{
Alias *alias = makeAlias(CITUS_TABLE_ALIAS, NIL);
copiedInsertRte->alias = alias;
}
UpdateRelationToShardNames((Node *) copiedSubquery, relationShardList);
}
else if (query->commandType == CMD_INSERT && (query->onConflict != NULL ||
valuesRTE != NULL))
if (query->commandType == CMD_INSERT)
{
RangeTblEntry *modifiedRelationRTE = linitial(originalQuery->rtable);
/*
* Always an alias in UPSERTs and multi-row INSERTs to avoid
* deparsing issues (e.g. RETURNING might reference the original
* table name, which has been replaced by a shard name).
* We store the modified relation ID in the task so we can lazily call
* deparse_shard_query when the string is needed
*/
RangeTblEntry *rangeTableEntry = linitial(query->rtable);
if (rangeTableEntry->alias == NULL)
{
Alias *alias = makeAlias(CITUS_TABLE_ALIAS, NIL);
rangeTableEntry->alias = alias;
}
task->anchorDistributedTableId = modifiedRelationRTE->relid;
}
bool isQueryObjectOrText = GetTaskQueryType(task) == TASK_QUERY_TEXT ||
@ -122,7 +125,7 @@ RebuildQueryStrings(Job *workerJob)
? "(null)"
: ApplyLogRedaction(TaskQueryString(task)))));
UpdateTaskQueryString(query, relationId, valuesRTE, task);
UpdateTaskQueryString(query, task);
/*
* If parameters were resolved in the job query, then they are now also
@ -136,54 +139,69 @@ RebuildQueryStrings(Job *workerJob)
}
/*
* AddInsertAliasIfNeeded adds an alias in UPSERTs and multi-row INSERTs to avoid
* deparsing issues (e.g. RETURNING might reference the original table name,
* which has been replaced by a shard name).
*/
void
AddInsertAliasIfNeeded(Query *query)
{
Assert(query->commandType == CMD_INSERT);
if (query->onConflict == NULL &&
ExtractDistributedInsertValuesRTE(query) == NULL)
{
/* simple single-row insert does not need an alias */
return;
}
RangeTblEntry *rangeTableEntry = linitial(query->rtable);
if (rangeTableEntry->alias != NULL)
{
/* INSERT already has an alias */
return;
}
Alias *alias = makeAlias(CITUS_TABLE_ALIAS, NIL);
rangeTableEntry->alias = alias;
}
/*
* UpdateTaskQueryString updates the query string stored within the provided
* Task. If the Task has row values from a multi-row INSERT, those are injected
* into the provided query (using the provided valuesRTE, which must belong to
* the query) before deparse occurs (the query's full VALUES list will be
* restored before this function returns).
* into the provided query before deparse occurs (the query's full VALUES list
* will be restored before this function returns).
*/
static void
UpdateTaskQueryString(Query *query, Oid distributedTableId, RangeTblEntry *valuesRTE,
Task *task)
UpdateTaskQueryString(Query *query, Task *task)
{
List *oldValuesLists = NIL;
if (valuesRTE != NULL)
{
Assert(valuesRTE->rtekind == RTE_VALUES);
Assert(task->rowValuesLists != NULL);
oldValuesLists = valuesRTE->values_lists;
valuesRTE->values_lists = task->rowValuesLists;
}
if (query->commandType != CMD_INSERT)
{
/*
* For UPDATE and DELETE queries, we may have subqueries and joins, so
* we use relation shard list to update shard names and call
* pg_get_query_def() directly.
*/
List *relationShardList = task->relationShardList;
UpdateRelationToShardNames((Node *) query, relationShardList);
}
else if (ShouldLazyDeparseQuery(task))
{
/*
* not all insert queries are copied before calling this
* function, so we do it here
*/
query = copyObject(query);
}
RangeTblEntry *valuesRTE = NULL;
if (query->commandType == CMD_INSERT)
{
/*
* We store this in the task so we can lazily call
* deparse_shard_query when the string is needed
*/
task->anchorDistributedTableId = distributedTableId;
/* extract the VALUES from the INSERT */
valuesRTE = ExtractDistributedInsertValuesRTE(query);
if (valuesRTE != NULL)
{
Assert(valuesRTE->rtekind == RTE_VALUES);
Assert(task->rowValuesLists != NULL);
oldValuesLists = valuesRTE->values_lists;
valuesRTE->values_lists = task->rowValuesLists;
}
if (ShouldLazyDeparseQuery(task))
{
/*
* not all insert queries are copied before calling this
* function, so we do it here
*/
query = copyObject(query);
}
}
SetTaskQueryIfShouldLazyDeparse(task, query);
@ -266,124 +284,6 @@ UpdateRelationToShardNames(Node *node, List *relationShardList)
}
/*
* UpdateRelationsToLocalShardTables walks over the query tree and appends shard ids to
* relations. The caller is responsible for ensuring that the resulting Query can
* be executed locally.
*/
bool
UpdateRelationsToLocalShardTables(Node *node, List *relationShardList)
{
if (node == NULL)
{
return false;
}
/* want to look at all RTEs, even in subqueries, CTEs and such */
if (IsA(node, Query))
{
return query_tree_walker((Query *) node, UpdateRelationsToLocalShardTables,
relationShardList, QTW_EXAMINE_RTES_BEFORE);
}
if (IsA(node, OnConflictExpr))
{
OnConflictExpr *onConflict = (OnConflictExpr *) node;
return ReplaceRelationConstraintByShardConstraint(relationShardList, onConflict);
}
if (!IsA(node, RangeTblEntry))
{
return expression_tree_walker(node, UpdateRelationsToLocalShardTables,
relationShardList);
}
RangeTblEntry *newRte = (RangeTblEntry *) node;
if (newRte->rtekind != RTE_RELATION)
{
return false;
}
RelationShard *relationShard = FindRelationShard(newRte->relid,
relationShardList);
/* the function should only be called with local shards */
if (relationShard == NULL)
{
return true;
}
Oid shardOid = GetTableLocalShardOid(relationShard->relationId,
relationShard->shardId);
newRte->relid = shardOid;
return false;
}
/*
* ReplaceRelationConstraintByShardConstraint replaces given OnConflictExpr's
* constraint id with constraint id of the corresponding shard.
*/
static bool
ReplaceRelationConstraintByShardConstraint(List *relationShardList,
OnConflictExpr *onConflict)
{
Oid constraintId = onConflict->constraint;
if (!OidIsValid(constraintId))
{
return false;
}
Oid constraintRelationId = InvalidOid;
HeapTuple heapTuple = SearchSysCache1(CONSTROID, ObjectIdGetDatum(constraintId));
if (HeapTupleIsValid(heapTuple))
{
Form_pg_constraint contup = (Form_pg_constraint) GETSTRUCT(heapTuple);
constraintRelationId = contup->conrelid;
ReleaseSysCache(heapTuple);
}
/*
* We can return here without calling the walker function, since we know there
* will be no possible tables or constraints after this point, by the syntax.
*/
if (!OidIsValid(constraintRelationId))
{
ereport(ERROR, (errmsg("Invalid relation id (%u) for constraint: %s",
constraintRelationId, get_constraint_name(constraintId))));
}
RelationShard *relationShard = FindRelationShard(constraintRelationId,
relationShardList);
if (relationShard != NULL)
{
char *constraintName = get_constraint_name(constraintId);
AppendShardIdToName(&constraintName, relationShard->shardId);
Oid shardOid = GetTableLocalShardOid(relationShard->relationId,
relationShard->shardId);
Oid shardConstraintId = get_relation_constraint_oid(shardOid, constraintName,
false);
onConflict->constraint = shardConstraintId;
return false;
}
return true;
}
/*
* FindRelationShard finds the RelationShard for the shard relation with the
* given Oid, if it exists in the given relationShardList. Otherwise, returns NULL.

View File

@ -49,6 +49,7 @@
#include "executor/executor.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/pg_list.h"
#include "parser/parsetree.h"
#include "parser/parse_type.h"
#if PG_VERSION_NUM >= PG_VERSION_12
@ -98,6 +99,7 @@ static PlannedStmt * FinalizeNonRouterPlan(PlannedStmt *localPlan,
DistributedPlan *distributedPlan,
CustomScan *customScan);
static PlannedStmt * FinalizeRouterPlan(PlannedStmt *localPlan, CustomScan *customScan);
static AppendRelInfo * FindTargetAppendRelInfo(PlannerInfo *root, int relationRteIndex);
static List * makeTargetListFromCustomScanList(List *custom_scan_tlist);
static List * makeCustomScanTargetlistFromExistingTargetList(List *existingTargetlist);
static int32 BlessRecordExpressionList(List *exprs);
@ -124,6 +126,7 @@ static PlannedStmt * PlanFastPathDistributedStmt(DistributedPlanningContext *pla
static PlannedStmt * PlanDistributedStmt(DistributedPlanningContext *planContext,
int rteIdCounter);
static RTEListProperties * GetRTEListProperties(List *rangeTableList);
static List * TranslatedVars(PlannerInfo *root, int relationIndex);
/* Distributed planner hook */
@ -165,30 +168,29 @@ distributed_planner(Query *parse,
.boundParams = boundParams,
};
if (fastPathRouterQuery)
{
/*
* We need to copy the parse tree because the FastPathPlanner modifies
* it. In the next branch we do the same for other distributed queries
* too, but for those it needs to be done AFTER calling
* AssignRTEIdentities.
*/
planContext.originalQuery = copyObject(parse);
}
else if (needsDistributedPlanning)
if (needsDistributedPlanning)
{
/*
* standard_planner scribbles on its input, but for deparsing we need the
* unmodified form. Note that before copying we call
* AssignRTEIdentities, which is needed because these identities need
* to be present in the copied query too.
* unmodified form. Before copying we call AssignRTEIdentities to be able
* to match RTEs in the rewritten query tree with those in the original
* tree.
*/
rteIdCounter = AssignRTEIdentities(rangeTableList, rteIdCounter);
planContext.originalQuery = copyObject(parse);
bool setPartitionedTablesInherited = false;
AdjustPartitioningForDistributedPlanning(rangeTableList,
setPartitionedTablesInherited);
/*
* When there are partitioned tables (not applicable to fast path),
* pretend that they are regular tables to avoid unnecessary work
* in standard_planner.
*/
if (!fastPathRouterQuery)
{
bool setPartitionedTablesInherited = false;
AdjustPartitioningForDistributedPlanning(rangeTableList,
setPartitionedTablesInherited);
}
}
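The reordering above matters because the copy must be taken after AssignRTEIdentities (so the identities survive into the copy) but before standard_planner mutates the tree. A toy illustration of the copy-before-scribble discipline (Query, copy_query, and standard_planner_stub are simplified stand-ins, not the Postgres structures):

#include <stdlib.h>
#include <string.h>

/* toy stand-in: a "query" that a planner is allowed to scribble on */
typedef struct Query
{
	int rteIdentity;
	char text[64];
} Query;

static Query *
copy_query(const Query *src)
{
	Query *copy = malloc(sizeof(Query));
	memcpy(copy, src, sizeof(Query));
	return copy;
}

static void
standard_planner_stub(Query *q)
{
	/* the planner rewrites its input in place */
	strcpy(q->text, "rewritten");
}

int
main(void)
{
	Query parse = { 0, "SELECT ..." };

	/* assign identities first so the copy carries them too */
	parse.rteIdentity = 1;
	Query *original = copy_query(&parse);

	standard_planner_stub(&parse);

	/* 'original' still holds the unmodified form needed for deparsing */
	free(original);
	return 0;
}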
/*
@ -447,7 +449,7 @@ AssignRTEIdentity(RangeTblEntry *rangeTableEntry, int rteIdentifier)
{
Assert(rangeTableEntry->rtekind == RTE_RELATION);
rangeTableEntry->values_lists = list_make1_int(rteIdentifier);
rangeTableEntry->values_lists = list_make2_int(rteIdentifier, rangeTableEntry->inh);
}
@ -458,12 +460,24 @@ GetRTEIdentity(RangeTblEntry *rte)
Assert(rte->rtekind == RTE_RELATION);
Assert(rte->values_lists != NIL);
Assert(IsA(rte->values_lists, IntList));
Assert(list_length(rte->values_lists) == 1);
Assert(list_length(rte->values_lists) == 2);
return linitial_int(rte->values_lists);
}
/*
* GetOriginalInh gets the original value of the inheritance flag set by
* AssignRTEIdentity. The planner resets this flag in the rewritten query,
* but we need it during deparsing.
*/
bool
GetOriginalInh(RangeTblEntry *rte)
{
return lsecond_int(rte->values_lists);
}
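AssignRTEIdentity and GetOriginalInh above piggyback two integers on the otherwise-unused values_lists field: the first slot holds the RTE identity, the second the original inh flag that the planner would otherwise clobber. A toy version of the same stash-and-recover pattern (FakeRte is an illustrative struct, not the Postgres RangeTblEntry):

#include <assert.h>
#include <stdbool.h>

/* two-slot stash standing in for the IntList stored in values_lists */
typedef struct FakeRte
{
	int values_lists[2];
} FakeRte;

static void
assign_rte_identity(FakeRte *rte, int rteIdentifier, bool inh)
{
	rte->values_lists[0] = rteIdentifier; /* like linitial_int() */
	rte->values_lists[1] = inh ? 1 : 0;   /* like lsecond_int() */
}

static int
get_rte_identity(const FakeRte *rte)
{
	return rte->values_lists[0];
}

static bool
get_original_inh(const FakeRte *rte)
{
	return rte->values_lists[1] != 0;
}

int
main(void)
{
	FakeRte rte;
	assign_rte_identity(&rte, 42, true);
	assert(get_rte_identity(&rte) == 42);
	assert(get_original_inh(&rte));
	return 0;
}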
/*
* GetQueryLockMode returns the necessary lock mode to be acquired for the
* given query. (See comment written in RangeTblEntry->rellockmode)
@ -1814,6 +1828,8 @@ multi_relation_restriction_hook(PlannerInfo *root, RelOptInfo *relOptInfo,
/* see comments on GetVarFromAssignedParam() */
relationRestriction->outerPlanParamsList = OuterPlanParamsList(root);
relationRestriction->translatedVars = TranslatedVars(root,
relationRestriction->index);
RelationRestrictionContext *relationRestrictionContext =
plannerRestrictionContext->relationRestrictionContext;
@ -1837,6 +1853,61 @@ multi_relation_restriction_hook(PlannerInfo *root, RelOptInfo *relOptInfo,
}
/*
* TranslatedVars deep copies the translated vars for the given relation index
* if there is any append rel list.
*/
static List *
TranslatedVars(PlannerInfo *root, int relationIndex)
{
List *translatedVars = NIL;
if (root->append_rel_list != NIL)
{
AppendRelInfo *targetAppendRelInfo =
FindTargetAppendRelInfo(root, relationIndex);
if (targetAppendRelInfo != NULL)
{
/* postgres deletes translated_vars after pg13, hence we deep copy them here */
Node *targetNode = NULL;
foreach_ptr(targetNode, targetAppendRelInfo->translated_vars)
{
translatedVars =
lappend(translatedVars, copyObject(targetNode));
}
}
}
return translatedVars;
}
/*
* FindTargetAppendRelInfo finds the target append rel info for the given
* relation rte index.
*/
static AppendRelInfo *
FindTargetAppendRelInfo(PlannerInfo *root, int relationRteIndex)
{
AppendRelInfo *appendRelInfo = NULL;
/* iterate on the queries that are part of UNION ALL subselects */
foreach_ptr(appendRelInfo, root->append_rel_list)
{
/*
* We're only interested in the child rel that is equal to the
* relation we're investigating. Here we don't need to find the offset
* because postgres adds an offset to child_relid and parent_relid after
* calling multi_relation_restriction_hook.
*/
if (appendRelInfo->child_relid == relationRteIndex)
{
return appendRelInfo;
}
}
return NULL;
}
/*
* AdjustReadIntermediateResultCost adjusts the row count and total cost
* of a read_intermediate_result call based on the file size.
@ -2143,6 +2214,33 @@ CreateAndPushPlannerRestrictionContext(void)
}
/*
* TranslatedVarsForRteIdentity gets an rteIdentity and returns the
* translatedVars that belong to the range table relation. If no
* translatedVars are found, the function returns NIL.
*/
List *
TranslatedVarsForRteIdentity(int rteIdentity)
{
PlannerRestrictionContext *currentPlannerRestrictionContext =
CurrentPlannerRestrictionContext();
List *relationRestrictionList =
currentPlannerRestrictionContext->relationRestrictionContext->
relationRestrictionList;
RelationRestriction *relationRestriction = NULL;
foreach_ptr(relationRestriction, relationRestrictionList)
{
if (GetRTEIdentity(relationRestriction->rte) == rteIdentity)
{
return relationRestriction->translatedVars;
}
}
return NIL;
}
/*
* CurrentRestrictionContext returns the most recently added
* PlannerRestrictionContext from the plannerRestrictionContextList list.

View File

@ -16,7 +16,9 @@
#include "distributed/local_plan_cache.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/insert_select_planner.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/version_compat.h"
#if PG_VERSION_NUM >= PG_VERSION_12
#include "optimizer/optimizer.h"
@ -26,13 +28,21 @@
#include "optimizer/clauses.h"
static Query * GetLocalShardQueryForCache(Query *jobQuery, Task *task,
ParamListInfo paramListInfo);
static char * DeparseLocalShardQuery(Query *jobQuery, List *relationShardList,
Oid anchorDistributedTableId, int64 anchorShardId);
static int ExtractParameterTypesForParamListInfo(ParamListInfo originalParamListInfo,
Oid **parameterTypes);
/*
* CacheLocalPlanForShardQuery replaces the relation OIDs in the job query
* with shard relation OIDs and then plans the query and caches the result
* in the originalDistributedPlan (which may be preserved across executions).
*/
void
CacheLocalPlanForShardQuery(Task *task, DistributedPlan *originalDistributedPlan)
CacheLocalPlanForShardQuery(Task *task, DistributedPlan *originalDistributedPlan,
ParamListInfo paramListInfo)
{
PlannedStmt *localPlan = GetCachedLocalPlan(task, originalDistributedPlan);
if (localPlan != NULL)
@ -58,14 +68,14 @@ CacheLocalPlanForShardQuery(Task *task, DistributedPlan *originalDistributedPlan
* We prefer to use jobQuery (over task->query) because we don't want any
* functions/params to have been evaluated in the cached plan.
*/
Query *shardQuery = copyObject(originalDistributedPlan->workerJob->jobQuery);
Query *jobQuery = copyObject(originalDistributedPlan->workerJob->jobQuery);
UpdateRelationsToLocalShardTables((Node *) shardQuery, task->relationShardList);
Query *localShardQuery = GetLocalShardQueryForCache(jobQuery, task, paramListInfo);
LOCKMODE lockMode = GetQueryLockMode(shardQuery);
LOCKMODE lockMode = GetQueryLockMode(localShardQuery);
/* fast path queries can only have a single RTE by definition */
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) linitial(shardQuery->rtable);
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) linitial(localShardQuery->rtable);
/*
* If the shard has been created in this transaction, we wouldn't see the relationId
@ -73,24 +83,16 @@ CacheLocalPlanForShardQuery(Task *task, DistributedPlan *originalDistributedPlan
*/
if (rangeTableEntry->relid == InvalidOid)
{
pfree(shardQuery);
pfree(jobQuery);
pfree(localShardQuery);
MemoryContextSwitchTo(oldContext);
return;
}
if (IsLoggableLevel(DEBUG5))
{
StringInfo queryString = makeStringInfo();
pg_get_query_def(shardQuery, queryString);
ereport(DEBUG5, (errmsg("caching plan for query: %s",
queryString->data)));
}
LockRelationOid(rangeTableEntry->relid, lockMode);
LocalPlannedStatement *localPlannedStatement = CitusMakeNode(LocalPlannedStatement);
localPlan = planner_compat(shardQuery, 0, NULL);
localPlan = planner_compat(localShardQuery, 0, NULL);
localPlannedStatement->localPlan = localPlan;
localPlannedStatement->shardId = task->anchorShardId;
localPlannedStatement->localGroupId = GetLocalGroupId();
@ -103,6 +105,130 @@ CacheLocalPlanForShardQuery(Task *task, DistributedPlan *originalDistributedPlan
}
/*
* GetLocalShardQueryForCache is a helper function which generates
* the local shard query based on the jobQuery. The function should
* not be used for general purposes; it is specialized for locally cached
* queries.
*
* Attribute numbers are not guaranteed to be consistent between the shards
* and the shell table (e.g., distributed/reference tables) due to DROP COLUMN
* commands.
*
* To avoid any edge cases due to such discrepancies, we first deparse the
* jobQuery with the tables replaced by shards, and then parse the query string
* back. This is normally a very expensive operation; however, we only do it
* once per cached local plan, which is acceptable.
*/
static Query *
GetLocalShardQueryForCache(Query *jobQuery, Task *task, ParamListInfo orig_paramListInfo)
{
char *shardQueryString =
DeparseLocalShardQuery(jobQuery, task->relationShardList,
task->anchorDistributedTableId,
task->anchorShardId);
ereport(DEBUG5, (errmsg("Local shard query that is going to be cached: %s",
shardQueryString)));
Oid *parameterTypes = NULL;
int numberOfParameters =
ExtractParameterTypesForParamListInfo(orig_paramListInfo, &parameterTypes);
Query *localShardQuery =
ParseQueryString(shardQueryString, parameterTypes, numberOfParameters);
return localShardQuery;
}
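The deparse-then-parse roundtrip above is essentially normalization: whatever attribute numbering the shell-table query carried is discarded at deparse time and re-resolved against the shard at parse time. A toy analogue of normalizing by roundtripping through text (ToyQuery, deparse, and parse are illustrative, not the Citus/Postgres functions):

#include <stdio.h>

typedef struct ToyQuery
{
	int attno;
	char column[32];
} ToyQuery;

static void
deparse(const ToyQuery *q, char *buf, size_t len)
{
	/* only the column name survives deparsing, not the attno */
	snprintf(buf, len, "SELECT %s", q->column);
}

static ToyQuery
parse(const char *text)
{
	ToyQuery q = { 0 };

	/* re-resolve: the attno is assigned fresh during parsing */
	sscanf(text, "SELECT %31s", q.column);
	q.attno = 1;
	return q;
}

int
main(void)
{
	/* the shell relation said attno 3 (after DROP COLUMNs); the shard disagrees */
	ToyQuery shellQuery = { 3, "total" };
	char text[64];

	deparse(&shellQuery, text, sizeof(text));
	ToyQuery shardQuery = parse(text);
	printf("normalized attno: %d\n", shardQuery.attno);
	return 0;
}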
/*
* DeparseLocalShardQuery is a helper function to deparse given jobQuery for the shard(s)
* identified by the relationShardList, anchorDistributedTableId and anchorShardId.
*
* For the details and comparison with TaskQueryString(), see the comments in the function.
*/
static char *
DeparseLocalShardQuery(Query *jobQuery, List *relationShardList, Oid
anchorDistributedTableId, int64 anchorShardId)
{
StringInfo queryString = makeStringInfo();
/*
* We imitate what TaskQueryString() does, but we cannot rely on that function
* as the parameters might have been already resolved on the QueryTree in the
* task. Instead, we operate on the jobQuery, where we are sure that
* coordinator evaluation has not happened.
*
* Local shard queries are only applicable for local cached query execution.
* In the local cached query execution mode, we can use a query structure
* (or query string) with unevaluated expressions as we allow function calls
* to be evaluated when the query on the shard is executed (i.e., we do not
* have coordinator evaluation; instead, the Postgres executor evaluates values).
*
* Additionally, we can allow them to be evaluated again because they are stable,
* and we do not cache plans / use unevaluated query strings for queries containing
* volatile functions.
*/
if (jobQuery->commandType == CMD_INSERT)
{
/*
* We currently do not support INSERT .. SELECT here. To support INSERT..SELECT
* queries, we should update the relation names to shard names in the SELECT
* clause (e.g., UpdateRelationToShardNames()).
*/
Assert(!CheckInsertSelectQuery(jobQuery));
AddInsertAliasIfNeeded(jobQuery);
/*
* For INSERT queries we cannot use pg_get_query_def. Mainly because we
* cannot run UpdateRelationToShardNames on an INSERT query. This is
* because the PG deparsing logic fails when trying to insert into a
* RTE_FUNCTION (which is what will happen if you call
* UpdateRelationToShardNames).
*/
deparse_shard_query(jobQuery, anchorDistributedTableId, anchorShardId,
queryString);
}
else
{
UpdateRelationToShardNames((Node *) jobQuery, relationShardList);
pg_get_query_def(jobQuery, queryString);
}
return queryString->data;
}
/*
* ExtractParameterTypesForParamListInfo is a helper function that extracts
* the parameter types of the given ParamListInfo via its second (output)
* parameter.
*
* The function also returns the number of parameters. If no parameter exists,
* the function returns 0.
*/
static int
ExtractParameterTypesForParamListInfo(ParamListInfo originalParamListInfo,
Oid **parameterTypes)
{
*parameterTypes = NULL;
int numberOfParameters = 0;
if (originalParamListInfo != NULL)
{
const char **parameterValues = NULL;
ParamListInfo paramListInfo = copyParamList(originalParamListInfo);
ExtractParametersForLocalExecution(paramListInfo, parameterTypes,
&parameterValues);
numberOfParameters = paramListInfo->numParams;
}
return numberOfParameters;
}
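The helper above follows a common C convention: the array travels out through a pointer-to-pointer argument and the element count is the return value, with a NULL input mapping to (NULL, 0). A self-contained sketch of that calling convention (ToyParams is an illustrative stand-in for ParamListInfo):

#include <stdlib.h>

/* toy ParamListInfo: just a count and an array of type OIDs */
typedef struct ToyParams
{
	int numParams;
	unsigned int *types;
} ToyParams;

static int
extract_parameter_types(const ToyParams *params, unsigned int **parameterTypes)
{
	*parameterTypes = NULL;
	if (params == NULL)
	{
		/* no parameters: hand back (NULL, 0) */
		return 0;
	}

	*parameterTypes = malloc(params->numParams * sizeof(unsigned int));
	for (int i = 0; i < params->numParams; i++)
	{
		(*parameterTypes)[i] = params->types[i];
	}
	return params->numParams;
}

int
main(void)
{
	unsigned int inTypes[] = { 23, 25 }; /* e.g., int4 and text type OIDs */
	ToyParams params = { 2, inTypes };
	unsigned int *outTypes = NULL;

	int count = extract_parameter_types(&params, &outTypes);
	free(outTypes);
	return count == 2 ? 0 : 1;
}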
/*
* GetCachedLocalPlan is a helper function which returns the cached
* plan in the distributedPlan for the given task, if it exists.

View File

@ -322,10 +322,6 @@ static Node * WorkerLimitCount(Node *limitCount, Node *limitOffset, OrderByLimit
static List * WorkerSortClauseList(Node *limitCount,
List *groupClauseList, List *sortClauseList,
OrderByLimitReference orderByLimitReference);
static List * GenerateNewTargetEntriesForSortClauses(List *originalTargetList,
List *sortClauseList,
AttrNumber *targetProjectionNumber,
Index *nextSortGroupRefIndex);
static bool CanPushDownLimitApproximate(List *sortClauseList, List *targetList);
static bool HasOrderByAggregate(List *sortClauseList, List *targetList);
static bool HasOrderByNonCommutativeAggregate(List *sortClauseList, List *targetList);
@ -1624,7 +1620,19 @@ MasterAggregateExpression(Aggref *originalAggregate,
Expr *directarg;
foreach_ptr(directarg, originalAggregate->aggdirectargs)
{
if (!IsA(directarg, Const) && !IsA(directarg, Param))
/*
* Need to replace nodes that contain any Vars with Vars referring
* to the related column of the result set returned for the worker
* aggregation.
*
* When there are no Vars, then the expression can be fully evaluated
* on the coordinator, so we skip it here. This is not just an
* optimization, but the result of the expression might require
* calling the final function of the aggregate, and doing so when
* there are no input rows (i.e.: with an empty tuple slot) is not
* desirable for the node-executor methods.
*/
if (pull_var_clause_default((Node *) directarg) != NIL)
{
Var *var = makeVar(masterTableId, walkerContext->columnId,
exprType((Node *) directarg),
@ -2705,38 +2713,6 @@ ProcessWindowFunctionsForWorkerQuery(List *windowClauseList,
return;
}
WindowClause *windowClause = NULL;
foreach_ptr(windowClause, windowClauseList)
{
List *partitionClauseTargetList =
GenerateNewTargetEntriesForSortClauses(originalTargetEntryList,
windowClause->partitionClause,
&(queryTargetList->
targetProjectionNumber),
queryWindowClause->
nextSortGroupRefIndex);
List *orderClauseTargetList =
GenerateNewTargetEntriesForSortClauses(originalTargetEntryList,
windowClause->orderClause,
&(queryTargetList->
targetProjectionNumber),
queryWindowClause->
nextSortGroupRefIndex);
/*
* Note that even Citus does push down the window clauses as-is, we may still need to
* add the generated entries to the target list. The reason is that the same aggregates
* might be referred from another target entry that is a bare aggregate (e.g., no window
* functions), which would have been mutated. For instance, when an average aggregate
* is mutated on the target list, the window function would refer to a sum aggregate,
* which is obviously wrong.
*/
queryTargetList->targetEntryList = list_concat(queryTargetList->targetEntryList,
partitionClauseTargetList);
queryTargetList->targetEntryList = list_concat(queryTargetList->targetEntryList,
orderClauseTargetList);
}
queryWindowClause->workerWindowClauseList = windowClauseList;
queryWindowClause->hasWindowFunctions = true;
}
@ -2802,19 +2778,6 @@ ProcessLimitOrderByForWorkerQuery(OrderByLimitReference orderByLimitReference,
groupClauseList,
sortClauseList,
orderByLimitReference);
/*
* TODO: Do we really need to add the target entries if we're not pushing
* down ORDER BY?
*/
List *newTargetEntryListForSortClauses =
GenerateNewTargetEntriesForSortClauses(originalTargetList,
queryOrderByLimit->workerSortClauseList,
&(queryTargetList->targetProjectionNumber),
queryOrderByLimit->nextSortGroupRefIndex);
queryTargetList->targetEntryList =
list_concat(queryTargetList->targetEntryList, newTargetEntryListForSortClauses);
}
@ -3100,7 +3063,13 @@ WorkerAggregateExpressionList(Aggref *originalAggregate,
Expr *directarg;
foreach_ptr(directarg, originalAggregate->aggdirectargs)
{
if (!IsA(directarg, Const) && !IsA(directarg, Param))
/*
* The worker aggregation should execute any node that contains any
* Var nodes and return the result in the targetlist, so that the
* combine query can then fetch the result via remote scan; see
* MasterAggregateExpression.
*/
if (pull_var_clause_default((Node *) directarg) != NIL)
{
workerAggregateList = lappend(workerAggregateList, directarg);
}
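Both hunks above replace a shallow IsA(directarg, Const)/IsA(directarg, Param) test with a recursive scan, because a Var can hide arbitrarily deep inside an expression such as OpExpr(Var(column_a) > 2). A tiny standalone walker showing the difference (ToyNode is illustrative; Citus uses pull_var_clause_default):

#include <stdbool.h>
#include <stddef.h>

/* toy expression nodes: constants, vars, and binary operators */
typedef enum { T_CONST, T_VAR, T_OPEXPR } ToyTag;

typedef struct ToyNode
{
	ToyTag tag;
	struct ToyNode *left;
	struct ToyNode *right;
} ToyNode;

/*
 * Recursive analogue of pull_var_clause_default() != NIL: a shallow
 * node-type test on the root misses Vars buried inside the tree;
 * walking it does not.
 */
static bool
contains_var(const ToyNode *node)
{
	if (node == NULL)
	{
		return false;
	}
	if (node->tag == T_VAR)
	{
		return true;
	}
	return contains_var(node->left) || contains_var(node->right);
}

int
main(void)
{
	ToyNode constNode = { T_CONST, NULL, NULL };
	ToyNode varNode = { T_VAR, NULL, NULL };
	ToyNode opExpr = { T_OPEXPR, &varNode, &constNode };

	/* a shallow check would wave opExpr through; the walker finds the Var */
	return contains_var(&opExpr) ? 0 : 1;
}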
@ -4803,87 +4772,6 @@ WorkerSortClauseList(Node *limitCount, List *groupClauseList, List *sortClauseLi
}
/*
* GenerateNewTargetEntriesForSortClauses goes over provided sort clause lists and
* creates new target entries if needed to make sure sort clauses has correct
* references. The function returns list of new target entries, caller is
* responsible to add those target entries to the end of worker target list.
*
* The function is required because we change the target entry if it contains an
* expression having an aggregate operation, or just the AVG aggregate.
* Afterwards any order by clause referring to original target entry starts
* to point to a wrong expression.
*
* Note the function modifies SortGroupClause items in sortClauseList,
* targetProjectionNumber, and nextSortGroupRefIndex.
*/
static List *
GenerateNewTargetEntriesForSortClauses(List *originalTargetList,
List *sortClauseList,
AttrNumber *targetProjectionNumber,
Index *nextSortGroupRefIndex)
{
List *createdTargetList = NIL;
SortGroupClause *sgClause = NULL;
foreach_ptr(sgClause, sortClauseList)
{
TargetEntry *targetEntry = get_sortgroupclause_tle(sgClause, originalTargetList);
Expr *targetExpr = targetEntry->expr;
bool containsAggregate = contain_aggs_of_level((Node *) targetExpr, 0);
bool createNewTargetEntry = false;
/* we are only interested in target entries containing aggregates */
if (!containsAggregate)
{
continue;
}
/*
* If the target expression is not an Aggref, it is either an expression
* on a single aggregate, or expression containing multiple aggregates.
* Worker query mutates these target entries to have a naked target entry
* per aggregate function. We want to use original target entries if this
* the case.
* If the original target expression is an avg aggref, we also want to use
* original target entry.
*/
if (!IsA(targetExpr, Aggref))
{
createNewTargetEntry = true;
}
else
{
Aggref *aggNode = (Aggref *) targetExpr;
AggregateType aggregateType = GetAggregateType(aggNode);
if (aggregateType == AGGREGATE_AVERAGE)
{
createNewTargetEntry = true;
}
}
if (createNewTargetEntry)
{
bool resJunk = true;
AttrNumber nextResNo = (*targetProjectionNumber);
Expr *newExpr = copyObject(targetExpr);
TargetEntry *newTargetEntry = makeTargetEntry(newExpr, nextResNo,
targetEntry->resname, resJunk);
newTargetEntry->ressortgroupref = *nextSortGroupRefIndex;
createdTargetList = lappend(createdTargetList, newTargetEntry);
sgClause->tleSortGroupRef = *nextSortGroupRefIndex;
(*nextSortGroupRefIndex)++;
(*targetProjectionNumber)++;
}
}
return createdTargetList;
}
/*
* CanPushDownLimitApproximate checks if we can push down the limit clause to
* the worker nodes, and get approximate and meaningful results. We can do this

View File

@ -1534,6 +1534,7 @@ MultiTableNodeList(List *tableEntryList, List *rangeTableList)
tableNode->partitionColumn = partitionColumn;
tableNode->alias = rangeTableEntry->alias;
tableNode->referenceNames = rangeTableEntry->eref;
tableNode->includePartitions = GetOriginalInh(rangeTableEntry);
tableNodeList = lappend(tableNodeList, tableNode);
}

View File

@ -45,6 +45,7 @@
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/log_utils.h"
#include "distributed/pg_dist_partition.h"
@ -743,6 +744,8 @@ BaseRangeTableList(MultiNode *multiNode)
rangeTableEntry->eref = multiTable->referenceNames;
rangeTableEntry->alias = multiTable->alias;
rangeTableEntry->relid = multiTable->relationId;
rangeTableEntry->inh = multiTable->includePartitions;
SetRangeTblExtraData(rangeTableEntry, CITUS_RTE_RELATION, NULL, NULL,
list_make1_int(multiTable->rangeTableId),
NIL, NIL, NIL, NIL);
@ -824,7 +827,21 @@ static List *
QueryTargetList(MultiNode *multiNode)
{
List *projectNodeList = FindNodesOfType(multiNode, T_MultiProject);
Assert(list_length(projectNodeList) > 0);
if (list_length(projectNodeList) == 0)
{
/*
* The physical planner assumes that all worker queries have target
* list entries, based on the fact that at least the columns used in
* the JOINs have to be on the target list. However, there is an
* exception when a cartesian product join is present and no
* additional target list entries belong to one side of the JOIN.
* Once we support cartesian product joins, we should remove this
* error.
*/
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning on this query"),
errdetail("Cartesian products are currently unsupported")));
}
MultiProject *topProjectNode = (MultiProject *) linitial(projectNodeList);
List *columnList = topProjectNode->columnList;
@ -1454,6 +1471,7 @@ ConstructCallingRTE(RangeTblEntry *rangeTableEntry, List *dependentJobList)
callingRTE->rtekind = RTE_RELATION;
callingRTE->eref = rangeTableEntry->eref;
callingRTE->relid = rangeTableEntry->relid;
callingRTE->inh = rangeTableEntry->inh;
}
else if (rangeTableKind == CITUS_RTE_REMOTE_QUERY)
{
@ -4352,16 +4370,8 @@ FragmentAlias(RangeTblEntry *rangeTableEntry, RangeTableFragment *fragment)
Oid relationId = rangeTableEntry->relid;
char *relationName = get_rel_name(relationId);
/*
* If the table is not in the default namespace (public), we include it in
* the fragment alias.
*/
Oid schemaId = get_rel_namespace(relationId);
schemaName = get_namespace_name(schemaId);
if (strncmp(schemaName, "public", NAMEDATALEN) == 0)
{
schemaName = NULL;
}
aliasName = relationName;

View File

@ -555,6 +555,14 @@ ModifyPartialQuerySupported(Query *queryTree, bool multiShardQuery,
{
ListCell *cteCell = NULL;
/* CTEs still not supported for INSERTs. */
if (queryTree->commandType == CMD_INSERT)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"Router planner doesn't support common table expressions with INSERT queries.",
NULL, NULL);
}
foreach(cteCell, queryTree->cteList)
{
CommonTableExpr *cte = (CommonTableExpr *) lfirst(cteCell);
@ -562,31 +570,22 @@ ModifyPartialQuerySupported(Query *queryTree, bool multiShardQuery,
if (cteQuery->commandType != CMD_SELECT)
{
/* Modifying CTEs still not supported for INSERTs & multi shard queries. */
if (queryTree->commandType == CMD_INSERT)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"Router planner doesn't support non-select common table expressions with non-select queries.",
NULL, NULL);
}
/* Modifying CTEs still not supported for multi shard queries. */
if (multiShardQuery)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"Router planner doesn't support non-select common table expressions with multi shard queries.",
NULL, NULL);
}
/* Modifying CTEs exclude both INSERT CTEs & INSERT queries. */
else if (cteQuery->commandType == CMD_INSERT)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"Router planner doesn't support INSERT common table expressions.",
NULL, NULL);
}
}
/* Modifying CTEs exclude both INSERT CTEs & INSERT queries. */
if (cteQuery->commandType == CMD_INSERT)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"Router planner doesn't support INSERT common table expressions.",
NULL, NULL);
}
if (cteQuery->hasForUpdate &&
FindNodeMatchingCheckFunctionInRangeTableList(cteQuery->rtable,
IsReferenceTableRTE))
@ -2433,7 +2432,7 @@ CreateLocalDummyPlacement()
{
ShardPlacement *dummyPlacement = CitusMakeNode(ShardPlacement);
dummyPlacement->nodeId = LOCAL_NODE_ID;
dummyPlacement->nodeName = LOCAL_HOST_NAME;
dummyPlacement->nodeName = LocalHostName;
dummyPlacement->nodePort = PostPortNumber;
dummyPlacement->groupId = GetLocalGroupId();
return dummyPlacement;

View File

@ -61,6 +61,8 @@ typedef struct AttributeEquivalenceClass
{
uint32 equivalenceId;
List *equivalentAttributes;
Index unionQueryPartitionKeyIndex;
} AttributeEquivalenceClass;
/*
@ -83,7 +85,8 @@ typedef struct AttributeEquivalenceClassMember
static bool ContextContainsLocalRelation(RelationRestrictionContext *restrictionContext);
static Var * FindUnionAllVar(PlannerInfo *root, List *appendRelList, Oid relationOid,
static int RangeTableOffsetCompat(PlannerInfo *root, AppendRelInfo *appendRelInfo);
static Var * FindUnionAllVar(PlannerInfo *root, List *translatedVars, Oid relationOid,
Index relationRteIndex, Index *partitionKeyIndex);
static bool ContainsMultipleDistributedRelations(PlannerRestrictionContext *
plannerRestrictionContext);
@ -91,11 +94,11 @@ static List * GenerateAttributeEquivalencesForRelationRestrictions(
RelationRestrictionContext *restrictionContext);
static AttributeEquivalenceClass * AttributeEquivalenceClassForEquivalenceClass(
EquivalenceClass *plannerEqClass, RelationRestriction *relationRestriction);
static void AddToAttributeEquivalenceClass(AttributeEquivalenceClass **
static void AddToAttributeEquivalenceClass(AttributeEquivalenceClass *
attributeEquivalenceClass,
PlannerInfo *root, Var *varToBeAdded);
static void AddRteSubqueryToAttributeEquivalenceClass(AttributeEquivalenceClass *
*attributeEquivalenceClass,
attributeEquivalenceClass,
RangeTblEntry *
rangeTableEntry,
PlannerInfo *root,
@ -103,17 +106,17 @@ static void AddRteSubqueryToAttributeEquivalenceClass(AttributeEquivalenceClass
static Query * GetTargetSubquery(PlannerInfo *root, RangeTblEntry *rangeTableEntry,
Var *varToBeAdded);
static void AddUnionAllSetOperationsToAttributeEquivalenceClass(
AttributeEquivalenceClass **
AttributeEquivalenceClass *
attributeEquivalenceClass,
PlannerInfo *root,
Var *varToBeAdded);
static void AddUnionSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass **
static void AddUnionSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass *
attributeEquivalenceClass,
PlannerInfo *root,
SetOperationStmt *
setOperation,
Var *varToBeAdded);
static void AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass **
static void AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass *
attrEquivalenceClass,
RangeTblEntry *rangeTableEntry,
Var *varToBeAdded);
@ -141,7 +144,7 @@ static AttributeEquivalenceClass * GenerateEquivalenceClassForRelationRestrictio
RelationRestrictionContext
*
relationRestrictionContext);
static void ListConcatUniqueAttributeClassMemberLists(AttributeEquivalenceClass **
static void ListConcatUniqueAttributeClassMemberLists(AttributeEquivalenceClass *
firstClass,
AttributeEquivalenceClass *
secondClass);
@ -156,9 +159,13 @@ static JoinRestrictionContext * FilterJoinRestrictionContext(
static bool RangeTableArrayContainsAnyRTEIdentities(RangeTblEntry **rangeTableEntries, int
rangeTableArrayLength, Relids
queryRteIdentities);
static int RangeTableOffsetCompat(PlannerInfo *root, AppendRelInfo *appendRelInfo);
static Relids QueryRteIdentities(Query *queryTree);
#if PG_VERSION_NUM >= PG_VERSION_13
static int ParentCountPriorToAppendRel(List *appendRelList, AppendRelInfo *appendRelInfo);
#endif
/*
* AllDistributionKeysInQueryAreEqual returns true if either
* (i) there exists join in the query and all relations joined on their
@ -249,7 +256,7 @@ SafeToPushdownUnionSubquery(PlannerRestrictionContext *plannerRestrictionContext
plannerRestrictionContext->relationRestrictionContext;
JoinRestrictionContext *joinRestrictionContext =
plannerRestrictionContext->joinRestrictionContext;
Index unionQueryPartitionKeyIndex = 0;
AttributeEquivalenceClass *attributeEquivalence =
palloc0(sizeof(AttributeEquivalenceClass));
ListCell *relationRestrictionCell = NULL;
@ -279,7 +286,8 @@ SafeToPushdownUnionSubquery(PlannerRestrictionContext *plannerRestrictionContext
*/
if (appendRelList != NULL)
{
varToBeAdded = FindUnionAllVar(relationPlannerRoot, appendRelList,
varToBeAdded = FindUnionAllVar(relationPlannerRoot,
relationRestriction->translatedVars,
relationRestriction->relationId,
relationRestriction->index,
&partitionKeyIndex);
@ -323,17 +331,17 @@ SafeToPushdownUnionSubquery(PlannerRestrictionContext *plannerRestrictionContext
* we check whether all the relations have partition keys in the
* same position.
*/
if (unionQueryPartitionKeyIndex == InvalidAttrNumber)
if (attributeEquivalence->unionQueryPartitionKeyIndex == InvalidAttrNumber)
{
unionQueryPartitionKeyIndex = partitionKeyIndex;
attributeEquivalence->unionQueryPartitionKeyIndex = partitionKeyIndex;
}
else if (unionQueryPartitionKeyIndex != partitionKeyIndex)
else if (attributeEquivalence->unionQueryPartitionKeyIndex != partitionKeyIndex)
{
continue;
}
Assert(varToBeAdded != NULL);
AddToAttributeEquivalenceClass(&attributeEquivalence, relationPlannerRoot,
AddToAttributeEquivalenceClass(attributeEquivalence, relationPlannerRoot,
varToBeAdded);
}
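The unionQueryPartitionKeyIndex bookkeeping above enforces that every UNION ALL leaf exposes the partition column at the same ordinal position in its target list. A simplified standalone version of that consistency check (the real code skips mismatching leaves rather than failing the whole query, so treat this as a sketch):

#include <stdbool.h>

static bool
union_partition_keys_aligned(const int *partitionKeyIndexes, int leafCount)
{
	int unionQueryPartitionKeyIndex = 0; /* 0 == not decided yet */

	for (int i = 0; i < leafCount; i++)
	{
		if (partitionKeyIndexes[i] == 0)
		{
			return false; /* some leaf lacks the partition key entirely */
		}

		if (unionQueryPartitionKeyIndex == 0)
		{
			/* the first leaf fixes the expected position */
			unionQueryPartitionKeyIndex = partitionKeyIndexes[i];
		}
		else if (unionQueryPartitionKeyIndex != partitionKeyIndexes[i])
		{
			return false; /* partition keys at different ordinal positions */
		}
	}
	return true;
}

int
main(void)
{
	int leaves[] = { 1, 1, 1 };
	return union_partition_keys_aligned(leaves, 3) ? 0 : 1;
}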
@ -373,66 +381,74 @@ SafeToPushdownUnionSubquery(PlannerRestrictionContext *plannerRestrictionContext
}
/*
* RangeTableOffsetCompat returns the range table offset (in glob->finalrtable)
* for the given appendRelInfo. For PG < 13 this is a no-op.
*/
static int
RangeTableOffsetCompat(PlannerInfo *root, AppendRelInfo *appendRelInfo)
{
#if PG_VERSION_NUM >= PG_VERSION_13
int parentCount = ParentCountPriorToAppendRel(root->append_rel_list, appendRelInfo);
int skipParentCount = parentCount - 1;
int i = 1;
for (; i < root->simple_rel_array_size; i++)
{
RangeTblEntry *rte = root->simple_rte_array[i];
if (rte->inh)
{
/*
* We skip the previous parents because we want to find the offset
* for the given append rel info.
*/
if (skipParentCount > 0)
{
skipParentCount--;
continue;
}
break;
}
}
int indexInRtable = (i - 1);
/*
* Postgres adds the global rte array size to parent_relid as an offset.
* Here we do the reverse operation (see Postgres commit
* 6ef77cf46e81f45716ec981cb08781d426181378).
*/
int parentRelIndex = appendRelInfo->parent_relid - 1;
return parentRelIndex - indexInRtable;
#else
return 0;
#endif
}
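The loop above recovers a per-query range table index from the global parent_relid by locating the rte->inh entry that corresponds to this append rel and subtracting its position. A standalone rendering of the same arithmetic over a plain bool array (inhFlags stands in for simple_rte_array; this sketches the PG13+ branch only):

#include <stdbool.h>

static int
range_table_offset(const bool *inhFlags, int arraySize,
				   int parentCount, int parentRelid)
{
	int skipParentCount = parentCount - 1;
	int i = 1;

	for (; i < arraySize; i++)
	{
		if (inhFlags[i])
		{
			/* skip earlier parents until we reach the one we want */
			if (skipParentCount > 0)
			{
				skipParentCount--;
				continue;
			}
			break;
		}
	}

	int indexInRtable = i - 1;

	/* undo the global offset that Postgres added to parent_relid */
	return (parentRelid - 1) - indexInRtable;
}

int
main(void)
{
	/* entries 2 and 5 are inh parents; ask for the offset of the 2nd parent */
	bool inhFlags[] = { false, false, true, false, false, true, false };
	return range_table_offset(inhFlags, 7, 2, 9) == 4 ? 0 : 1;
}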
/*
* FindUnionAllVar finds the variable used in union all for the side that has
* relationRteIndex as its index and the same varattno as the partition key of
* the given relation with relationOid.
*/
static Var *
FindUnionAllVar(PlannerInfo *root, List *appendRelList, Oid relationOid,
FindUnionAllVar(PlannerInfo *root, List *translatedVars, Oid relationOid,
Index relationRteIndex, Index *partitionKeyIndex)
{
ListCell *appendRelCell = NULL;
AppendRelInfo *targetAppendRelInfo = NULL;
AttrNumber childAttrNumber = 0;
*partitionKeyIndex = 0;
/* iterate on the queries that are part of UNION ALL subselects */
foreach(appendRelCell, appendRelList)
{
AppendRelInfo *appendRelInfo = (AppendRelInfo *) lfirst(appendRelCell);
int rtoffset = RangeTableOffsetCompat(root, appendRelInfo);
/*
* We're only interested in the child rel that is equal to the
* relation we're investigating.
*/
if (appendRelInfo->child_relid - rtoffset == relationRteIndex)
{
targetAppendRelInfo = appendRelInfo;
break;
}
}
if (!targetAppendRelInfo)
if (!IsCitusTableType(relationOid, STRICTLY_PARTITIONED_DISTRIBUTED_TABLE))
{
/* we only care about hash and range partitioned tables */
*partitionKeyIndex = 0;
return NULL;
}
Var *relationPartitionKey = DistPartitionKeyOrError(relationOid);
#if PG_VERSION_NUM >= PG_VERSION_13
for (; childAttrNumber < targetAppendRelInfo->num_child_cols; childAttrNumber++)
{
int curAttNo = targetAppendRelInfo->parent_colnos[childAttrNumber];
if (curAttNo == relationPartitionKey->varattno)
{
*partitionKeyIndex = (childAttrNumber + 1);
int rtoffset = RangeTableOffsetCompat(root, targetAppendRelInfo);
relationPartitionKey->varno = targetAppendRelInfo->child_relid - rtoffset;
return relationPartitionKey;
}
}
#else
AttrNumber childAttrNumber = 0;
*partitionKeyIndex = 0;
ListCell *translatedVarCell;
List *translaterVars = targetAppendRelInfo->translated_vars;
foreach(translatedVarCell, translaterVars)
foreach(translatedVarCell, translatedVars)
{
Node *targetNode = (Node *) lfirst(translatedVarCell);
childAttrNumber++;
if (!IsA(targetNode, Var))
@ -449,7 +465,6 @@ FindUnionAllVar(PlannerInfo *root, List *appendRelList, Oid relationOid,
return targetVar;
}
}
#endif
return NULL;
}
@ -580,7 +595,6 @@ GenerateAllAttributeEquivalences(PlannerRestrictionContext *plannerRestrictionCo
JoinRestrictionContext *joinRestrictionContext =
plannerRestrictionContext->joinRestrictionContext;
/* reset the equivalence id counter per call to prevent overflows */
attributeEquivalenceId = 1;
@ -788,14 +802,14 @@ AttributeEquivalenceClassForEquivalenceClass(EquivalenceClass *plannerEqClass,
equivalenceParam, &outerNodeRoot);
if (expressionVar)
{
AddToAttributeEquivalenceClass(&attributeEquivalence, outerNodeRoot,
AddToAttributeEquivalenceClass(attributeEquivalence, outerNodeRoot,
expressionVar);
}
}
else if (IsA(strippedEquivalenceExpr, Var))
{
expressionVar = (Var *) strippedEquivalenceExpr;
AddToAttributeEquivalenceClass(&attributeEquivalence, plannerInfo,
AddToAttributeEquivalenceClass(attributeEquivalence, plannerInfo,
expressionVar);
}
}
@ -978,7 +992,7 @@ GenerateCommonEquivalence(List *attributeEquivalenceList,
if (AttributeClassContainsAttributeClassMember(attributeEquialanceMember,
commonEquivalenceClass))
{
ListConcatUniqueAttributeClassMemberLists(&commonEquivalenceClass,
ListConcatUniqueAttributeClassMemberLists(commonEquivalenceClass,
currentEquivalenceClass);
addedEquivalenceIds = bms_add_member(addedEquivalenceIds,
@ -1058,7 +1072,7 @@ GenerateEquivalenceClassForRelationRestriction(
* firstClass.
*/
static void
ListConcatUniqueAttributeClassMemberLists(AttributeEquivalenceClass **firstClass,
ListConcatUniqueAttributeClassMemberLists(AttributeEquivalenceClass *firstClass,
AttributeEquivalenceClass *secondClass)
{
ListCell *equivalenceClassMemberCell = NULL;
@ -1069,13 +1083,13 @@ ListConcatUniqueAttributeClassMemberLists(AttributeEquivalenceClass **firstClass
AttributeEquivalenceClassMember *newEqMember =
(AttributeEquivalenceClassMember *) lfirst(equivalenceClassMemberCell);
if (AttributeClassContainsAttributeClassMember(newEqMember, *firstClass))
if (AttributeClassContainsAttributeClassMember(newEqMember, firstClass))
{
continue;
}
(*firstClass)->equivalentAttributes = lappend((*firstClass)->equivalentAttributes,
newEqMember);
firstClass->equivalentAttributes = lappend(firstClass->equivalentAttributes,
newEqMember);
}
}
@ -1150,10 +1164,10 @@ GenerateAttributeEquivalencesForJoinRestrictions(JoinRestrictionContext *
sizeof(AttributeEquivalenceClass));
attributeEquivalence->equivalenceId = attributeEquivalenceId++;
AddToAttributeEquivalenceClass(&attributeEquivalence,
AddToAttributeEquivalenceClass(attributeEquivalence,
joinRestriction->plannerInfo, leftVar);
AddToAttributeEquivalenceClass(&attributeEquivalence,
AddToAttributeEquivalenceClass(attributeEquivalence,
joinRestriction->plannerInfo, rightVar);
attributeEquivalenceList =
@ -1194,7 +1208,7 @@ GenerateAttributeEquivalencesForJoinRestrictions(JoinRestrictionContext *
* equivalence class
*/
static void
AddToAttributeEquivalenceClass(AttributeEquivalenceClass **attributeEquivalenceClass,
AddToAttributeEquivalenceClass(AttributeEquivalenceClass *attributeEquivalenceClass,
PlannerInfo *root, Var *varToBeAdded)
{
/* punt if it's a whole-row var rather than a plain column reference */
@ -1233,9 +1247,10 @@ AddToAttributeEquivalenceClass(AttributeEquivalenceClass **attributeEquivalenceC
*/
static void
AddRteSubqueryToAttributeEquivalenceClass(AttributeEquivalenceClass
**attributeEquivalenceClass,
*attributeEquivalenceClass,
RangeTblEntry *rangeTableEntry,
PlannerInfo *root, Var *varToBeAdded)
PlannerInfo *root,
Var *varToBeAdded)
{
RelOptInfo *baseRelOptInfo = find_base_rel(root, varToBeAdded->varno);
Query *targetSubquery = GetTargetSubquery(root, rangeTableEntry, varToBeAdded);
@@ -1355,7 +1370,7 @@ GetTargetSubquery(PlannerInfo *root, RangeTblEntry *rangeTableEntry, Var *varToB
* var the given equivalence class.
*/
static void
AddUnionAllSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass **
AddUnionAllSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass *
attributeEquivalenceClass,
PlannerInfo *root,
Var *varToBeAdded)
@@ -1377,41 +1392,101 @@ AddUnionAllSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass **
continue;
}
int rtoffset = RangeTableOffsetCompat(root, appendRelInfo);
int childRelId = appendRelInfo->child_relid - rtoffset;
/* set the varno accordingly for this specific child */
varToBeAdded->varno = appendRelInfo->child_relid - rtoffset;
if (root->simple_rel_array_size <= childRelId)
{
/* we prefer returning over raising an Assert or an error, to be defensive */
return;
}
AddToAttributeEquivalenceClass(attributeEquivalenceClass, root,
varToBeAdded);
}
}
/*
* RangeTableOffsetCompat returns the range table offset (in glob->finalrtable) for the appendRelInfo.
* For PG < 13 this is a no-op.
*/
static int
RangeTableOffsetCompat(PlannerInfo *root, AppendRelInfo *appendRelInfo)
{
#if PG_VERSION_NUM >= PG_VERSION_13
int i = 1;
for (; i < root->simple_rel_array_size; i++)
{
RangeTblEntry *rte = root->simple_rte_array[i];
RangeTblEntry *rte = root->simple_rte_array[childRelId];
if (rte->inh)
{
break;
/*
* This code-path may require improvements. If a leaf of a UNION ALL
* (e.g., an entry in appendRelList) itself is another UNION ALL
* (e.g., rte->inh = true), the logic here might get into an infinite
* recursion.
*
* The downside of "continue" here is that certain UNION ALL queries
* that are safe to push down may not be pushed down.
*/
continue;
}
else if (rte->rtekind == RTE_RELATION)
{
Index partitionKeyIndex = 0;
List *translatedVars = TranslatedVarsForRteIdentity(GetRTEIdentity(rte));
Var *varToBeAddedOnUnionAllSubquery =
FindUnionAllVar(root, translatedVars, rte->relid, childRelId,
&partitionKeyIndex);
if (partitionKeyIndex == 0)
{
/* no partition key on the target list */
continue;
}
if (attributeEquivalenceClass->unionQueryPartitionKeyIndex == 0)
{
/* the first partition key index we found */
attributeEquivalenceClass->unionQueryPartitionKeyIndex =
partitionKeyIndex;
}
else if (attributeEquivalenceClass->unionQueryPartitionKeyIndex !=
partitionKeyIndex)
{
/*
* Partition keys on the leaves of the UNION ALL queries are on
* different ordinal positions. We cannot push down, so skip.
*/
continue;
}
if (varToBeAddedOnUnionAllSubquery != NULL)
{
AddToAttributeEquivalenceClass(attributeEquivalenceClass, root,
varToBeAddedOnUnionAllSubquery);
}
}
else
{
/* set the varno accordingly for this specific child */
varToBeAdded->varno = childRelId;
AddToAttributeEquivalenceClass(attributeEquivalenceClass, root,
varToBeAdded);
}
}
int indexInRtable = (i - 1);
return appendRelInfo->parent_relid - 1 - (indexInRtable);
#else
return 0;
#endif
}
#if PG_VERSION_NUM >= PG_VERSION_13
/*
* ParentCountPriorToAppendRel returns the number of parents that come before
* the given append rel info.
*/
static int
ParentCountPriorToAppendRel(List *appendRelList, AppendRelInfo *targetAppendRelInfo)
{
int targetParentIndex = targetAppendRelInfo->parent_relid;
Bitmapset *parent_ids = NULL;
AppendRelInfo *appendRelInfo = NULL;
foreach_ptr(appendRelInfo, appendRelList)
{
int curParentIndex = appendRelInfo->parent_relid;
if (curParentIndex <= targetParentIndex)
{
parent_ids = bms_add_member(parent_ids, curParentIndex);
}
}
return bms_num_members(parent_ids);
}
#endif
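/*
 * As a worked example (hypothetical planner state, not taken from this
 * changeset): if the entries in appendRelList carry parent_relid values
 * {3, 3, 7, 7, 9} and targetAppendRelInfo->parent_relid is 7, the bitmapset
 * collects {3, 7} and ParentCountPriorToAppendRel returns 2.
 */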
/*
* AddUnionSetOperationsToAttributeEquivalenceClass recursively iterates on all the
* setOperations and adds each corresponding target entry to the given equivalence
@@ -1422,7 +1497,7 @@ RangeTableOffsetCompat(PlannerInfo *root, AppendRelInfo *appendRelInfo)
* messages.
*/
static void
AddUnionSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass **
AddUnionSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass *
attributeEquivalenceClass,
PlannerInfo *root,
SetOperationStmt *setOperation,
@@ -1450,7 +1525,7 @@ AddUnionSetOperationsToAttributeEquivalenceClass(AttributeEquivalenceClass **
* the input rte to be an RTE_RELATION.
*/
static void
AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass **
AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass *
attrEquivalenceClass,
RangeTblEntry *rangeTableEntry,
Var *varToBeAdded)
@@ -1487,8 +1562,8 @@ AddRteRelationToAttributeEquivalenceClass(AttributeEquivalenceClass **
attributeEqMember->rteIdentity = GetRTEIdentity(rangeTableEntry);
attributeEqMember->relationId = rangeTableEntry->relid;
(*attrEquivalenceClass)->equivalentAttributes =
lappend((*attrEquivalenceClass)->equivalentAttributes,
attrEquivalenceClass->equivalentAttributes =
lappend(attrEquivalenceClass->equivalentAttributes,
attributeEqMember);
}

View File

@@ -1575,6 +1575,22 @@ LowerShardBoundary(Datum partitionColumnValue, ShardInterval **shardIntervalCach
/* setup partitionColumnValue argument once */
fcSetArg(compareFunction, 0, partitionColumnValue);
/*
* Now we test the partitionColumnValue used in a WHERE clause such as
* partCol > partitionColumnValue (or partCol >= partitionColumnValue)
* against four possibilities:
* 1) partitionColumnValue falls into a specific shard, such that:
* partitionColumnValue >= shard[x].min, and
* partitionColumnValue < shard[x].max (or partitionColumnValue <= shard[x].max).
* 2) partitionColumnValue < shard[x].min for all the shards
* 3) partitionColumnValue > shard[x].max for all the shards
* 4) partitionColumnValue falls in between two shards, such that:
* partitionColumnValue > shard[x].max and
* partitionColumnValue < shard[x+1].min
*
* For 1), we find that shard in the loop below using binary search and
* return the index of it. For the others, see the end of this function.
*/
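/*
 * A worked example with hypothetical values (not from this changeset):
 * assume four shards with (min, max) ranges (0, 25), (30, 50), (60, 75)
 * and (80, 100). Then:
 *  - partitionColumnValue = 40  returns 1 (possibility 1)
 *  - partitionColumnValue = -5  returns 0 (possibility 2, the first shard)
 *  - partitionColumnValue = 150 returns INVALID_SHARD_INDEX (possibility 3)
 *  - partitionColumnValue = 27  returns 1 (possibility 4, the next shard)
 */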
while (lowerBoundIndex < upperBoundIndex)
{
int middleIndex = lowerBoundIndex + ((upperBoundIndex - lowerBoundIndex) / 2);
@@ -1607,7 +1623,7 @@ LowerShardBoundary(Datum partitionColumnValue, ShardInterval **shardIntervalCach
continue;
}
/* found interval containing partitionValue */
/* partitionColumnValue falls into a specific shard, possibility 1) */
return middleIndex;
}
@@ -1618,20 +1634,30 @@ LowerShardBoundary(Datum partitionColumnValue, ShardInterval **shardIntervalCach
* (we'd have hit the return middleIndex; case otherwise). Figure out
* whether there's possibly any interval containing a value that's bigger
* than the partition key one.
*
* Also note that we initialized lowerBoundIndex to 0 and that, during
* the binary search, we always set it to the index of the shard that
* we consider as our lower boundary.
*/
if (lowerBoundIndex == 0)
if (lowerBoundIndex == shardCount)
{
/* all intervals are bigger, thus return 0 */
return 0;
}
else if (lowerBoundIndex == shardCount)
{
/* partition value is bigger than all partition values */
/*
* Since lowerBoundIndex is an inclusive index, being equal to shardCount
* means all the shards have smaller values than partitionColumnValue,
* which corresponds to possibility 3).
* In that case, since we can't have a lower bound shard, we return
* INVALID_SHARD_INDEX here.
*/
return INVALID_SHARD_INDEX;
}
/* value falls inbetween intervals */
return lowerBoundIndex + 1;
/*
* partitionColumnValue is either smaller than all the shards or falls in
* between two shards, which corresponds to possibility 2) or 4).
* Knowing that lowerBoundIndex is an inclusive index, we directly return
* it as the index for the lower bound shard here.
*/
return lowerBoundIndex;
}
@@ -1651,6 +1677,23 @@ UpperShardBoundary(Datum partitionColumnValue, ShardInterval **shardIntervalCach
/* setup partitionColumnValue argument once */
fcSetArg(compareFunction, 0, partitionColumnValue);
/*
* Now we test the partitionColumnValue used in a WHERE clause such as
* partCol < partitionColumnValue (or partCol <= partitionColumnValue)
* against four possibilities:
* 1) partitionColumnValue falls into a specific shard, such that:
* partitionColumnValue <= shard[x].max, and
* partitionColumnValue > shard[x].min (or partitionColumnValue >= shard[x].min).
* 2) partitionColumnValue > shard[x].max for all the shards
* 3) partitionColumnValue < shard[x].min for all the shards
* 4) partitionColumnValue falls in between two shards, such that:
* partitionColumnValue > shard[x].max and
* partitionColumnValue < shard[x+1].min
*
* For 1), we find that shard in the loop below using binary search and
* return the index of it. For the others, see the end of this function.
*/
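/*
 * Using the same hypothetical shard ranges as the LowerShardBoundary
 * example, (0, 25), (30, 50), (60, 75) and (80, 100):
 *  - partitionColumnValue = 40  returns 1 (possibility 1)
 *  - partitionColumnValue = 150 returns 3 (possibility 2, the last shard)
 *  - partitionColumnValue = -5  returns INVALID_SHARD_INDEX (possibility 3)
 *  - partitionColumnValue = 27  returns 0 (possibility 4, the previous shard)
 */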
while (lowerBoundIndex < upperBoundIndex)
{
int middleIndex = lowerBoundIndex + ((upperBoundIndex - lowerBoundIndex) / 2);
@@ -1683,7 +1726,7 @@ UpperShardBoundary(Datum partitionColumnValue, ShardInterval **shardIntervalCach
continue;
}
/* found interval containing partitionValue */
/* partitionColumnValue falls into a specific shard, possibility 1) */
return middleIndex;
}
@@ -1694,19 +1737,29 @@ UpperShardBoundary(Datum partitionColumnValue, ShardInterval **shardIntervalCach
* (we'd have hit the return middleIndex; case otherwise). Figure out
* whether there's possibly any interval containing a value that's smaller
* than the partition key one.
*
* Also note that we initialized upperBoundIndex to shardCount and that,
* during the binary search, we always set it to the index just past the
* shard that we consider as our upper boundary.
*/
if (upperBoundIndex == shardCount)
if (upperBoundIndex == 0)
{
/* all intervals are smaller, thus return 0 */
return shardCount - 1;
}
else if (upperBoundIndex == 0)
{
/* partition value is smaller than all partition values */
/*
* Since upperBoundIndex is an exclusive index, being equal to 0 means
* all the shards have greater values than partitionColumnValue, which
* corresponds to possibility 3).
* In that case, since we can't have an upper bound shard, we return
* INVALID_SHARD_INDEX here.
*/
return INVALID_SHARD_INDEX;
}
/* value falls inbetween intervals, return the inverval one smaller as bound */
/*
* partitionColumnValue is either greater than all the shards or falls in
* between two shards, which corresponds to possibility 2) or 4).
* Knowing that upperBoundIndex is an exclusive index, we return the index
* for the previous shard here.
*/
return upperBoundIndex - 1;
}

View File

@@ -27,18 +27,16 @@ static ProgressMonitorData * MonitorDataFromDSMHandle(dsm_handle dsmHandle,
/*
* CreateProgressMonitor is used to create a place to store progress information related
* to long running processes. The function creates a dynamic shared memory segment
* consisting of a header regarding to the process and an array of "steps" that the long
* running "operations" consists of. The handle of the dynamic shared memory is stored in
* pg_stat_get_progress_info output, to be parsed by a progress retrieval command
* later on. This behavior may cause unrelated (but hopefully harmless) rows in
* pg_stat_progress_vacuum output. The caller of this function should provide a magic
* number, a unique 64 bit unsigned integer, to distinguish different types of commands.
* CreateProgressMonitor is used to create a place to store progress
* information related to long-running processes. The function creates a
* dynamic shared memory segment consisting of a header describing the
* process and an array of "steps" that the long-running "operations" consist
* of. After initializing the data in the array of steps, the shared memory
* segment can be shared with other processes using RegisterProgressMonitor, by
* giving it the value that's written to the dsmHandle argument.
*/
ProgressMonitorData *
CreateProgressMonitor(uint64 progressTypeMagicNumber, int stepCount, Size stepSize,
Oid relationId)
CreateProgressMonitor(int stepCount, Size stepSize, dsm_handle *dsmHandle)
{
if (stepSize <= 0 || stepCount <= 0)
{
@@ -58,20 +56,37 @@ CreateProgressMonitor(uint64 progressTypeMagicNumber, int stepCount, Size stepSi
return NULL;
}
dsm_handle dsmHandle = dsm_segment_handle(dsmSegment);
*dsmHandle = dsm_segment_handle(dsmSegment);
ProgressMonitorData *monitor = MonitorDataFromDSMHandle(dsmHandle, &dsmSegment);
ProgressMonitorData *monitor = MonitorDataFromDSMHandle(*dsmHandle, &dsmSegment);
monitor->stepCount = stepCount;
monitor->processId = MyProcPid;
return monitor;
}
/*
* RegisterProgressMonitor shares dsmHandle with other postgres processes by
* storing it in pg_stat_get_progress_info output, to be parsed by a
* progress retrieval command later on. This behavior may cause unrelated (but
* hopefully harmless) rows in pg_stat_progress_vacuum output. The caller of
* this function should provide a magic number, a unique 64-bit unsigned
* integer, to distinguish different types of commands.
*
* IMPORTANT: After registering the progress monitor, all modifications to
* the data should be done using concurrency-safe operations (i.e., locks
* and atomics).
*/
void
RegisterProgressMonitor(uint64 progressTypeMagicNumber, Oid relationId,
dsm_handle dsmHandle)
{
pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM, relationId);
pgstat_progress_update_param(1, dsmHandle);
pgstat_progress_update_param(0, progressTypeMagicNumber);
currentProgressDSMHandle = dsmHandle;
return monitor;
}
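/*
 * A minimal usage sketch of the split API above (hypothetical caller,
 * mirroring the create_progress() test UDF elsewhere in this changeset):
 * create the monitor, initialize its steps while the segment is still
 * private, then publish the handle.
 *
 *   dsm_handle dsmHandle;
 *   ProgressMonitorData *monitor =
 *       CreateProgressMonitor(stepCount, sizeof(uint64), &dsmHandle);
 *   if (monitor != NULL)
 *   {
 *       ... initialize ProgressMonitorSteps(monitor) ...
 *       RegisterProgressMonitor(magicNumber, relationId, dsmHandle);
 *   }
 */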
@@ -204,24 +219,46 @@ ProgressMonitorData *
MonitorDataFromDSMHandle(dsm_handle dsmHandle, dsm_segment **attachedSegment)
{
dsm_segment *dsmSegment = dsm_find_mapping(dsmHandle);
ProgressMonitorData *monitor = NULL;
if (dsmSegment == NULL)
{
dsmSegment = dsm_attach(dsmHandle);
}
if (dsmSegment != NULL)
if (dsmSegment == NULL)
{
monitor = (ProgressMonitorData *) dsm_segment_address(dsmSegment);
monitor->steps = (void *) (monitor + 1);
*attachedSegment = dsmSegment;
return NULL;
}
ProgressMonitorData *monitor = (ProgressMonitorData *) dsm_segment_address(
dsmSegment);
*attachedSegment = dsmSegment;
return monitor;
}
/*
* ProgressMonitorSteps returns a pointer to the array of steps that are stored
* in a progress monitor. This is simply the data right after the header, so
* this function is trivial. The main purpose of this function is to make the
* intent clear to readers of the code.
*
* NOTE: The pointer this function returns is explicitly not stored in the
* header, because the header is shared between processes. The absolute pointer
* to the steps can have a different value between processes though, because
* the same piece of shared memory often has a different address in different
* processes. So we calculate this pointer over and over to make sure we use
* the right value for each process.
*/
void *
ProgressMonitorSteps(ProgressMonitorData *monitor)
{
return monitor + 1;
}
/*
* DetachFromDSMSegments ensures that the process is detached from all of the segments in
* the given list.

View File

@@ -556,30 +556,6 @@ RelayEventExtendNames(Node *parseTree, char *schemaName, uint64 shardId)
AppendShardIdToName(oldRelationName, shardId);
AppendShardIdToName(newRelationName, shardId);
/*
* PostgreSQL creates array types for each ordinary table, with
* the same name plus a prefix of '_'.
*
* ALTER TABLE ... RENAME TO ... also renames the underlying
* array type, and the DDL is run in parallel connections over
* all the placements and shards at once. Concurrent access
* here deadlocks.
*
* Let's provide an easier to understand error message here
* than the deadlock one.
*
* See also https://github.com/citusdata/citus/issues/1664
*/
int newRelationNameLength = strlen(*newRelationName);
if (newRelationNameLength >= (NAMEDATALEN - 1))
{
ereport(ERROR,
(errcode(ERRCODE_NAME_TOO_LONG),
errmsg(
"shard name %s exceeds %d characters",
*newRelationName, NAMEDATALEN - 1)));
}
}
else if (objectType == OBJECT_COLUMN)
{

View File

@@ -701,6 +701,19 @@ RegisterCitusConfigVariables(void)
GUC_NO_SHOW_ALL,
NoticeIfSubqueryPushdownEnabled, NULL, NULL);
DefineCustomIntVariable(
"citus.remote_copy_flush_threshold",
gettext_noop("Sets the threshold for remote copy to be flushed."),
gettext_noop("When sending data over remote connections via the COPY protocol, "
"bytes are first buffered internally by libpq. If the number of "
"bytes buffered exceeds the threshold, Citus waits for all the "
"bytes to flush."),
&RemoteCopyFlushThreshold,
8 * 1024 * 1024, 0, INT_MAX,
PGC_USERSET,
GUC_UNIT_BYTE | GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
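/*
 * Usage sketch, not part of this change: a session that streams large COPY
 * payloads could lower the buffering cap with, e.g.,
 *
 *   SET citus.remote_copy_flush_threshold TO '4MB';
 */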
DefineCustomIntVariable(
"citus.local_copy_flush_threshold",
gettext_noop("Sets the threshold for local copy to be flushed."),
@@ -1238,6 +1251,16 @@ RegisterCitusConfigVariables(void)
GUC_STANDARD,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citus.max_cached_connection_lifetime",
gettext_noop("Sets the maximum lifetime of cached connections to other nodes."),
NULL,
&MaxCachedConnectionLifetime,
10 * MS_PER_MINUTE, -1, INT_MAX,
PGC_USERSET,
GUC_UNIT_MS | GUC_STANDARD,
NULL, NULL, NULL);
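/*
 * Usage sketch, not part of this change: cached connections can be recycled
 * more aggressively with, e.g.,
 *
 *   SET citus.max_cached_connection_lifetime TO '1min';
 *
 * and the -1 lower bound presumably disables the lifetime limit altogether.
 */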
DefineCustomIntVariable(
"citus.repartition_join_bucket_count_per_node",
gettext_noop("Sets the bucket size for repartition joins per node"),
@@ -1454,6 +1477,19 @@ RegisterCitusConfigVariables(void)
GUC_STANDARD,
NULL, NULL, NULL);
DefineCustomStringVariable(
"citus.local_hostname",
gettext_noop("Sets the hostname when connecting back to itself."),
gettext_noop("For some operations nodes, mostly the coordinator, connect back to "
"itself. When configuring SSL certificates it sometimes is required "
"to use a specific hostname to match the CN of the certificate when "
"verify-full is used."),
&LocalHostName,
"localhost",
PGC_SUSET,
GUC_STANDARD,
NULL, NULL, NULL);
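/*
 * Usage sketch, not part of this change: to match the CN of a certificate
 * when verify-full is used, a superuser could run, e.g.,
 *
 *   ALTER SYSTEM SET citus.local_hostname TO 'coordinator.example.com';
 *   SELECT pg_reload_conf();
 */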
DefineCustomBoolVariable(
"citus.writable_standby_coordinator",
gettext_noop("Enables simple DML via a streaming replica of the coordinator"),

View File

@@ -0,0 +1,5 @@
-- citus--10.0-1--10.0-2
#include "../../columnar/sql/columnar--10.0-1--10.0-2.sql"
GRANT SELECT ON public.citus_tables TO public;

View File

@@ -0,0 +1,18 @@
-- citus--10.0-2--10.0-3
#include "udfs/citus_update_table_statistics/10.0-3.sql"
CREATE OR REPLACE FUNCTION master_update_table_statistics(relation regclass)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_update_table_statistics$$;
COMMENT ON FUNCTION pg_catalog.master_update_table_statistics(regclass)
IS 'updates shard statistics of the given table';
CREATE OR REPLACE FUNCTION pg_catalog.citus_get_active_worker_nodes(OUT node_name text, OUT node_port bigint)
RETURNS SETOF record
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$citus_get_active_worker_nodes$$;
COMMENT ON FUNCTION pg_catalog.citus_get_active_worker_nodes()
IS 'fetch set of active worker nodes';
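-- Illustrative usage of the functions (re)defined above; the table name is a
-- hypothetical placeholder and these statements are not part of the migration:
--   SELECT * FROM pg_catalog.citus_get_active_worker_nodes();
--   SELECT master_update_table_statistics('my_distributed_table');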

View File

@@ -0,0 +1,20 @@
-- citus--10.0-3--10.0-4
-- This migration file aims to fix 2 issues with upgrades on clusters:
-- 1. a bug in the public schema dependency of the citus_tables view.
--
-- Users who do not have a public schema in their clusters were unable to
-- upgrade to Citus 10.x because the citus_tables view used to be created in
-- the public schema.
#include "udfs/citus_tables/10.0-4.sql"
-- 2. a bug in our PG upgrade functions
--
-- Users who took the 9.5-2--10.0-1 upgrade path already have the fix, but users
-- who took the 9.5-1--10.0-1 upgrade path do not. Hence, we repeat the CREATE OR
-- REPLACE from the 9.5-2 definition for citus_prepare_pg_upgrade.
#include "udfs/citus_prepare_pg_upgrade/9.5-2.sql"
#include "udfs/citus_finish_pg_upgrade/10.0-4.sql"

View File

@@ -0,0 +1,3 @@
-- 9.4-1--9.4-2 was added later as a patch to fix a bug in our PG upgrade functions
#include "udfs/citus_prepare_pg_upgrade/9.4-2.sql"
#include "udfs/citus_finish_pg_upgrade/9.4-2.sql"

View File

@@ -0,0 +1,9 @@
--
-- 9.4-1--9.4-2 was added later as a patch to fix a bug in our PG upgrade functions
--
-- This script brings users who installed the released patch back to the 9.4-1
-- upgrade path. We do this via a semantic downgrade, since new changes have
-- already been introduced in the schema between 9.4-1 and 9.5-1. To make sure
-- we include all changes made during that version change, we reuse the
-- existing upgrade path from our later-introduced 9.4-2 version.
--

View File

@@ -0,0 +1,7 @@
-- 9.4-2--9.4-3 was added later as a patch to improve master_update_table_statistics
CREATE OR REPLACE FUNCTION master_update_table_statistics(relation regclass)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_update_table_statistics$$;
COMMENT ON FUNCTION pg_catalog.master_update_table_statistics(regclass)
IS 'updates shard statistics of the given table';

View File

@@ -0,0 +1,22 @@
-- citus--9.4-3--9.4-2
-- This is a downgrade path that will revert the changes made in citus--9.4-2--9.4-3.sql
-- 9.4-2--9.4-3 was added later as a patch to improve master_update_table_statistics.
-- We have this downgrade script so that we can continue from the main upgrade path
-- when upgrading to later versions.
CREATE OR REPLACE FUNCTION master_update_table_statistics(relation regclass)
RETURNS VOID AS $$
DECLARE
colocated_tables regclass[];
BEGIN
SELECT get_colocated_table_array(relation) INTO colocated_tables;
PERFORM
master_update_shard_statistics(shardid)
FROM
pg_dist_shard
WHERE
logicalrelid = ANY (colocated_tables);
END;
$$ LANGUAGE 'plpgsql';
COMMENT ON FUNCTION master_update_table_statistics(regclass)
IS 'updates shard statistics of the given table and its colocated tables';

View File

@@ -1,10 +1,16 @@
-- citus--9.5-1--10.0-1
-- citus--9.5-1--10.0-4
-- This migration file aims to fix the issues with upgrades on clusters without a public schema.
-- This file was created by the following command, plus some more changes in a separate commit:
-- cat citus--9.5-1--10.0-1.sql citus--10.0-1--10.0-2.sql citus--10.0-2--10.0-3.sql > citus--9.5-1--10.0-4.sql
-- copy of citus--9.5-1--10.0-1
DROP FUNCTION pg_catalog.upgrade_to_reference_table(regclass);
DROP FUNCTION IF EXISTS pg_catalog.citus_total_relation_size(regclass);
#include "udfs/citus_total_relation_size/10.0-1.sql"
#include "udfs/citus_tables/10.0-1.sql"
#include "udfs/citus_finish_pg_upgrade/10.0-1.sql"
#include "udfs/alter_distributed_table/10.0-1.sql"
#include "udfs/alter_table_set_access_method/10.0-1.sql"
@@ -164,4 +170,48 @@ SELECT * FROM pg_catalog.citus_worker_stat_activity();
ALTER VIEW citus.citus_worker_stat_activity SET SCHEMA pg_catalog;
GRANT SELECT ON pg_catalog.citus_worker_stat_activity TO PUBLIC;
-- copy of citus--10.0-1--10.0-2
#include "../../columnar/sql/columnar--10.0-1--10.0-2.sql"
-- copy of citus--10.0-2--10.0-3
#include "udfs/citus_update_table_statistics/10.0-3.sql"
CREATE OR REPLACE FUNCTION master_update_table_statistics(relation regclass)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_update_table_statistics$$;
COMMENT ON FUNCTION pg_catalog.master_update_table_statistics(regclass)
IS 'updates shard statistics of the given table';
CREATE OR REPLACE FUNCTION pg_catalog.citus_get_active_worker_nodes(OUT node_name text, OUT node_port bigint)
RETURNS SETOF record
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$citus_get_active_worker_nodes$$;
COMMENT ON FUNCTION pg_catalog.citus_get_active_worker_nodes()
IS 'fetch set of active worker nodes';
-- copy of citus--10.0-3--10.0-4
-- This migration file aims to fix 2 issues with upgrades on clusters:
-- 1. a bug in the public schema dependency of the citus_tables view.
--
-- Users who do not have a public schema in their clusters were unable to
-- upgrade to Citus 10.x because the citus_tables view used to be created in
-- the public schema.
#include "udfs/citus_tables/10.0-4.sql"
-- 2. a bug in our PG upgrade functions
--
-- Users who took the 9.5-2--10.0-1 upgrade path already have the fix, but users
-- who took the 9.5-1--10.0-1 upgrade path do not. Hence, we repeat the CREATE OR
-- REPLACE from the 9.5-2 definition for citus_prepare_pg_upgrade.
#include "udfs/citus_prepare_pg_upgrade/9.5-2.sql"
#include "udfs/citus_finish_pg_upgrade/10.0-4.sql"
RESET search_path;

View File

@@ -0,0 +1,3 @@
-- 9.5-1--9.5-2 was added later as a patch to fix a bug in our PG upgrade functions
#include "udfs/citus_prepare_pg_upgrade/9.5-2.sql"
#include "udfs/citus_finish_pg_upgrade/9.5-2.sql"

View File

@@ -0,0 +1,9 @@
--
-- 9.5-1--9.5-2 was added later as a patch to fix a bug in our PG upgrade functions
--
-- This script brings users who installed the released patch back to the 9.5-1
-- upgrade path. We do this via a semantic downgrade, since new changes have
-- already been introduced in the schema between 9.5-1 and 10.0-1. To make sure
-- we include all changes made during that version change, we reuse the
-- existing upgrade path from our later-introduced 9.5-2 version.
--

View File

@@ -0,0 +1,7 @@
-- 9.5-2--9.5-3 was added later as a patch to improve master_update_table_statistics
CREATE OR REPLACE FUNCTION master_update_table_statistics(relation regclass)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_update_table_statistics$$;
COMMENT ON FUNCTION pg_catalog.master_update_table_statistics(regclass)
IS 'updates shard statistics of the given table';

View File

@@ -0,0 +1,22 @@
-- citus--9.5-3--9.5-2
-- This is a downgrade path that will revert the changes made in citus--9.5-2--9.5-3.sql
-- 9.5-2--9.5-3 was added later as a patch to improve master_update_table_statistics.
-- We have this downgrade script so that we can continue from the main upgrade path
-- when upgrading to later versions.
CREATE OR REPLACE FUNCTION master_update_table_statistics(relation regclass)
RETURNS VOID AS $$
DECLARE
colocated_tables regclass[];
BEGIN
SELECT get_colocated_table_array(relation) INTO colocated_tables;
PERFORM
master_update_shard_statistics(shardid)
FROM
pg_dist_shard
WHERE
logicalrelid = ANY (colocated_tables);
END;
$$ LANGUAGE 'plpgsql';
COMMENT ON FUNCTION master_update_table_statistics(regclass)
IS 'updates shard statistics of the given table and its colocated tables';

View File

@@ -1,4 +1,51 @@
-- citus--10.0-1--9.5-1
-- citus--10.0-4--9.5-1
-- This migration file aims to fix the issues with upgrades on clusters without a public schema.
-- This file was created by the following command, plus some more changes in a separate commit:
-- cat citus--10.0-3--10.0-2.sql citus--10.0-2--10.0-1.sql citus--10.0-1--9.5-1.sql > citus--10.0-4--9.5-1.sql
-- copy of citus--10.0-4--10.0-3
--
-- 10.0-3--10.0-4 was added later as a patch to fix a bug in our PG upgrade functions
--
-- The upgrade fixes a bug in citus_(prepare|finish)_pg_upgrade. Given that the
-- old versions of these functions contain a bug, it is better to _not_ restore
-- the old version and to keep the patched version of the function.
--
-- This is in line with the downgrade scripts for earlier versions of this patch.
--
-- copy of citus--10.0-3--10.0-2
-- this is a downgrade path that will revert the changes made in citus--10.0-2--10.0-3.sql
DROP FUNCTION pg_catalog.citus_update_table_statistics(regclass);
#include "../udfs/citus_update_table_statistics/10.0-1.sql"
CREATE OR REPLACE FUNCTION master_update_table_statistics(relation regclass)
RETURNS VOID AS $$
DECLARE
colocated_tables regclass[];
BEGIN
SELECT get_colocated_table_array(relation) INTO colocated_tables;
PERFORM
master_update_shard_statistics(shardid)
FROM
pg_dist_shard
WHERE
logicalrelid = ANY (colocated_tables);
END;
$$ LANGUAGE 'plpgsql';
COMMENT ON FUNCTION master_update_table_statistics(regclass)
IS 'updates shard statistics of the given table and its colocated tables';
DROP FUNCTION pg_catalog.citus_get_active_worker_nodes(OUT text, OUT bigint);
/* copy of citus--10.0-2--10.0-1.sql */
#include "../../../columnar/sql/downgrades/columnar--10.0-2--10.0-1.sql"
-- copy of citus--10.0-1--9.5-1
-- In Citus 10.0, we added another internal udf (notify_constraint_dropped)
-- to be called by citus_drop_trigger. Since this script is executed when
@@ -18,7 +65,8 @@ DROP FUNCTION pg_catalog.notify_constraint_dropped();
#include "../../../columnar/sql/downgrades/columnar--10.0-1--9.5-1.sql"
DROP VIEW public.citus_tables;
DROP VIEW IF EXISTS pg_catalog.citus_tables;
DROP VIEW IF EXISTS public.citus_tables;
DROP FUNCTION pg_catalog.alter_distributed_table(regclass, text, int, text, boolean);
DROP FUNCTION pg_catalog.alter_table_set_access_method(regclass, text);
DROP FUNCTION pg_catalog.citus_total_relation_size(regclass,boolean);

View File

@@ -0,0 +1,108 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_finish_pg_upgrade()
RETURNS void
LANGUAGE plpgsql
SET search_path = pg_catalog
AS $cppu$
DECLARE
table_name regclass;
command text;
trigger_name text;
BEGIN
--
-- restore citus catalog tables
--
INSERT INTO pg_catalog.pg_dist_partition SELECT * FROM public.pg_dist_partition;
INSERT INTO pg_catalog.pg_dist_shard SELECT * FROM public.pg_dist_shard;
INSERT INTO pg_catalog.pg_dist_placement SELECT * FROM public.pg_dist_placement;
INSERT INTO pg_catalog.pg_dist_node_metadata SELECT * FROM public.pg_dist_node_metadata;
INSERT INTO pg_catalog.pg_dist_node SELECT * FROM public.pg_dist_node;
INSERT INTO pg_catalog.pg_dist_local_group SELECT * FROM public.pg_dist_local_group;
INSERT INTO pg_catalog.pg_dist_transaction SELECT * FROM public.pg_dist_transaction;
INSERT INTO pg_catalog.pg_dist_colocation SELECT * FROM public.pg_dist_colocation;
-- enterprise catalog tables
INSERT INTO pg_catalog.pg_dist_authinfo SELECT * FROM public.pg_dist_authinfo;
INSERT INTO pg_catalog.pg_dist_poolinfo SELECT * FROM public.pg_dist_poolinfo;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy DISABLE TRIGGER pg_dist_rebalance_strategy_enterprise_check_trigger;
INSERT INTO pg_catalog.pg_dist_rebalance_strategy SELECT
name,
default_strategy,
shard_cost_function::regprocedure::regproc,
node_capacity_function::regprocedure::regproc,
shard_allowed_on_node_function::regprocedure::regproc,
default_threshold,
minimum_threshold
FROM public.pg_dist_rebalance_strategy;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy ENABLE TRIGGER pg_dist_rebalance_strategy_enterprise_check_trigger;
--
-- drop backup tables
--
DROP TABLE public.pg_dist_authinfo;
DROP TABLE public.pg_dist_colocation;
DROP TABLE public.pg_dist_local_group;
DROP TABLE public.pg_dist_node;
DROP TABLE public.pg_dist_node_metadata;
DROP TABLE public.pg_dist_partition;
DROP TABLE public.pg_dist_placement;
DROP TABLE public.pg_dist_poolinfo;
DROP TABLE public.pg_dist_shard;
DROP TABLE public.pg_dist_transaction;
DROP TABLE public.pg_dist_rebalance_strategy;
--
-- reset sequences
--
PERFORM setval('pg_catalog.pg_dist_shardid_seq', (SELECT MAX(shardid)+1 AS max_shard_id FROM pg_dist_shard), false);
PERFORM setval('pg_catalog.pg_dist_placement_placementid_seq', (SELECT MAX(placementid)+1 AS max_placement_id FROM pg_dist_placement), false);
PERFORM setval('pg_catalog.pg_dist_groupid_seq', (SELECT MAX(groupid)+1 AS max_group_id FROM pg_dist_node), false);
PERFORM setval('pg_catalog.pg_dist_node_nodeid_seq', (SELECT MAX(nodeid)+1 AS max_node_id FROM pg_dist_node), false);
PERFORM setval('pg_catalog.pg_dist_colocationid_seq', (SELECT MAX(colocationid)+1 AS max_colocation_id FROM pg_dist_colocation), false);
--
-- register triggers
--
FOR table_name IN SELECT logicalrelid FROM pg_catalog.pg_dist_partition
LOOP
trigger_name := 'truncate_trigger_' || table_name::oid;
command := 'create trigger ' || trigger_name || ' after truncate on ' || table_name || ' execute procedure pg_catalog.citus_truncate_trigger()';
EXECUTE command;
command := 'update pg_trigger set tgisinternal = true where tgname = ' || quote_literal(trigger_name);
EXECUTE command;
END LOOP;
--
-- set dependencies
--
INSERT INTO pg_depend
SELECT
'pg_class'::regclass::oid as classid,
p.logicalrelid::regclass::oid as objid,
0 as objsubid,
'pg_extension'::regclass::oid as refclassid,
(select oid from pg_extension where extname = 'citus') as refobjid,
0 as refobjsubid ,
'n' as deptype
FROM pg_catalog.pg_dist_partition p;
-- restore pg_dist_object from the stable identifiers
TRUNCATE citus.pg_dist_object;
INSERT INTO citus.pg_dist_object (classid, objid, objsubid, distribution_argument_index, colocationid)
SELECT
address.classid,
address.objid,
address.objsubid,
naming.distribution_argument_index,
naming.colocationid
FROM
public.pg_dist_object naming,
pg_catalog.pg_get_object_address(naming.type, naming.object_names, naming.object_args) address;
DROP TABLE public.pg_dist_object;
PERFORM citus_internal.columnar_ensure_objects_exist();
END;
$cppu$;
COMMENT ON FUNCTION pg_catalog.citus_finish_pg_upgrade()
IS 'perform tasks to restore citus settings from a location that has been prepared before pg_upgrade';

View File

@@ -0,0 +1,105 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_finish_pg_upgrade()
RETURNS void
LANGUAGE plpgsql
SET search_path = pg_catalog
AS $cppu$
DECLARE
table_name regclass;
command text;
trigger_name text;
BEGIN
--
-- restore citus catalog tables
--
INSERT INTO pg_catalog.pg_dist_partition SELECT * FROM public.pg_dist_partition;
INSERT INTO pg_catalog.pg_dist_shard SELECT * FROM public.pg_dist_shard;
INSERT INTO pg_catalog.pg_dist_placement SELECT * FROM public.pg_dist_placement;
INSERT INTO pg_catalog.pg_dist_node_metadata SELECT * FROM public.pg_dist_node_metadata;
INSERT INTO pg_catalog.pg_dist_node SELECT * FROM public.pg_dist_node;
INSERT INTO pg_catalog.pg_dist_local_group SELECT * FROM public.pg_dist_local_group;
INSERT INTO pg_catalog.pg_dist_transaction SELECT * FROM public.pg_dist_transaction;
INSERT INTO pg_catalog.pg_dist_colocation SELECT * FROM public.pg_dist_colocation;
-- enterprise catalog tables
INSERT INTO pg_catalog.pg_dist_authinfo SELECT * FROM public.pg_dist_authinfo;
INSERT INTO pg_catalog.pg_dist_poolinfo SELECT * FROM public.pg_dist_poolinfo;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy DISABLE TRIGGER pg_dist_rebalance_strategy_enterprise_check_trigger;
INSERT INTO pg_catalog.pg_dist_rebalance_strategy SELECT
name,
default_strategy,
shard_cost_function::regprocedure::regproc,
node_capacity_function::regprocedure::regproc,
shard_allowed_on_node_function::regprocedure::regproc,
default_threshold,
minimum_threshold
FROM public.pg_dist_rebalance_strategy;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy ENABLE TRIGGER pg_dist_rebalance_strategy_enterprise_check_trigger;
--
-- drop backup tables
--
DROP TABLE public.pg_dist_authinfo;
DROP TABLE public.pg_dist_colocation;
DROP TABLE public.pg_dist_local_group;
DROP TABLE public.pg_dist_node;
DROP TABLE public.pg_dist_node_metadata;
DROP TABLE public.pg_dist_partition;
DROP TABLE public.pg_dist_placement;
DROP TABLE public.pg_dist_poolinfo;
DROP TABLE public.pg_dist_shard;
DROP TABLE public.pg_dist_transaction;
--
-- reset sequences
--
PERFORM setval('pg_catalog.pg_dist_shardid_seq', (SELECT MAX(shardid)+1 AS max_shard_id FROM pg_dist_shard), false);
PERFORM setval('pg_catalog.pg_dist_placement_placementid_seq', (SELECT MAX(placementid)+1 AS max_placement_id FROM pg_dist_placement), false);
PERFORM setval('pg_catalog.pg_dist_groupid_seq', (SELECT MAX(groupid)+1 AS max_group_id FROM pg_dist_node), false);
PERFORM setval('pg_catalog.pg_dist_node_nodeid_seq', (SELECT MAX(nodeid)+1 AS max_node_id FROM pg_dist_node), false);
PERFORM setval('pg_catalog.pg_dist_colocationid_seq', (SELECT MAX(colocationid)+1 AS max_colocation_id FROM pg_dist_colocation), false);
--
-- register triggers
--
FOR table_name IN SELECT logicalrelid FROM pg_catalog.pg_dist_partition
LOOP
trigger_name := 'truncate_trigger_' || table_name::oid;
command := 'create trigger ' || trigger_name || ' after truncate on ' || table_name || ' execute procedure pg_catalog.citus_truncate_trigger()';
EXECUTE command;
command := 'update pg_trigger set tgisinternal = true where tgname = ' || quote_literal(trigger_name);
EXECUTE command;
END LOOP;
--
-- set dependencies
--
INSERT INTO pg_depend
SELECT
'pg_class'::regclass::oid as classid,
p.logicalrelid::regclass::oid as objid,
0 as objsubid,
'pg_extension'::regclass::oid as refclassid,
(select oid from pg_extension where extname = 'citus') as refobjid,
0 as refobjsubid ,
'n' as deptype
FROM pg_catalog.pg_dist_partition p;
-- restore pg_dist_object from the stable identifiers
TRUNCATE citus.pg_dist_object;
INSERT INTO citus.pg_dist_object (classid, objid, objsubid, distribution_argument_index, colocationid)
SELECT
address.classid,
address.objid,
address.objsubid,
naming.distribution_argument_index,
naming.colocationid
FROM
public.pg_dist_object naming,
pg_catalog.pg_get_object_address(naming.type, naming.object_names, naming.object_args) address;
DROP TABLE public.pg_dist_object;
END;
$cppu$;
COMMENT ON FUNCTION pg_catalog.citus_finish_pg_upgrade()
IS 'perform tasks to restore citus settings from a location that has been prepared before pg_upgrade';

View File

@@ -0,0 +1,106 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_finish_pg_upgrade()
RETURNS void
LANGUAGE plpgsql
SET search_path = pg_catalog
AS $cppu$
DECLARE
table_name regclass;
command text;
trigger_name text;
BEGIN
--
-- restore citus catalog tables
--
INSERT INTO pg_catalog.pg_dist_partition SELECT * FROM public.pg_dist_partition;
INSERT INTO pg_catalog.pg_dist_shard SELECT * FROM public.pg_dist_shard;
INSERT INTO pg_catalog.pg_dist_placement SELECT * FROM public.pg_dist_placement;
INSERT INTO pg_catalog.pg_dist_node_metadata SELECT * FROM public.pg_dist_node_metadata;
INSERT INTO pg_catalog.pg_dist_node SELECT * FROM public.pg_dist_node;
INSERT INTO pg_catalog.pg_dist_local_group SELECT * FROM public.pg_dist_local_group;
INSERT INTO pg_catalog.pg_dist_transaction SELECT * FROM public.pg_dist_transaction;
INSERT INTO pg_catalog.pg_dist_colocation SELECT * FROM public.pg_dist_colocation;
-- enterprise catalog tables
INSERT INTO pg_catalog.pg_dist_authinfo SELECT * FROM public.pg_dist_authinfo;
INSERT INTO pg_catalog.pg_dist_poolinfo SELECT * FROM public.pg_dist_poolinfo;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy DISABLE TRIGGER pg_dist_rebalance_strategy_enterprise_check_trigger;
INSERT INTO pg_catalog.pg_dist_rebalance_strategy SELECT
name,
default_strategy,
shard_cost_function::regprocedure::regproc,
node_capacity_function::regprocedure::regproc,
shard_allowed_on_node_function::regprocedure::regproc,
default_threshold,
minimum_threshold
FROM public.pg_dist_rebalance_strategy;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy ENABLE TRIGGER pg_dist_rebalance_strategy_enterprise_check_trigger;
--
-- drop backup tables
--
DROP TABLE public.pg_dist_authinfo;
DROP TABLE public.pg_dist_colocation;
DROP TABLE public.pg_dist_local_group;
DROP TABLE public.pg_dist_node;
DROP TABLE public.pg_dist_node_metadata;
DROP TABLE public.pg_dist_partition;
DROP TABLE public.pg_dist_placement;
DROP TABLE public.pg_dist_poolinfo;
DROP TABLE public.pg_dist_shard;
DROP TABLE public.pg_dist_transaction;
DROP TABLE public.pg_dist_rebalance_strategy;
--
-- reset sequences
--
PERFORM setval('pg_catalog.pg_dist_shardid_seq', (SELECT MAX(shardid)+1 AS max_shard_id FROM pg_dist_shard), false);
PERFORM setval('pg_catalog.pg_dist_placement_placementid_seq', (SELECT MAX(placementid)+1 AS max_placement_id FROM pg_dist_placement), false);
PERFORM setval('pg_catalog.pg_dist_groupid_seq', (SELECT MAX(groupid)+1 AS max_group_id FROM pg_dist_node), false);
PERFORM setval('pg_catalog.pg_dist_node_nodeid_seq', (SELECT MAX(nodeid)+1 AS max_node_id FROM pg_dist_node), false);
PERFORM setval('pg_catalog.pg_dist_colocationid_seq', (SELECT MAX(colocationid)+1 AS max_colocation_id FROM pg_dist_colocation), false);
--
-- register triggers
--
FOR table_name IN SELECT logicalrelid FROM pg_catalog.pg_dist_partition
LOOP
trigger_name := 'truncate_trigger_' || table_name::oid;
command := 'create trigger ' || trigger_name || ' after truncate on ' || table_name || ' execute procedure pg_catalog.citus_truncate_trigger()';
EXECUTE command;
command := 'update pg_trigger set tgisinternal = true where tgname = ' || quote_literal(trigger_name);
EXECUTE command;
END LOOP;
--
-- set dependencies
--
INSERT INTO pg_depend
SELECT
'pg_class'::regclass::oid as classid,
p.logicalrelid::regclass::oid as objid,
0 as objsubid,
'pg_extension'::regclass::oid as refclassid,
(select oid from pg_extension where extname = 'citus') as refobjid,
0 as refobjsubid ,
'n' as deptype
FROM pg_catalog.pg_dist_partition p;
-- restore pg_dist_object from the stable identifiers
TRUNCATE citus.pg_dist_object;
INSERT INTO citus.pg_dist_object (classid, objid, objsubid, distribution_argument_index, colocationid)
SELECT
address.classid,
address.objid,
address.objsubid,
naming.distribution_argument_index,
naming.colocationid
FROM
public.pg_dist_object naming,
pg_catalog.pg_get_object_address(naming.type, naming.object_names, naming.object_args) address;
DROP TABLE public.pg_dist_object;
END;
$cppu$;
COMMENT ON FUNCTION pg_catalog.citus_finish_pg_upgrade()
IS 'perform tasks to restore citus settings from a location that has been prepared before pg_upgrade';

View File

@@ -86,17 +86,7 @@ BEGIN
FROM pg_catalog.pg_dist_partition p;
-- restore pg_dist_object from the stable identifiers
-- DELETE/INSERT to avoid primary key violations
WITH old_records AS (
DELETE FROM
citus.pg_dist_object
RETURNING
type,
object_names,
object_args,
distribution_argument_index,
colocationid
)
TRUNCATE citus.pg_dist_object;
INSERT INTO citus.pg_dist_object (classid, objid, objsubid, distribution_argument_index, colocationid)
SELECT
address.classid,
@@ -105,8 +95,10 @@ BEGIN
naming.distribution_argument_index,
naming.colocationid
FROM
old_records naming,
pg_get_object_address(naming.type, naming.object_names, naming.object_args) address;
public.pg_dist_object naming,
pg_catalog.pg_get_object_address(naming.type, naming.object_names, naming.object_args) address;
DROP TABLE public.pg_dist_object;
PERFORM citus_internal.columnar_ensure_objects_exist();
END;

View File

@@ -0,0 +1,44 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_prepare_pg_upgrade()
RETURNS void
LANGUAGE plpgsql
SET search_path = pg_catalog
AS $cppu$
BEGIN
--
-- backup citus catalog tables
--
CREATE TABLE public.pg_dist_partition AS SELECT * FROM pg_catalog.pg_dist_partition;
CREATE TABLE public.pg_dist_shard AS SELECT * FROM pg_catalog.pg_dist_shard;
CREATE TABLE public.pg_dist_placement AS SELECT * FROM pg_catalog.pg_dist_placement;
CREATE TABLE public.pg_dist_node_metadata AS SELECT * FROM pg_catalog.pg_dist_node_metadata;
CREATE TABLE public.pg_dist_node AS SELECT * FROM pg_catalog.pg_dist_node;
CREATE TABLE public.pg_dist_local_group AS SELECT * FROM pg_catalog.pg_dist_local_group;
CREATE TABLE public.pg_dist_transaction AS SELECT * FROM pg_catalog.pg_dist_transaction;
CREATE TABLE public.pg_dist_colocation AS SELECT * FROM pg_catalog.pg_dist_colocation;
-- enterprise catalog tables
CREATE TABLE public.pg_dist_authinfo AS SELECT * FROM pg_catalog.pg_dist_authinfo;
CREATE TABLE public.pg_dist_poolinfo AS SELECT * FROM pg_catalog.pg_dist_poolinfo;
CREATE TABLE public.pg_dist_rebalance_strategy AS SELECT
name,
default_strategy,
shard_cost_function::regprocedure::text,
node_capacity_function::regprocedure::text,
shard_allowed_on_node_function::regprocedure::text,
default_threshold,
minimum_threshold
FROM pg_catalog.pg_dist_rebalance_strategy;
-- store upgrade stable identifiers on pg_dist_object catalog
CREATE TABLE public.pg_dist_object AS SELECT
address.type,
address.object_names,
address.object_args,
objects.distribution_argument_index,
objects.colocationid
FROM citus.pg_dist_object objects,
pg_catalog.pg_identify_object_as_address(objects.classid, objects.objid, objects.objsubid) address;
END;
$cppu$;
COMMENT ON FUNCTION pg_catalog.citus_prepare_pg_upgrade()
IS 'perform tasks to copy citus settings to a location that could later be restored after pg_upgrade is done';

View File

@@ -0,0 +1,60 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_prepare_pg_upgrade()
RETURNS void
LANGUAGE plpgsql
SET search_path = pg_catalog
AS $cppu$
BEGIN
--
-- Drop existing backup tables
--
DROP TABLE IF EXISTS public.pg_dist_partition;
DROP TABLE IF EXISTS public.pg_dist_shard;
DROP TABLE IF EXISTS public.pg_dist_placement;
DROP TABLE IF EXISTS public.pg_dist_node_metadata;
DROP TABLE IF EXISTS public.pg_dist_node;
DROP TABLE IF EXISTS public.pg_dist_local_group;
DROP TABLE IF EXISTS public.pg_dist_transaction;
DROP TABLE IF EXISTS public.pg_dist_colocation;
DROP TABLE IF EXISTS public.pg_dist_authinfo;
DROP TABLE IF EXISTS public.pg_dist_poolinfo;
DROP TABLE IF EXISTS public.pg_dist_rebalance_strategy;
DROP TABLE IF EXISTS public.pg_dist_object;
--
-- backup citus catalog tables
--
CREATE TABLE public.pg_dist_partition AS SELECT * FROM pg_catalog.pg_dist_partition;
CREATE TABLE public.pg_dist_shard AS SELECT * FROM pg_catalog.pg_dist_shard;
CREATE TABLE public.pg_dist_placement AS SELECT * FROM pg_catalog.pg_dist_placement;
CREATE TABLE public.pg_dist_node_metadata AS SELECT * FROM pg_catalog.pg_dist_node_metadata;
CREATE TABLE public.pg_dist_node AS SELECT * FROM pg_catalog.pg_dist_node;
CREATE TABLE public.pg_dist_local_group AS SELECT * FROM pg_catalog.pg_dist_local_group;
CREATE TABLE public.pg_dist_transaction AS SELECT * FROM pg_catalog.pg_dist_transaction;
CREATE TABLE public.pg_dist_colocation AS SELECT * FROM pg_catalog.pg_dist_colocation;
-- enterprise catalog tables
CREATE TABLE public.pg_dist_authinfo AS SELECT * FROM pg_catalog.pg_dist_authinfo;
CREATE TABLE public.pg_dist_poolinfo AS SELECT * FROM pg_catalog.pg_dist_poolinfo;
CREATE TABLE public.pg_dist_rebalance_strategy AS SELECT
name,
default_strategy,
shard_cost_function::regprocedure::text,
node_capacity_function::regprocedure::text,
shard_allowed_on_node_function::regprocedure::text,
default_threshold,
minimum_threshold
FROM pg_catalog.pg_dist_rebalance_strategy;
-- store upgrade stable identifiers on pg_dist_object catalog
CREATE TABLE public.pg_dist_object AS SELECT
address.type,
address.object_names,
address.object_args,
objects.distribution_argument_index,
objects.colocationid
FROM citus.pg_dist_object objects,
pg_catalog.pg_identify_object_as_address(objects.classid, objects.objid, objects.objsubid) address;
END;
$cppu$;
COMMENT ON FUNCTION pg_catalog.citus_prepare_pg_upgrade()
IS 'perform tasks to copy citus settings to a location that could later be restored after pg_upgrade is done';
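-- The intended sequence around a major PG upgrade (an illustrative sketch,
-- not part of this file):
--   SELECT citus_prepare_pg_upgrade();   -- on the old cluster, before pg_upgrade
--   -- ... run pg_upgrade and start the new cluster ...
--   SELECT citus_finish_pg_upgrade();    -- on the new cluster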

View File

@@ -18,6 +18,7 @@ BEGIN
DROP TABLE IF EXISTS public.pg_dist_authinfo;
DROP TABLE IF EXISTS public.pg_dist_poolinfo;
DROP TABLE IF EXISTS public.pg_dist_rebalance_strategy;
DROP TABLE IF EXISTS public.pg_dist_object;
--
-- backup citus catalog tables
@@ -44,8 +45,14 @@ BEGIN
FROM pg_catalog.pg_dist_rebalance_strategy;
-- store upgrade stable identifiers on pg_dist_object catalog
UPDATE citus.pg_dist_object
SET (type, object_names, object_args) = (SELECT * FROM pg_identify_object_as_address(classid, objid, objsubid));
CREATE TABLE public.pg_dist_object AS SELECT
address.type,
address.object_names,
address.object_args,
objects.distribution_argument_index,
objects.colocationid
FROM citus.pg_dist_object objects,
pg_catalog.pg_identify_object_as_address(objects.classid, objects.objid, objects.objsubid) address;
END;
$cppu$;

View File

@@ -0,0 +1,38 @@
DO $$
declare
citus_tables_create_query text;
BEGIN
citus_tables_create_query=$CTCQ$
CREATE OR REPLACE VIEW %I.citus_tables AS
SELECT
logicalrelid AS table_name,
CASE WHEN partkey IS NOT NULL THEN 'distributed' ELSE 'reference' END AS citus_table_type,
coalesce(column_to_column_name(logicalrelid, partkey), '<none>') AS distribution_column,
colocationid AS colocation_id,
pg_size_pretty(citus_total_relation_size(logicalrelid, fail_on_error := false)) AS table_size,
(select count(*) from pg_dist_shard where logicalrelid = p.logicalrelid) AS shard_count,
pg_get_userbyid(relowner) AS table_owner,
amname AS access_method
FROM
pg_dist_partition p
JOIN
pg_class c ON (p.logicalrelid = c.oid)
LEFT JOIN
pg_am a ON (a.oid = c.relam)
WHERE
partkey IS NOT NULL OR repmodel = 't'
ORDER BY
logicalrelid::text;
$CTCQ$;
IF EXISTS (SELECT 1 FROM pg_namespace WHERE nspname = 'public') THEN
EXECUTE format(citus_tables_create_query, 'public');
GRANT SELECT ON public.citus_tables TO public;
ELSE
EXECUTE format(citus_tables_create_query, 'citus');
ALTER VIEW citus.citus_tables SET SCHEMA pg_catalog;
GRANT SELECT ON pg_catalog.citus_tables TO public;
END IF;
END;
$$;
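-- Once created, the view can be queried like any catalog view, e.g.
-- (illustrative only):
--   SELECT table_name, citus_table_type, distribution_column, shard_count
--   FROM citus_tables;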

View File

@@ -1,20 +1,38 @@
CREATE VIEW public.citus_tables AS
SELECT
logicalrelid AS table_name,
CASE WHEN partkey IS NOT NULL THEN 'distributed' ELSE 'reference' END AS citus_table_type,
coalesce(column_to_column_name(logicalrelid, partkey), '<none>') AS distribution_column,
colocationid AS colocation_id,
pg_size_pretty(citus_total_relation_size(logicalrelid, fail_on_error := false)) AS table_size,
(select count(*) from pg_dist_shard where logicalrelid = p.logicalrelid) AS shard_count,
pg_get_userbyid(relowner) AS table_owner,
amname AS access_method
FROM
pg_dist_partition p
JOIN
pg_class c ON (p.logicalrelid = c.oid)
LEFT JOIN
pg_am a ON (a.oid = c.relam)
WHERE
partkey IS NOT NULL OR repmodel = 't'
ORDER BY
logicalrelid::text;
DO $$
declare
citus_tables_create_query text;
BEGIN
citus_tables_create_query=$CTCQ$
CREATE OR REPLACE VIEW %I.citus_tables AS
SELECT
logicalrelid AS table_name,
CASE WHEN partkey IS NOT NULL THEN 'distributed' ELSE 'reference' END AS citus_table_type,
coalesce(column_to_column_name(logicalrelid, partkey), '<none>') AS distribution_column,
colocationid AS colocation_id,
pg_size_pretty(citus_total_relation_size(logicalrelid, fail_on_error := false)) AS table_size,
(select count(*) from pg_dist_shard where logicalrelid = p.logicalrelid) AS shard_count,
pg_get_userbyid(relowner) AS table_owner,
amname AS access_method
FROM
pg_dist_partition p
JOIN
pg_class c ON (p.logicalrelid = c.oid)
LEFT JOIN
pg_am a ON (a.oid = c.relam)
WHERE
partkey IS NOT NULL OR repmodel = 't'
ORDER BY
logicalrelid::text;
$CTCQ$;
IF EXISTS (SELECT 1 FROM pg_namespace WHERE nspname = 'public') THEN
EXECUTE format(citus_tables_create_query, 'public');
GRANT SELECT ON public.citus_tables TO public;
ELSE
EXECUTE format(citus_tables_create_query, 'citus');
ALTER VIEW citus.citus_tables SET SCHEMA pg_catalog;
GRANT SELECT ON pg_catalog.citus_tables TO public;
END IF;
END;
$$;

View File

@@ -0,0 +1,6 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_update_table_statistics(relation regclass)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_update_table_statistics$$;
COMMENT ON FUNCTION pg_catalog.citus_update_table_statistics(regclass)
IS 'updates shard statistics of the given table';

View File

@@ -1,17 +1,6 @@
CREATE FUNCTION pg_catalog.citus_update_table_statistics(relation regclass)
RETURNS VOID AS $$
DECLARE
colocated_tables regclass[];
BEGIN
SELECT get_colocated_table_array(relation) INTO colocated_tables;
PERFORM
master_update_shard_statistics(shardid)
FROM
pg_dist_shard
WHERE
logicalrelid = ANY (colocated_tables);
END;
$$ LANGUAGE 'plpgsql';
CREATE OR REPLACE FUNCTION pg_catalog.citus_update_table_statistics(relation regclass)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_update_table_statistics$$;
COMMENT ON FUNCTION pg_catalog.citus_update_table_statistics(regclass)
IS 'updates shard statistics of the given table and its colocated tables';
IS 'updates shard statistics of the given table';

View File

@@ -5,12 +5,13 @@ FROM (
FROM pg_class c
JOIN pg_inherits i ON (c.oid = inhrelid)
JOIN pg_partitioned_table p ON (inhparent = partrelid)
JOIN pg_attribute a ON (partrelid = attrelid AND ARRAY[attnum] <@ string_to_array(partattrs::text, ' ')::int2[])
JOIN pg_attribute a ON (partrelid = attrelid)
JOIN pg_type t ON (atttypid = t.oid)
JOIN pg_namespace tn ON (t.typnamespace = tn.oid)
LEFT JOIN pg_am am ON (c.relam = am.oid),
pg_catalog.time_partition_range(c.oid)
WHERE c.relpartbound IS NOT NULL AND p.partstrat = 'r' AND p.partnatts = 1
AND a.attnum = ANY(partattrs::int2[])
) partitions
ORDER BY partrelid::text, lower_bound;

View File

@@ -5,12 +5,13 @@ FROM (
FROM pg_class c
JOIN pg_inherits i ON (c.oid = inhrelid)
JOIN pg_partitioned_table p ON (inhparent = partrelid)
JOIN pg_attribute a ON (partrelid = attrelid AND ARRAY[attnum] <@ string_to_array(partattrs::text, ' ')::int2[])
JOIN pg_attribute a ON (partrelid = attrelid)
JOIN pg_type t ON (atttypid = t.oid)
JOIN pg_namespace tn ON (t.typnamespace = tn.oid)
LEFT JOIN pg_am am ON (c.relam = am.oid),
pg_catalog.time_partition_range(c.oid)
WHERE c.relpartbound IS NOT NULL AND p.partstrat = 'r' AND p.partnatts = 1
AND a.attnum = ANY(partattrs::int2[])
) partitions
ORDER BY partrelid::text, lower_bound;

View File

@@ -15,6 +15,7 @@
#include "catalog/pg_type.h"
#include "distributed/connection_management.h"
#include "distributed/intermediate_result_pruning.h"
#include "distributed/listutils.h"
#include "distributed/maintenanced.h"
#include "distributed/metadata_sync.h"
@@ -104,7 +105,7 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS)
}
MultiConnection *connection = GetNodeConnection(FORCE_NEW_CONNECTION,
"localhost", PostPortNumber);
LOCAL_HOST_NAME, PostPortNumber);
ExecuteCriticalRemoteCommand(connection, "LISTEN " METADATA_SYNC_CHANNEL);
int waitFlags = WL_SOCKET_READABLE | WL_TIMEOUT | WL_POSTMASTER_DEATH;

View File

@@ -36,12 +36,13 @@ create_progress(PG_FUNCTION_ARGS)
{
uint64 magicNumber = PG_GETARG_INT64(0);
int stepCount = PG_GETARG_INT32(1);
ProgressMonitorData *monitor = CreateProgressMonitor(magicNumber, stepCount,
sizeof(uint64), 0);
dsm_handle dsmHandle;
ProgressMonitorData *monitor = CreateProgressMonitor(stepCount,
sizeof(uint64), &dsmHandle);
if (monitor != NULL)
{
uint64 *steps = (uint64 *) monitor->steps;
uint64 *steps = (uint64 *) ProgressMonitorSteps(monitor);
int i = 0;
for (; i < stepCount; i++)
@@ -50,6 +51,7 @@ create_progress(PG_FUNCTION_ARGS)
}
}
RegisterProgressMonitor(magicNumber, 0, dsmHandle);
PG_RETURN_VOID();
}
@@ -64,7 +66,7 @@ update_progress(PG_FUNCTION_ARGS)
if (monitor != NULL && step < monitor->stepCount)
{
uint64 *steps = (uint64 *) monitor->steps;
uint64 *steps = (uint64 *) ProgressMonitorSteps(monitor);
steps[step] = newValue;
}
@@ -93,7 +95,7 @@ show_progress(PG_FUNCTION_ARGS)
ProgressMonitorData *monitor = NULL;
foreach_ptr(monitor, monitorList)
{
uint64 *steps = monitor->steps;
uint64 *steps = ProgressMonitorSteps(monitor);
for (int stepIndex = 0; stepIndex < monitor->stepCount; stepIndex++)
{

View File

@@ -17,10 +17,10 @@
#include "access/xact.h"
#include "distributed/connection_management.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/function_utils.h"
#include "distributed/intermediate_result_pruning.h"
#include "distributed/lock_graph.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/remote_commands.h"
#include "distributed/run_from_same_connection.h"

View File

@@ -18,9 +18,14 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "distributed/transaction_management.h"
static Size MemoryContextTotalSpace(MemoryContext context);
PG_FUNCTION_INFO_V1(top_transaction_context_size);
PG_FUNCTION_INFO_V1(coordinated_transaction_should_use_2PC);
/*
* top_transaction_context_size returns current size of TopTransactionContext.
@@ -54,3 +59,20 @@ MemoryContextTotalSpace(MemoryContext context)
return totalSpace;
}
/*
* coordinated_transaction_should_use_2PC returns true if the transaction is in a
* coordinated transaction and uses 2PC. If the transaction is not in a
* coordinated transaction, the function throws an error.
*/
Datum
coordinated_transaction_should_use_2PC(PG_FUNCTION_ARGS)
{
if (!InCoordinatedTransaction())
{
ereport(ERROR, (errmsg("The transaction is not a coordinated transaction")));
}
PG_RETURN_BOOL(GetCoordinatedTransactionShouldUse2PC());
}
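/*
 * A usage sketch from SQL, assuming the UDF is exposed by a regression test
 * schedule (the CREATE FUNCTION statement is not part of this hunk, and the
 * table name is a hypothetical placeholder):
 *
 *   BEGIN;
 *   DELETE FROM some_distributed_table;
 *   SELECT coordinated_transaction_should_use_2PC();  -- true iff 2PC was picked
 *   COMMIT;
 */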

View File

@@ -793,7 +793,8 @@ CheckConflictingRelationAccesses(Oid relationId, ShardPlacementAccessType access
"foreign keys. Any parallel modification to "
"those hash distributed tables in the same "
"transaction can only be executed in sequential query "
"execution mode", relationName)));
"execution mode",
relationName != NULL ? relationName : "<dropped>")));
/*
* Switching to sequential mode is admittedly confusing and, could be useless

View File

@@ -20,6 +20,7 @@
#include "distributed/connection_management.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/placement_connection.h"
#include "distributed/remote_commands.h"
#include "distributed/remote_transaction.h"
#include "distributed/transaction_identifier.h"
@@ -782,8 +783,16 @@ CoordinatedRemoteTransactionsPrepare(void)
continue;
}
StartRemoteTransactionPrepare(connection);
connectionList = lappend(connectionList, connection);
/*
* Check if any DML or DDL was executed over the connection on any
* placement/table. If so, we start preparing the transaction; otherwise
* we skip the prepare, since the connection didn't perform any writes
* (i.e., it was read-only).
*/
if (ConnectionModifiedPlacement(connection))
{
StartRemoteTransactionPrepare(connection);
connectionList = lappend(connectionList, connection);
}
}
bool raiseInterrupts = true;
@ -798,6 +807,10 @@ CoordinatedRemoteTransactionsPrepare(void)
if (transaction->transactionState != REMOTE_TRANS_PREPARING)
{
/*
* Verify that the connection didn't modify any placement
*/
Assert(!ConnectionModifiedPlacement(connection));
continue;
}
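Taken together, the two hunks above implement a small 2PC optimization: only connections that performed a write are sent PREPARE TRANSACTION, while read-only connections never enter REMOTE_TRANS_PREPARING and are later finished with a plain COMMIT. A condensed sketch of the prepare loop (not the real function body; allConnectionList stands in for the iterated list):

/* Condensed sketch of the prepare loop after this change: read-only
 * connections are skipped, so they never reach REMOTE_TRANS_PREPARING. */
MultiConnection *connection = NULL;
List *connectionList = NIL;

foreach_ptr(connection, allConnectionList)
{
    if (!ConnectionModifiedPlacement(connection))
    {
        /* read-only connection: no PREPARE needed */
        continue;
    }

    StartRemoteTransactionPrepare(connection);
    connectionList = lappend(connectionList, connection);
}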

View File

@ -96,9 +96,16 @@ MemoryContext CommitContext = NULL;
/*
* Should this coordinated transaction use 2PC? Set by
* CoordinatedTransactionUse2PC(), e.g. if DDL was issued and
* MultiShardCommitProtocol was set to 2PC.
* MultiShardCommitProtocol was set to 2PC. But, even if this
* flag is set, the transaction manager is smart enough to only
* do 2PC on the remote connections that did a modification.
*
* As a variable name, ShouldCoordinatedTransactionUse2PC could
* be improved. We use CoordinatedTransactionShouldUse2PC() as the
* public API function, hence we couldn't come up with a better
* name for the underlying variable at the moment.
*/
bool CoordinatedTransactionUses2PC = false;
bool ShouldCoordinatedTransactionUse2PC = false;
/* if disabled, distributed statements in a function may run as separate transactions */
bool FunctionOpensTransactionBlock = true;
@ -183,15 +190,29 @@ InCoordinatedTransaction(void)
/*
* CoordinatedTransactionUse2PC() signals that the current coordinated
* CoordinatedTransactionShouldUse2PC() signals that the current coordinated
* transaction should use 2PC to commit.
*
* Note that even if 2PC is enabled, it is only used for connections that make
* modifications (DML or DDL).
*/
void
CoordinatedTransactionUse2PC(void)
CoordinatedTransactionShouldUse2PC(void)
{
Assert(InCoordinatedTransaction());
CoordinatedTransactionUses2PC = true;
ShouldCoordinatedTransactionUse2PC = true;
}
/*
* GetCoordinatedTransactionShouldUse2PC is a wrapper function to read the value
* of the ShouldCoordinatedTransactionUse2PC flag.
*/
bool
GetCoordinatedTransactionShouldUse2PC(void)
{
return ShouldCoordinatedTransactionUse2PC;
}
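Downstream callers (see the worker-command hunks later in this diff) combine the renamed setter with the new getter roughly as follows; the sketch is condensed from those hunks and from the commit callback, with the surrounding functions omitted:

/* Condensed usage sketch of the renamed API: mark the coordinated
 * transaction as requiring 2PC, then read the flag back at commit time. */
UseCoordinatedTransaction();
CoordinatedTransactionShouldUse2PC();

/* ... later, in the commit callback ... */
if (GetCoordinatedTransactionShouldUse2PC())
{
    CoordinatedRemoteTransactionsPrepare();
}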
@ -297,28 +318,8 @@ CoordinatedTransactionCallback(XactEvent event, void *arg)
/* stop propagating notices from workers, we know the query is failed */
DisableWorkerMessagePropagation();
/*
* FIXME: Add warning for the COORD_TRANS_COMMITTED case. That
* can be reached if this backend fails after the
* XACT_EVENT_PRE_COMMIT state.
*/
RemoveIntermediateResultsDirectory();
/*
* Call other parts of citus that need to integrate into
* transaction management. Do so before doing other work, so the
* callbacks still can perform work if needed.
*/
{
/*
* On Windows it's not possible to delete a file before you've closed all
* handles to it (rmdir will return success but not take effect). Since
* we're in an ABORT handler it's very likely that not all handles have
* been closed; force them closed here before running
* RemoveIntermediateResultsDirectory.
*/
AtEOXact_Files(false);
RemoveIntermediateResultsDirectory();
}
ResetShardPlacementTransactionState();
/* handles both already prepared and open transactions */
@ -425,7 +426,7 @@ CoordinatedTransactionCallback(XactEvent event, void *arg)
*/
MarkFailedShardPlacements();
if (CoordinatedTransactionUses2PC)
if (ShouldCoordinatedTransactionUse2PC)
{
CoordinatedRemoteTransactionsPrepare();
CurrentCoordinatedTransactionState = COORD_TRANS_PREPARED;
@ -453,7 +454,7 @@ CoordinatedTransactionCallback(XactEvent event, void *arg)
* Check again whether shards/placement successfully
* committed. This handles failure at COMMIT/PREPARE time.
*/
PostCommitMarkFailedShardPlacements(CoordinatedTransactionUses2PC);
PostCommitMarkFailedShardPlacements(ShouldCoordinatedTransactionUse2PC);
break;
}
@ -485,7 +486,7 @@ ResetGlobalVariables()
FreeSavedExplainPlan();
dlist_init(&InProgressTransactions);
activeSetStmts = NULL;
CoordinatedTransactionUses2PC = false;
ShouldCoordinatedTransactionUse2PC = false;
TransactionModifiedNodeMetadata = false;
MetadataSyncOnCommit = false;
ResetWorkerErrorIndication();

View File

@ -96,7 +96,7 @@ SendCommandToWorkerAsUser(const char *nodeName, int32 nodePort, const char *node
uint32 connectionFlags = 0;
UseCoordinatedTransaction();
CoordinatedTransactionUse2PC();
CoordinatedTransactionShouldUse2PC();
MultiConnection *transactionConnection = GetNodeUserDatabaseConnection(
connectionFlags, nodeName,
@ -404,7 +404,7 @@ SendCommandToWorkersParamsInternal(TargetWorkerSet targetWorkerSet, const char *
List *workerNodeList = TargetWorkerSetNodeList(targetWorkerSet, ShareLock);
UseCoordinatedTransaction();
CoordinatedTransactionUse2PC();
CoordinatedTransactionShouldUse2PC();
/* open connections in parallel */
WorkerNode *workerNode = NULL;

View File

@ -18,6 +18,7 @@
#include "access/htup_details.h"
#include "distributed/distribution_column.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/version_compat.h"
#include "nodes/makefuncs.h"
#include "nodes/nodes.h"
@ -115,6 +116,53 @@ column_to_column_name(PG_FUNCTION_ARGS)
}
/*
* FindColumnWithNameOnTargetRelation gets a source table and
* column name. The function returns the column with the
* same name on the target table.
*
* Note that due to dropping columns, the parent's distribution key may not
* match the partition's distribution key. See issue #5123.
*
* The function throws an error if the input or output is not valid or does
* not exist.
*/
Var *
FindColumnWithNameOnTargetRelation(Oid sourceRelationId, char *sourceColumnName,
Oid targetRelationId)
{
if (sourceColumnName == NULL || sourceColumnName[0] == '\0')
{
ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("cannot find the given column on table \"%s\"",
generate_qualified_relation_name(sourceRelationId))));
}
AttrNumber attributeNumberOnTarget = get_attnum(targetRelationId, sourceColumnName);
if (attributeNumberOnTarget == InvalidAttrNumber)
{
ereport(ERROR, (errmsg("Column \"%s\" does not exist on "
"relation \"%s\"", sourceColumnName,
get_rel_name(targetRelationId))));
}
Index varNo = 1;
Oid targetTypeId = InvalidOid;
int32 targetTypMod = 0;
Oid targetCollation = InvalidOid;
Index varlevelsup = 0;
/* this function throws an error in case anything goes wrong */
get_atttypetypmodcoll(targetRelationId, attributeNumberOnTarget,
&targetTypeId, &targetTypMod, &targetCollation);
Var *targetColumn =
makeVar(varNo, attributeNumberOnTarget, targetTypeId, targetTypMod,
targetCollation, varlevelsup);
return targetColumn;
}
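The helper presumably exists to translate a distribution column between a parent table and its partitions by name, since attribute numbers can drift apart after DROP COLUMN. A hypothetical caller (DistPartitionKey() is an existing Citus lookup; the wrapper itself is illustrative):

/* Illustrative caller: resolve the parent's distribution column on a
 * partition by name rather than by attribute number, since the numbers
 * can diverge after DROP COLUMN (issue #5123 mentioned above). */
static Var *
PartitionDistributionColumn(Oid parentRelationId, Oid partitionRelationId)
{
    Var *parentDistKey = DistPartitionKey(parentRelationId);
    char *columnName = get_attname(parentRelationId, parentDistKey->varattno,
                                   false /* missing_ok */);

    return FindColumnWithNameOnTargetRelation(parentRelationId, columnName,
                                              partitionRelationId);
}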
/*
* BuildDistributionKeyFromColumnName builds a simple distribution key consisting
* only out of a reference to the column of name columnName. Errors out if the

View File

@ -100,9 +100,6 @@ static ForeignConstraintRelationshipNode * CreateOrFindNode(HTAB *adjacencyLists
relid);
static List * GetConnectedListHelper(ForeignConstraintRelationshipNode *node,
bool isReferencing);
static HTAB * CreateOidVisitedHashSet(void);
static bool OidVisited(HTAB *oidVisitedMap, Oid oid);
static void VisitOid(HTAB *oidVisitedMap, Oid oid);
static List * GetForeignConstraintRelationshipHelper(Oid relationId, bool isReferencing);
@ -442,7 +439,7 @@ GetConnectedListHelper(ForeignConstraintRelationshipNode *node, bool isReferenci
* As hash_create allocates memory in heap, callers are responsible to call
* hash_destroy when appropriate.
*/
static HTAB *
HTAB *
CreateOidVisitedHashSet(void)
{
HASHCTL info = { 0 };
@ -464,7 +461,7 @@ CreateOidVisitedHashSet(void)
/*
* OidVisited returns true if given oid is visited according to given oid hash-set.
*/
static bool
bool
OidVisited(HTAB *oidVisitedMap, Oid oid)
{
bool found = false;
@ -476,7 +473,7 @@ OidVisited(HTAB *oidVisitedMap, Oid oid)
/*
* VisitOid sets given oid as visited in given hash-set.
*/
static void
void
VisitOid(HTAB *oidVisitedMap, Oid oid)
{
bool found = false;

View File

@ -644,7 +644,8 @@ CitusMaintenanceDaemonMain(Datum main_arg)
*/
lastShardCleanTime = GetCurrentTimestamp();
numberOfDroppedShards = TryDropMarkedShards();
bool waitForCleanupLock = false;
numberOfDroppedShards = TryDropMarkedShards(waitForCleanupLock);
}
CommitTransactionCommand();

View File

@ -548,13 +548,14 @@ PartitionParentOid(Oid partitionOid)
/*
* LongestPartitionName is a utility function that returns the partition
* name which is the longest in terms of number of characters.
* PartitionWithLongestNameRelationId is a utility function that returns the
* oid of the partition table that has the longest name in terms of number of
* characters.
*/
char *
LongestPartitionName(Oid parentRelationId)
Oid
PartitionWithLongestNameRelationId(Oid parentRelationId)
{
char *longestName = NULL;
Oid longestNamePartitionId = InvalidOid;
int longestNameLength = 0;
List *partitionList = PartitionList(parentRelationId);
@ -565,12 +566,12 @@ LongestPartitionName(Oid parentRelationId)
int partitionNameLength = strnlen(partitionName, NAMEDATALEN);
if (partitionNameLength > longestNameLength)
{
longestName = partitionName;
longestNamePartitionId = partitionRelationId;
longestNameLength = partitionNameLength;
}
}
return longestName;
return longestNamePartitionId;
}
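Returning the Oid rather than the name lets callers fetch whatever metadata they need about the winning partition. For illustration, the old behavior can be recovered with a one-line shim (hypothetical, not part of the diff):

/* Illustrative shim: the old LongestPartitionName() behavior expressed in
 * terms of the new Oid-returning function. */
static char *
LongestPartitionNameShim(Oid parentRelationId)
{
    Oid partitionRelationId = PartitionWithLongestNameRelationId(parentRelationId);

    return OidIsValid(partitionRelationId) ? get_rel_name(partitionRelationId) : NULL;
}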

View File

@ -193,7 +193,7 @@ EnsureReferenceTablesExistOnAllNodesExtended(char transferMode)
int connectionFlags = OUTSIDE_TRANSACTION;
MultiConnection *connection = GetNodeUserDatabaseConnection(
connectionFlags, "localhost", PostPortNumber,
connectionFlags, LocalHostName, PostPortNumber,
userName, NULL);
if (PQstatus(connection->pgConn) == CONNECTION_OK)

View File

@ -387,6 +387,37 @@ SetLocktagForShardDistributionMetadata(int64 shardId, LOCKTAG *tag)
}
/*
* LockPlacementCleanup takes an exclusive lock to ensure that only one process
* can clean up placements at the same time.
*/
void
LockPlacementCleanup(void)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
SET_LOCKTAG_PLACEMENT_CLEANUP(tag);
(void) LockAcquire(&tag, ExclusiveLock, sessionLock, dontWait);
}
/*
* TryLockPlacementCleanup attempts to take an exclusive lock without waiting,
* so that only one process can clean up placements at a time; it returns
* false if the lock is already held.
*/
bool
TryLockPlacementCleanup(void)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = true;
SET_LOCKTAG_PLACEMENT_CLEANUP(tag);
bool lockAcquired = LockAcquire(&tag, ExclusiveLock, sessionLock, dontWait);
return lockAcquired;
}
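The only difference between the two functions is the dontWait flag passed to LockAcquire(). A caller that must not block, such as the maintenance-daemon path changed earlier in this diff (waitForCleanupLock = false), would use the Try variant along these lines (DoPlacementCleanup() is a hypothetical stand-in for the actual work):

/* Hypothetical non-blocking caller: skip this cleanup round when another
 * backend already holds the placement cleanup lock. */
static bool
CleanupPlacementsIfIdle(void)
{
    if (!TryLockPlacementCleanup())
    {
        /* lock is taken; don't block the maintenance loop */
        return false;
    }

    DoPlacementCleanup();
    return true;
}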
/*
* LockReferencedReferenceShardDistributionMetadata acquires shard distribution
* metadata locks with the given lock mode on the reference tables which have a
@ -502,8 +533,6 @@ LockShardResource(uint64 shardId, LOCKMODE lockmode)
const bool sessionLock = false;
const bool dontWait = false;
AssertArg(shardId != INVALID_SHARD_ID);
SET_LOCKTAG_SHARD_RESOURCE(tag, MyDatabaseId, shardId);
(void) LockAcquire(&tag, lockmode, sessionLock, dontWait);

View File

@ -41,7 +41,7 @@ alter_role_if_exists(PG_FUNCTION_ARGS)
Node *parseTree = ParseTreeNode(utilityQuery);
ProcessUtilityParseTree(parseTree, utilityQuery, PROCESS_UTILITY_TOPLEVEL, NULL,
ProcessUtilityParseTree(parseTree, utilityQuery, PROCESS_UTILITY_QUERY, NULL,
None_Receiver, NULL);
PG_RETURN_BOOL(true);
@ -98,7 +98,7 @@ worker_create_or_alter_role(PG_FUNCTION_ARGS)
ProcessUtilityParseTree(parseTree,
createRoleUtilityQuery,
PROCESS_UTILITY_TOPLEVEL,
PROCESS_UTILITY_QUERY,
NULL,
None_Receiver, NULL);
@ -126,7 +126,7 @@ worker_create_or_alter_role(PG_FUNCTION_ARGS)
ProcessUtilityParseTree(parseTree,
alterRoleUtilityQuery,
PROCESS_UTILITY_TOPLEVEL,
PROCESS_UTILITY_QUERY,
NULL,
None_Receiver, NULL);

View File

@ -11,10 +11,17 @@
#include "postgres.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_utility.h"
#include "distributed/relay_utility.h"
#include "distributed/shard_utils.h"
static int GetLargestShardId(void);
/*
* GetTableLocalShardOid returns the oid of the shard from the given distributed
* relation with the shardId.
@ -36,3 +43,81 @@ GetTableLocalShardOid(Oid citusTableOid, uint64 shardId)
return shardRelationOid;
}
/*
* GetLongestShardName is a utility function that returns the name of the shard of a
* table that has the longest name in terms of number of characters.
*
* Both the Oid and the name of the table are required so we can build the
* longest shard name even after a RENAME.
*/
char *
GetLongestShardName(Oid citusTableOid, char *finalRelationName)
{
char *longestShardName = pstrdup(finalRelationName);
ShardInterval *shardInterval = LoadShardIntervalWithLongestShardName(citusTableOid);
AppendShardIdToName(&longestShardName, shardInterval->shardId);
return longestShardName;
}
/*
* GetLongestShardNameForLocalPartition is a utility function that creates a hypothetical shard
* name for a partition table that is not distributed yet.
*/
char *
GetLongestShardNameForLocalPartition(Oid parentTableOid, char *partitionRelationName)
{
char *longestShardName = pstrdup(partitionRelationName);
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(parentTableOid);
int shardIntervalCount = cacheEntry->shardIntervalArrayLength;
int newShardId = GetLargestShardId() + shardIntervalCount;
AppendShardIdToName(&longestShardName, newShardId);
return longestShardName;
}
/*
* GetLargestShardId returns the largest shard id, falling back to 10^6 when
* the last value cannot be read from the sequence.
*/
int
GetLargestShardId()
{
Oid savedUserId = InvalidOid;
int savedSecurityContext = 0;
GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);
text *sequenceName = cstring_to_text(SHARDID_SEQUENCE_NAME);
Oid sequenceId = ResolveRelationId(sequenceName, false);
Datum sequenceIdDatum = ObjectIdGetDatum(sequenceId);
volatile int64 largestShardId = 0;
/*
* pg_sequence_last_value() returns NULL if the sequence value is not yet used.
* DirectFunctionCall1() raises an ERROR on NULL return values, which is why we
* need a PG_TRY block.
*/
PG_TRY();
{
Datum lastShardIdDatum = DirectFunctionCall1(pg_sequence_last_value,
sequenceIdDatum);
largestShardId = DatumGetInt64(lastShardIdDatum);
}
PG_CATCH();
{
/* assume that we have a shardId with 7 digits */
largestShardId = 1000000;
}
PG_END_TRY();
SetUserIdAndSecContext(savedUserId, savedSecurityContext);
return largestShardId;
}
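These helpers presumably exist so DDL paths can check up front whether a generated shard name would be truncated at PostgreSQL's NAMEDATALEN limit. A hypothetical check built on top of them (the wrapper is illustrative; the helpers are from the diff above):

/* Illustrative length check: would the shard name generated for a new
 * partition hit the NAMEDATALEN limit? Identifiers of NAMEDATALEN or more
 * characters are truncated by PostgreSQL. */
static bool
PartitionShardNameTooLong(Oid parentTableOid, char *partitionRelationName)
{
    char *longestShardName =
        GetLongestShardNameForLocalPartition(parentTableOid,
                                             partitionRelationName);

    return strnlen(longestShardName, NAMEDATALEN) >= NAMEDATALEN;
}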

View File

@ -297,7 +297,7 @@ FindShardIntervalIndex(Datum searchedValue, CitusTableCacheEntry *cacheEntry)
ShardInterval **shardIntervalCache = cacheEntry->sortedShardIntervalArray;
int shardCount = cacheEntry->shardIntervalArrayLength;
FmgrInfo *compareFunction = cacheEntry->shardIntervalCompareFunction;
bool useBinarySearch = (IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED) ||
bool useBinarySearch = (!IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED) ||
!cacheEntry->hasUniformHashDistribution);
int shardIndex = INVALID_SHARD_INDEX;
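The one-character fix inverts the first operand of the predicate: binary search was always meant to be the fallback whenever the O(1) hash-bucket computation is unavailable. Applying De Morgan's law makes the corrected intent easier to read:

/* Equivalent restatement of the corrected predicate: a direct hash-bucket
 * lookup is only valid for hash-distributed tables whose shards cover the
 * hash space uniformly; every other case must binary-search the sorted
 * interval array. */
bool canUseHashBucketLookup =
    IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED) &&
    cacheEntry->hasUniformHashDistribution;
bool useBinarySearch = !canUseHashBucketLookup;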

View File

@ -111,12 +111,12 @@ worker_create_or_replace_object(PG_FUNCTION_ARGS)
RenameStmt *renameStmt = CreateRenameStatement(&address, newName);
const char *sqlRenameStmt = DeparseTreeNode((Node *) renameStmt);
ProcessUtilityParseTree((Node *) renameStmt, sqlRenameStmt,
PROCESS_UTILITY_TOPLEVEL,
PROCESS_UTILITY_QUERY,
NULL, None_Receiver, NULL);
}
/* apply create statement locally */
ProcessUtilityParseTree(parseTree, sqlStatement, PROCESS_UTILITY_TOPLEVEL, NULL,
ProcessUtilityParseTree(parseTree, sqlStatement, PROCESS_UTILITY_QUERY, NULL,
None_Receiver, NULL);
/* type has been created */

View File

@ -28,13 +28,14 @@
#include "commands/extension.h"
#include "commands/sequence.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/commands/multi_copy.h"
#include "distributed/commands/utility_hook.h"
#include "distributed/connection_management.h"
#include "distributed/listutils.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/intermediate_results.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_client_executor.h"
#include "distributed/commands/multi_copy.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/multi_server_executor.h"
@ -45,6 +46,7 @@
#include "distributed/worker_protocol.h"
#include "distributed/version_compat.h"
#include "nodes/makefuncs.h"
#include "parser/parse_relation.h"
#include "storage/lmgr.h"
#include "tcop/tcopprot.h"
#include "tcop/utility.h"
@ -396,7 +398,7 @@ worker_apply_shard_ddl_command(PG_FUNCTION_ARGS)
/* extend names in ddl command and apply extended command */
RelayEventExtendNames(ddlCommandNode, schemaName, shardId);
ProcessUtilityParseTree(ddlCommandNode, ddlCommand, PROCESS_UTILITY_TOPLEVEL, NULL,
ProcessUtilityParseTree(ddlCommandNode, ddlCommand, PROCESS_UTILITY_QUERY, NULL,
None_Receiver, NULL);
PG_RETURN_VOID();
@ -428,7 +430,7 @@ worker_apply_inter_shard_ddl_command(PG_FUNCTION_ARGS)
RelayEventExtendNamesForInterShardCommands(ddlCommandNode, leftShardId,
leftShardSchemaName, rightShardId,
rightShardSchemaName);
ProcessUtilityParseTree(ddlCommandNode, ddlCommand, PROCESS_UTILITY_TOPLEVEL, NULL,
ProcessUtilityParseTree(ddlCommandNode, ddlCommand, PROCESS_UTILITY_QUERY, NULL,
None_Receiver, NULL);
PG_RETURN_VOID();
@ -461,7 +463,7 @@ worker_apply_sequence_command(PG_FUNCTION_ARGS)
}
/* run the CREATE SEQUENCE command */
ProcessUtilityParseTree(commandNode, commandString, PROCESS_UTILITY_TOPLEVEL, NULL,
ProcessUtilityParseTree(commandNode, commandString, PROCESS_UTILITY_QUERY, NULL,
None_Receiver, NULL);
CommandCounterIncrement();
@ -594,9 +596,6 @@ worker_append_table_to_shard(PG_FUNCTION_ARGS)
char *sourceSchemaName = NULL;
char *sourceTableName = NULL;
Oid savedUserId = InvalidOid;
int savedSecurityContext = 0;
CheckCitusVersion(ERROR);
/* We extract schema names and table names from qualified names */
@ -613,10 +612,13 @@ worker_append_table_to_shard(PG_FUNCTION_ARGS)
uint64 shardId = ExtractShardIdFromTableName(shardTableName, false);
LockShardResource(shardId, AccessExclusiveLock);
/* copy remote table's data to this node */
/*
* Copy into intermediate results directory, which is automatically cleaned on
* error.
*/
StringInfo localFilePath = makeStringInfo();
appendStringInfo(localFilePath, "base/%s/%s" UINT64_FORMAT,
PG_JOB_CACHE_DIR, TABLE_FILE_PREFIX, shardId);
appendStringInfo(localFilePath, "%s/worker_append_table_to_shard_" UINT64_FORMAT,
CreateIntermediateResultsDirectory(), shardId);
char *sourceQualifiedName = quote_qualified_identifier(sourceSchemaName,
sourceTableName);
@ -641,7 +643,8 @@ worker_append_table_to_shard(PG_FUNCTION_ARGS)
appendStringInfo(sourceCopyCommand, COPY_OUT_COMMAND, sourceQualifiedName);
}
bool received = ReceiveRegularFile(sourceNodeName, sourceNodePort, NULL,
char *userName = CurrentUserName();
bool received = ReceiveRegularFile(sourceNodeName, sourceNodePort, userName,
sourceCopyCommand,
localFilePath);
if (!received)
@ -664,17 +667,36 @@ worker_append_table_to_shard(PG_FUNCTION_ARGS)
/* make sure we are allowed to execute the COPY command */
CheckCopyPermissions(localCopyCommand);
/* need superuser to copy from files */
GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);
Relation shardRelation = table_openrv(localCopyCommand->relation, RowExclusiveLock);
ProcessUtilityParseTree((Node *) localCopyCommand, queryString->data,
PROCESS_UTILITY_TOPLEVEL, NULL, None_Receiver, NULL);
/* mimic check from copy.c */
if (XactReadOnly && !shardRelation->rd_islocaltemp)
{
PreventCommandIfReadOnly("COPY FROM");
}
SetUserIdAndSecContext(savedUserId, savedSecurityContext);
ParseState *parseState = make_parsestate(NULL);
(void) addRangeTableEntryForRelation(parseState, shardRelation,
#if PG_VERSION_NUM >= PG_VERSION_12
RowExclusiveLock,
#endif
NULL, false, false);
CopyState copyState = BeginCopyFrom(parseState,
shardRelation,
localCopyCommand->filename,
localCopyCommand->is_program,
NULL,
localCopyCommand->attlist,
localCopyCommand->options);
CopyFrom(copyState);
EndCopyFrom(copyState);
free_parsestate(parseState);
/* finally delete the temporary file we created */
CitusDeleteFile(localFilePath->data);
table_close(shardRelation, NoLock);
PG_RETURN_VOID();
}
@ -782,7 +804,7 @@ AlterSequenceMinMax(Oid sequenceId, char *schemaName, char *sequenceName,
/* since the command is an AlterSeqStmt, a dummy command string works fine */
ProcessUtilityParseTree((Node *) alterSequenceStatement, dummyString,
PROCESS_UTILITY_TOPLEVEL, NULL, None_Receiver, NULL);
PROCESS_UTILITY_QUERY, NULL, None_Receiver, NULL);
}
}

View File

@ -37,6 +37,7 @@
#include "executor/spi.h"
#include "nodes/makefuncs.h"
#include "parser/parse_relation.h"
#include "parser/parse_type.h"
#include "storage/lmgr.h"
#include "utils/acl.h"
@ -183,8 +184,6 @@ worker_merge_files_into_table(PG_FUNCTION_ARGS)
StringInfo jobSchemaName = JobSchemaName(jobId);
StringInfo taskTableName = TaskTableName(taskId);
StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
Oid savedUserId = InvalidOid;
int savedSecurityContext = 0;
Oid userId = GetUserId();
/* we should have the same number of column names and types */
@ -233,14 +232,9 @@ worker_merge_files_into_table(PG_FUNCTION_ARGS)
CreateTaskTable(jobSchemaName, taskTableName, columnNameList, columnTypeList);
/* need superuser to copy from files */
GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
SetUserIdAndSecContext(CitusExtensionOwner(), SECURITY_LOCAL_USERID_CHANGE);
CopyTaskFilesFromDirectory(jobSchemaName, taskTableName, taskDirectoryName,
userId);
SetUserIdAndSecContext(savedUserId, savedSecurityContext);
PG_RETURN_VOID();
}
@ -569,8 +563,8 @@ CopyTaskFilesFromDirectory(StringInfo schemaName, StringInfo relationName,
appendStringInfo(fullFilename, "%s/%s", directoryName, baseFilename);
/* build relation object and copy statement */
RangeVar *relation = makeRangeVar(schemaName->data, relationName->data, -1);
CopyStmt *copyStatement = CopyStatement(relation, fullFilename->data);
RangeVar *rangeVar = makeRangeVar(schemaName->data, relationName->data, -1);
CopyStmt *copyStatement = CopyStatement(rangeVar, fullFilename->data);
if (BinaryWorkerCopyFormat)
{
DefElem *copyOption = makeDefElem("format", (Node *) makeString("binary"),
@ -579,12 +573,28 @@ CopyTaskFilesFromDirectory(StringInfo schemaName, StringInfo relationName,
}
{
ParseState *pstate = make_parsestate(NULL);
pstate->p_sourcetext = queryString;
ParseState *parseState = make_parsestate(NULL);
parseState->p_sourcetext = queryString;
DoCopy(pstate, copyStatement, -1, -1, &copiedRowCount);
Relation relation = table_openrv(rangeVar, RowExclusiveLock);
(void) addRangeTableEntryForRelation(parseState, relation,
#if PG_VERSION_NUM >= PG_VERSION_12
RowExclusiveLock,
#endif
NULL, false, false);
free_parsestate(pstate);
CopyState copyState = BeginCopyFrom(parseState,
relation,
copyStatement->filename,
copyStatement->is_program,
NULL,
copyStatement->attlist,
copyStatement->options);
copiedRowCount = CopyFrom(copyState);
EndCopyFrom(copyState);
free_parsestate(parseState);
table_close(relation, NoLock);
}
copiedRowTotal += copiedRowCount;
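This hunk and the worker_append_table_to_shard hunk above make the same substitution: instead of routing COPY through DoCopy()/the utility processor under the extension owner, they call the lower-level copy machinery directly, so the COPY runs with the invoking user's permissions. The distilled pattern, assuming the PG 12/13 signatures used by this release (CopyState was renamed in later PostgreSQL versions):

/* Distilled copy-from pattern shared by both hunks (PG 12/13 signatures;
 * rangeVar and copyStatement are as in the surrounding code). */
Relation relation = table_openrv(rangeVar, RowExclusiveLock);
ParseState *parseState = make_parsestate(NULL);
(void) addRangeTableEntryForRelation(parseState, relation, RowExclusiveLock,
                                     NULL, false, false);

CopyState copyState = BeginCopyFrom(parseState, relation,
                                    copyStatement->filename,
                                    copyStatement->is_program,
                                    NULL, /* no data_source_cb */
                                    copyStatement->attlist,
                                    copyStatement->options);
uint64 processedRowCount = CopyFrom(copyState);
EndCopyFrom(copyState);
free_parsestate(parseState);
table_close(relation, NoLock);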

View File

@ -24,6 +24,14 @@
/* controlled via GUC, should be accessed via EnableLocalReferenceForeignKeys() */
extern bool EnableLocalReferenceForeignKeys;
extern void SwitchToSequentialAndLocalExecutionIfRelationNameTooLong(Oid relationId,
                                                                     char *finalRelationName);
extern void SwitchToSequentialAndLocalExecutionIfPartitionNameTooLong(Oid parentRelationId,
                                                                      Oid partitionRelationId);
/*
* DistributeObjectOps specifies handlers for node/object type pairs.

View File

@ -200,8 +200,12 @@ extern int NodeConnectionTimeout;
/* maximum number of connections to cache per worker per session */
extern int MaxCachedConnectionsPerWorker;
/* maximum lifetime of connections in milliseconds */
extern int MaxCachedConnectionLifetime;
/* parameters used for outbound connections */
extern char *NodeConninfo;
extern char *LocalHostName;
/* the hash tables are externally accessible */
extern HTAB *ConnectionHash;
@ -258,4 +262,5 @@ extern bool IsCitusInitiatedRemoteBackend(void);
extern double MillisecondsPassedSince(instr_time moment);
extern long MillisecondsToTimeout(instr_time start, long msAfterStart);
extern void WarmUpConnParamsHash(void);
#endif /* CONNECTION_MANAGMENT_H */
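The new MaxCachedConnectionLifetime extern above presumably backs a citus GUC that expires cached connections. A hedged sketch of how such a setting is typically registered at library load; the GUC name, default, and bounds here are assumptions, not taken from this diff:

/* Hypothetical registration for the new lifetime setting; the name,
 * default, and bounds are assumptions. A value of -1 could mean
 * "never expire". */
DefineCustomIntVariable(
    "citus.max_cached_connection_lifetime",
    gettext_noop("Sets the maximum lifetime of cached connections."),
    NULL,
    &MaxCachedConnectionLifetime,
    10 * 60 * 1000, /* assumed default: ten minutes, in milliseconds */
    -1, INT_MAX,
    PGC_USERSET,
    GUC_UNIT_MS,
    NULL, NULL, NULL);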

View File

@ -28,7 +28,8 @@ extern void SetTaskQueryString(Task *task, char *queryString);
extern void SetTaskQueryStringList(Task *task, List *queryStringList);
extern char * TaskQueryString(Task *task);
extern char * TaskQueryStringAtIndex(Task *task, int index);
extern bool UpdateRelationsToLocalShardTables(Node *node, List *relationShardList);
extern int GetTaskQueryType(Task *task);
extern void AddInsertAliasIfNeeded(Query *query);
#endif /* DEPARSE_SHARD_QUERY_H */

View File

@ -67,6 +67,9 @@ typedef struct RelationRestriction
/* list of RootPlanParams for all outer nodes */
List *outerPlanParamsList;
/* list of translated vars; copied because postgres deletes its own copy */
List *translatedVars;
} RelationRestriction;
typedef struct JoinRestrictionContext
@ -219,9 +222,9 @@ extern PlannedStmt * distributed_planner(Query *parse,
#define LOCAL_TABLE_SUBQUERY_CTE_HINT \
"Use CTE's or subqueries to select from local tables and use them in joins"
extern List * ExtractRangeTableEntryList(Query *query);
extern bool NeedsDistributedPlanning(Query *query);
extern List * TranslatedVarsForRteIdentity(int rteIdentity);
extern struct DistributedPlan * GetDistributedPlan(CustomScan *node);
extern void multi_relation_restriction_hook(PlannerInfo *root, RelOptInfo *relOptInfo,
Index restrictionIndex, RangeTblEntry *rte);
@ -238,6 +241,7 @@ extern Node * ResolveExternalParams(Node *inputNode, ParamListInfo boundParams);
extern bool IsMultiTaskPlan(struct DistributedPlan *distributedPlan);
extern RangeTblEntry * RemoteScanRangeTableEntry(List *columnNameList);
extern int GetRTEIdentity(RangeTblEntry *rte);
extern bool GetOriginalInh(RangeTblEntry *rte);
extern LOCKMODE GetQueryLockMode(Query *query);
extern int32 BlessRecordExpression(Expr *expr);
extern void DissuadePlannerFromUsingPlan(PlannedStmt *plan);

View File

@ -19,6 +19,9 @@
/* Remaining metadata utility functions */
extern Var * FindColumnWithNameOnTargetRelation(Oid sourceRelationId,
char *sourceColumnName,
Oid targetRelationId);
extern Var * BuildDistributionKeyFromColumnName(Relation distributedRelation,
char *columnName);
extern char * ColumnToColumnName(Oid relationId, char *columnNodeString);

Some files were not shown because too many files have changed in this diff.