Initial commit of Citus 5.0

pull/328/head
Onder Kalaci 2016-02-11 04:05:32 +02:00
commit 136306a1fe
357 changed files with 137231 additions and 0 deletions

22
.gitattributes vendored Normal file

@@ -0,0 +1,22 @@
* whitespace=space-before-tab,trailing-space
*.[chly] whitespace=space-before-tab,trailing-space,indent-with-non-tab,tabwidth=4
*.dsl whitespace=space-before-tab,trailing-space,tab-in-indent
*.patch -whitespace
*.pl whitespace=space-before-tab,trailing-space,tabwidth=4
*.po whitespace=space-before-tab,trailing-space,tab-in-indent,-blank-at-eof
*.sgml whitespace=space-before-tab,trailing-space,tab-in-indent,-blank-at-eol
*.x[ms]l whitespace=space-before-tab,trailing-space,tab-in-indent
# Avoid confusing ASCII underlines with leftover merge conflict markers
README conflict-marker-size=32
README.* conflict-marker-size=32
# Certain data files that contain special whitespace, and other special cases
*.data -whitespace
# Test output files that contain extra whitespace
*.out -whitespace
src/test/regress/output/*.source -whitespace
# These files are maintained or generated elsewhere. We take them as is.
configure -whitespace

38
.gitignore vendored Normal file

@@ -0,0 +1,38 @@
# Global excludes across all subdirectories
*.o
*.so
*.so.[0-9]
*.so.[0-9].[0-9]
*.sl
*.sl.[0-9]
*.sl.[0-9].[0-9]
*.dylib
*.dll
*.a
*.mo
*.pot
objfiles.txt
.deps/
*.gcno
*.gcda
*.gcov
*.gcov.out
lcov.info
coverage/
*.vcproj
*.vcxproj
win32ver.rc
*.exe
lib*dll.def
lib*.pc
# Local excludes in root directory
/config.log
/config.status
/pgsql.sln
/pgsql.sln.cache
/Debug/
/Release/
/autom4te.cache
/Makefile.global
/src/Makefile.custom

15
.travis.yml Normal file

@@ -0,0 +1,15 @@
sudo: required
dist: trusty
language: c
cache: apt
env:
matrix:
- PGVERSION=9.5
- PGVERSION=9.4
before_install:
- git clone --depth 1 https://github.com/citusdata/tools.git
- tools/travis/setup_apt.sh
- tools/travis/nuke_pg.sh
install:
- tools/travis/install_pg.sh
script: tools/travis/pg_travis_multi_test.sh

49
Makefile Normal file

@@ -0,0 +1,49 @@
# CitusDB toplevel Makefile
citusdb_subdir = .
citusdb_top_builddir = .
# Hint that configure should be run first
ifeq (,$(wildcard Makefile.global))
$(error ./configure needs to be run before compiling CitusDB)
endif
include Makefile.global
all: extension csql
# build extension
extension:
$(MAKE) -C src/backend/distributed/ all
install-extension:
$(MAKE) -C src/backend/distributed/ install
install-headers:
$(MKDIR_P) '$(includedir_server)/distributed/'
# generated headers are located in the build directory
$(INSTALL_DATA) src/include/citusdb_config.h '$(includedir_server)/'
# the rest in the source tree
$(INSTALL_DATA) $(citusdb_abs_srcdir)/src/include/distributed/*.h '$(includedir_server)/distributed/'
clean-extension:
$(MAKE) -C src/backend/distributed/ clean
.PHONY: extension install-extension clean-extension
# Add to generic targets
install: install-extension install-headers
clean: clean-extension
# build csql binary
csql:
$(MAKE) -C src/bin/csql/ all
install-csql:
$(MAKE) -C src/bin/csql/ install
clean-csql:
$(MAKE) -C src/bin/csql/ clean
.PHONY: csql install-csql clean-csql
# Add to generic targets
install: install-csql
clean: clean-csql
# depend on install for now
check: all install
$(MAKE) -C src/test/regress check-full
.PHONY: all check install clean

61
Makefile.global.in Normal file

@@ -0,0 +1,61 @@
# -*-makefile-*-
# @configure_input@
# Makefile.global.in - Makefile to be included by all submakes
#
# This file is converted by configure into an actual Makefile,
# replacing the @varname@ placeholders by actual values.
#
# This file is intended to contain infrastructure needed by several
# makefiles, particularly central handling of compilation flags and
# rules.
citusdb_abs_srcdir:=@abs_top_srcdir@/${citusdb_subdir}
citusdb_abs_top_srcdir:=@abs_top_srcdir@
PG_CONFIG:=@PG_CONFIG@
PGXS:=$(shell $(PG_CONFIG) --pgxs)
# Support for VPATH builds (i.e. builds from outside the source tree)
vpath_build=@vpath_build@
ifeq ($(vpath_build),yes)
VPATH:=$(citusdb_abs_srcdir)
USE_VPATH:=$(VPATH)
endif
# CitusDB is built using PostgreSQL's pgxs
USE_PGXS=1
include $(PGXS)
# Remake Makefile.global from Makefile.global.in if the latter
# changed. In order to trigger this rule, the including file must
# write `include $(citusdb_top_builddir)/Makefile.global', not some
# shortcut thereof. This makes it less likely to accidentally run
# with some outdated Makefile.global.
# Make internally restarts whenever included Makefiles are
# regenerated.
$(citusdb_top_builddir)/Makefile.global: $(citusdb_top_builddir)/Makefile.global.in @top_srcdir@/configure $(citusdb_top_builddir)/config.status
cd @abs_top_builddir@ && ./config.status Makefile.global
# Ensure configuration is generated by the most recent configure,
# useful for longer existing build directories.
$(citusdb_top_builddir)/config.status: @top_srcdir@/configure
cd @abs_top_builddir@ && ./config.status --recheck
# Regenerate configure if configure.in changed
@top_srcdir@/configure: $(citusdb_abs_srcdir)/configure.in
cd ${citusdb_abs_srcdir} && ./autogen.sh
# If specified via configure, replace the default compiler. Normally
# we'll build with the one postgres was built with. But it's useful to
# be able to use a different one, especially when building against
# distribution packages.
ifneq (@CC@,)
override CC=@CC@
endif
# Add options passed to configure or computed therein, to CFLAGS/CPPFLAGS/...
override CFLAGS += @CFLAGS@ @CITUS_CFLAGS@
override CPPFLAGS := @CPPFLAGS@ -I '${citusdb_abs_top_srcdir}/src/include' $(CPPFLAGS)
override LDFLAGS += @LDFLAGS@
# optional file with user defined, additional, rules
-include ${citusdb_abs_srcdir}/src/Makefile.custom

7
autogen.sh Executable file

@@ -0,0 +1,7 @@
#!/bin/bash
#
# autogen.sh converts configure.in to configure and creates
# citusdb_config.h.in. The resulting files are checked into
# the SCM, to avoid everyone needing autoconf installed.
autoreconf -f

4170
configure vendored Executable file

File diff suppressed because it is too large

109
configure.in Normal file

@@ -0,0 +1,109 @@
# CitusDB autoconf input script.
#
# Converted into an actual configure script by autogen.sh. This
# conversion only has to be done when configure.in changes. To avoid
# everyone needing autoconf installed, the resulting files are checked
# into the SCM.
AC_INIT([CitusDB], [5.0], [], [citusdb], [])
AC_COPYRIGHT([Copyright (c) 2012-2015, Citus Data, Inc.])
AC_PROG_SED
# Locate pg_config binary
AC_ARG_VAR([PG_CONFIG], [Location to find pg_config for target PostgreSQL installation (default PATH)])
AC_ARG_VAR([PATH], [PATH for target PostgreSQL install pg_config])
if test -z "$PG_CONFIG"; then
AC_PATH_PROG(PG_CONFIG, pg_config)
fi
if test -z "$PG_CONFIG"; then
AC_MSG_ERROR([Could not find pg_config. Set PG_CONFIG or PATH.])
fi
# check we're building against a supported version of PostgreSQL
citusac_pg_config_version=$($PG_CONFIG --version 2>/dev/null)
version_num=$(echo "$citusac_pg_config_version"|
$SED -e 's/^PostgreSQL \([[0-9]]*\)\.\([[0-9]]*\)\([[a-zA-Z0-9.]]*\)$/\1.\2/')
if test -z "$version_num"; then
AC_MSG_ERROR([Could not detect PostgreSQL version from pg_config.])
fi
if test "$version_num" != '9.4' -a "$version_num" != '9.5'; then
AC_MSG_ERROR([CitusDB is not compatible with the detected PostgreSQL version ${version_num}.])
else
AC_MSG_NOTICE([building against PostgreSQL $version_num])
fi;
# Check whether we're building inside the source tree, if not, prepare
# the build directory.
if test "$srcdir" -ef '.' ; then
vpath_build=no
else
vpath_build=yes
_AS_ECHO_N([preparing build tree... ])
citusac_abs_top_srcdir=`cd "$srcdir" && pwd`
$SHELL "$citusac_abs_top_srcdir/prep_buildtree" "$citusac_abs_top_srcdir" "." \
|| AC_MSG_ERROR(failed)
AC_MSG_RESULT(done)
fi
AC_SUBST(vpath_build)
# Allow to overwrite the C compiler, default to the one postgres was
# compiled with
AC_PROG_CC([$($PG_CONFIG --cc)])
# check for a number of CFLAGS that make development easier
# CITUSAC_PROG_CC_CFLAGS_OPT
# -----------------------
# Given a string, check if the compiler supports the string as a
# command-line option. If it does, add the string to CFLAGS.
AC_DEFUN([CITUSAC_PROG_CC_CFLAGS_OPT],
[define([Ac_cachevar], [AS_TR_SH([citusac_cv_prog_cc_cflags_$1])])dnl
AC_CACHE_CHECK([whether $CC supports $1], [Ac_cachevar],
[citusac_save_CFLAGS=$CFLAGS
CFLAGS="$citusac_save_CFLAGS $1"
ac_save_c_werror_flag=$ac_c_werror_flag
ac_c_werror_flag=yes
_AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
[Ac_cachevar=yes],
[Ac_cachevar=no])
ac_c_werror_flag=$ac_save_c_werror_flag
CFLAGS="$citusac_save_CFLAGS"])
if test x"$Ac_cachevar" = x"yes"; then
CITUS_CFLAGS="$CITUS_CFLAGS $1"
fi
undefine([Ac_cachevar])dnl
])# CITUSAC_PROG_CC_CFLAGS_OPT
CITUSAC_PROG_CC_CFLAGS_OPT([-Wall])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wextra])
# disarm options included in the above, which are too noisy for now
CITUSAC_PROG_CC_CFLAGS_OPT([-Wno-unused-parameter])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wno-sign-compare])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wno-missing-field-initializers])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wno-clobbered])
# And add a few extra warnings
CITUSAC_PROG_CC_CFLAGS_OPT([-Wdeclaration-after-statement])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wendif-labels])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wmissing-format-attribute])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wmissing-declarations])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wmissing-prototypes])
AC_SUBST(CITUS_CFLAGS, "$CITUS_CFLAGS")
AC_CONFIG_FILES([Makefile.global])
AC_CONFIG_HEADERS([src/include/citusdb_config.h])
AH_TOP([
/*
* citusdb_config.h.in is generated by autoconf/autoheader and
* converted into citusdb_config.h by configure. Include when code needs to
* depend on determinations made by configure.
*
* Do not manually edit!
*/
])
AC_OUTPUT

47
prep_buildtree Normal file

@@ -0,0 +1,47 @@
#! /bin/sh
#
# CitusDB copy of PostgreSQL's config/prep_buildtree
#
# This script prepares a CitusDB build tree for an out-of-tree/VPATH
# build. It is intended to be run by the configure script.
me=`basename $0`
help="\
Usage: $me sourcetree [buildtree]"
if test -z "$1"; then
echo "$help" 1>&2
exit 1
elif test x"$1" = x"--help"; then
echo "$help"
exit 0
fi
unset CDPATH
sourcetree=`cd $1 && pwd`
buildtree=`cd ${2:-'.'} && pwd`
# We must not auto-create the subdirectories holding built documentation.
# If we did, it would interfere with installation of prebuilt docs from
# the source tree, if a VPATH build is done from a distribution tarball.
# See bug #5595.
for item in `find "$sourcetree" -type d \( \( -name CVS -prune \) -o \( -name .git -prune \) -o -print \) | grep -v "$sourcetree/doc/src/sgml/\+"`; do
subdir=`expr "$item" : "$sourcetree\(.*\)"`
if test ! -d "$buildtree/$subdir"; then
mkdir -p "$buildtree/$subdir" || exit 1
fi
done
for item in `find "$sourcetree" -not -path '*/.git/hg/*' \( -name Makefile -print -o -name GNUmakefile -print \)`; do
filename=`expr "$item" : "$sourcetree\(.*\)"`
if test ! -f "${item}.in"; then
if cmp "$item" "$buildtree/$filename" >/dev/null 2>&1; then : ; else
ln -fs "$item" "$buildtree/$filename" || exit 1
fi
fi
done
exit 0

0
src/backend/.gitignore vendored Normal file

13
src/backend/distributed/.gitignore vendored Normal file

@@ -0,0 +1,13 @@
# ====================
# = Project-Specific =
# ====================
# regression test detritus
/log/
/regression.diffs
/regression.out
/results/
/tmp_check*
# ignore latest install file
citusdb--5.0.sql

33
src/backend/distributed/Makefile Normal file

@@ -0,0 +1,33 @@
# Makefile for the CitusDB extension
citusdb_subdir = src/backend/distributed
citusdb_top_builddir = ../../..
MODULE_big = citusdb
EXTENSION = citusdb
EXTVERSION = 5.0
DATA_built = $(EXTENSION)--$(EXTVERSION).sql
SCRIPTS = $(wildcard $(citusdb_top_builddir)/src/bin/scripts/*)
# directories with source files
SUBDIRS = . commands executor master planner relay test utils worker
# That patsubst rule searches all directories listed in SUBDIRS for .c
# files, and adds the corresponding .o files to OBJS
OBJS += \
$(patsubst $(citusdb_abs_srcdir)/%.c,%.o,$(foreach dir,$(SUBDIRS), $(wildcard $(citusdb_abs_srcdir)/$(dir)/*.c)))
# define build process for latest install file
$(EXTENSION)--$(EXTVERSION).sql: $(EXTENSION).sql
cat $^ > $@
# be explicit about the default target
all:
NO_PGXS = 1
SHLIB_LINK = $(libpq)
include $(citusdb_top_builddir)/Makefile.global
override CPPFLAGS += -I$(libpq_srcdir)

6
src/backend/distributed/citusdb.control Normal file

@@ -0,0 +1,6 @@
# CitusDB extension
comment = 'CitusDB distributed database'
default_version = '5.0'
module_pathname = '$libdir/citusdb'
relocatable = false
schema = pg_catalog

497
src/backend/distributed/citusdb.sql Normal file

@@ -0,0 +1,497 @@
/* citusdb.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION citusdb" to load this file. \quit
CREATE SCHEMA citusdb;
-- Ensure CREATE EXTENSION is not run against an old citusdb data
-- directory; we're not compatible (due to the builtin functions/tables)
DO $$
BEGIN
IF EXISTS(SELECT * FROM pg_proc WHERE proname = 'worker_apply_shard_ddl_command') THEN
RAISE 'cannot install citusdb extension in CitusDB 4 data directory';
END IF;
END;
$$;
/*****************************************************************************
* CitusDB data types
*****************************************************************************/
CREATE TYPE citusdb.distribution_type AS ENUM (
'hash',
'range',
'append'
);
/*****************************************************************************
* CitusDB tables & corresponding indexes
*****************************************************************************/
CREATE TABLE citusdb.pg_dist_partition(
logicalrelid Oid NOT NULL,
partmethod "char" NOT NULL,
partkey text NOT NULL
);
CREATE UNIQUE INDEX pg_dist_partition_logical_relid_index
ON citusdb.pg_dist_partition using btree(logicalrelid);
ALTER TABLE citusdb.pg_dist_partition SET SCHEMA pg_catalog;
CREATE TABLE citusdb.pg_dist_shard(
logicalrelid oid NOT NULL,
shardid int8 NOT NULL,
shardstorage "char" NOT NULL,
shardalias text,
shardminvalue text,
shardmaxvalue text
);
CREATE UNIQUE INDEX pg_dist_shard_shardid_index
ON citusdb.pg_dist_shard using btree(shardid);
CREATE INDEX pg_dist_shard_logical_relid_index
ON citusdb.pg_dist_shard using btree(logicalrelid);
ALTER TABLE citusdb.pg_dist_shard SET SCHEMA pg_catalog;
CREATE TABLE citusdb.pg_dist_shard_placement(
shardid int8 NOT NULL,
shardstate int4 NOT NULL,
shardlength int8 NOT NULL,
nodename text NOT NULL,
nodeport int8 NOT NULL
) WITH oids;
CREATE UNIQUE INDEX pg_dist_shard_placement_oid_index
ON citusdb.pg_dist_shard_placement using btree(oid);
CREATE INDEX pg_dist_shard_placement_shardid_index
ON citusdb.pg_dist_shard_placement using btree(shardid);
CREATE INDEX pg_dist_shard_placement_nodeid_index
ON citusdb.pg_dist_shard_placement using btree(nodename, nodeport);
ALTER TABLE citusdb.pg_dist_shard_placement SET SCHEMA pg_catalog;
/*****************************************************************************
* CitusDB sequences
*****************************************************************************/
/*
* Internal sequence to generate 64-bit shard ids. These identifiers are then
* used to identify shards in the distributed database.
*/
CREATE SEQUENCE citusdb.pg_dist_shardid_seq
MINVALUE 102008
NO CYCLE;
ALTER SEQUENCE citusdb.pg_dist_shardid_seq SET SCHEMA pg_catalog;
/*
* Internal sequence to generate 32-bit jobIds. These identifiers are then
* used to identify jobs in the distributed database; and they wrap at 32 bits
* to allow slave nodes to independently execute their distributed jobs.
*/
CREATE SEQUENCE citusdb.pg_dist_jobid_seq
MINVALUE 2 /* first jobId reserved for clean up jobs */
MAXVALUE 4294967296;
ALTER SEQUENCE citusdb.pg_dist_jobid_seq SET SCHEMA pg_catalog;
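/*
 * Usage sketch (illustrative only, assuming the extension is installed):
 * ids are drawn from these sequences with nextval, for example
 *
 *   SELECT nextval('pg_catalog.pg_dist_shardid_seq'); -- 102008, 102009, ...
 *   SELECT nextval('pg_catalog.pg_dist_jobid_seq');   -- 2, 3, ..., wraps at 2^32
 */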
/*****************************************************************************
* CitusDB functions
*****************************************************************************/
/* For backward compatibility and ease of use create functions et al. in pg_catalog */
SET search_path = 'pg_catalog';
/* master_* functions */
CREATE FUNCTION master_get_table_metadata(relation_name text, OUT logical_relid oid,
OUT part_storage_type "char",
OUT part_method "char", OUT part_key text,
OUT part_replica_count integer,
OUT part_max_size bigint,
OUT part_placement_policy integer)
RETURNS record
LANGUAGE C STABLE STRICT
AS 'MODULE_PATHNAME', $$master_get_table_metadata$$;
COMMENT ON FUNCTION master_get_table_metadata(relation_name text)
IS 'fetch metadata values for the table';
CREATE FUNCTION master_get_table_ddl_events(text)
RETURNS SETOF text
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$master_get_table_ddl_events$$;
COMMENT ON FUNCTION master_get_table_ddl_events(text)
IS 'fetch set of ddl statements for the table';
CREATE FUNCTION master_get_new_shardid()
RETURNS bigint
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_get_new_shardid$$;
COMMENT ON FUNCTION master_get_new_shardid()
IS 'fetch unique shardId';
CREATE FUNCTION master_get_local_first_candidate_nodes(OUT node_name text,
OUT node_port bigint)
RETURNS SETOF record
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$master_get_local_first_candidate_nodes$$;
COMMENT ON FUNCTION master_get_local_first_candidate_nodes()
IS 'fetch set of candidate nodes for shard uploading choosing the local node first';
CREATE FUNCTION master_create_empty_shard(text)
RETURNS bigint
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_create_empty_shard$$;
COMMENT ON FUNCTION master_create_empty_shard(text)
IS 'create an empty shard and shard placements for the table';
CREATE FUNCTION master_append_table_to_shard(bigint, text, text, integer)
RETURNS real
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_append_table_to_shard$$;
COMMENT ON FUNCTION master_append_table_to_shard(bigint, text, text, integer)
IS 'append given table to all shard placements and update metadata';
CREATE FUNCTION master_apply_delete_command(text)
RETURNS integer
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_apply_delete_command$$;
COMMENT ON FUNCTION master_apply_delete_command(text)
IS 'drop shards matching delete criteria and update metadata';
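/*
 * Usage sketch (illustrative only; the 'github_events' table and its
 * 'created_at' column are hypothetical): shards whose entire range matches
 * the delete criteria are dropped and their metadata removed, for example
 *
 *   SELECT master_apply_delete_command(
 *       'DELETE FROM github_events WHERE created_at < ''2015-01-01''');
 */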
CREATE FUNCTION master_get_active_worker_nodes(OUT node_name text, OUT node_port bigint)
RETURNS SETOF record
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$master_get_active_worker_nodes$$;
COMMENT ON FUNCTION master_get_active_worker_nodes()
IS 'fetch set of active worker nodes';
CREATE FUNCTION master_get_round_robin_candidate_nodes(shard_id bigint,
OUT node_name text,
OUT node_port bigint)
RETURNS SETOF record
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$master_get_round_robin_candidate_nodes$$;
COMMENT ON FUNCTION master_get_round_robin_candidate_nodes(shard_id bigint)
IS 'fetch set of candidate nodes for shard uploading in round-robin manner';
CREATE FUNCTION master_create_distributed_table(table_name regclass,
distribution_column text,
distribution_method citusdb.distribution_type)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_create_distributed_table$$;
COMMENT ON FUNCTION master_create_distributed_table(table_name regclass,
distribution_column text,
distribution_method citusdb.distribution_type)
IS 'define the table distribution functions';
-- define shard creation function for hash-partitioned tables
CREATE FUNCTION master_create_worker_shards(table_name text, shard_count integer,
replication_factor integer DEFAULT 2)
RETURNS void
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
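/*
 * Usage sketch (illustrative only; 'github_events' and its 'repo_id' column
 * are hypothetical): distributing a table is a two-step process -- first
 * register the distribution metadata, then create the shards for the
 * hash-partitioned table.
 *
 *   SELECT master_create_distributed_table('github_events', 'repo_id', 'hash');
 *   SELECT master_create_worker_shards('github_events', 16, 2);
 */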
/* task_tracker_* functions */
CREATE FUNCTION task_tracker_assign_task(bigint, integer, text)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$task_tracker_assign_task$$;
COMMENT ON FUNCTION task_tracker_assign_task(bigint, integer, text)
IS 'assign a task to execute';
CREATE FUNCTION task_tracker_task_status(bigint, integer)
RETURNS integer
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$task_tracker_task_status$$;
COMMENT ON FUNCTION task_tracker_task_status(bigint, integer)
IS 'check an assigned task''s execution status';
CREATE FUNCTION task_tracker_cleanup_job(bigint)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$task_tracker_cleanup_job$$;
COMMENT ON FUNCTION task_tracker_cleanup_job(bigint)
IS 'clean up all tasks associated with a job';
/* worker_* functions */
CREATE FUNCTION worker_fetch_partition_file(bigint, integer, integer, integer, text,
integer)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_fetch_partition_file$$;
COMMENT ON FUNCTION worker_fetch_partition_file(bigint, integer, integer, integer, text,
integer)
IS 'fetch partition file from remote node';
CREATE FUNCTION worker_fetch_query_results_file(bigint, integer, integer, text, integer)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_fetch_query_results_file$$;
COMMENT ON FUNCTION worker_fetch_query_results_file(bigint, integer, integer, text,
integer)
IS 'fetch query results file from remote node';
CREATE FUNCTION worker_fetch_foreign_file(text, bigint, text[], integer[])
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_fetch_foreign_file$$;
COMMENT ON FUNCTION worker_fetch_foreign_file(text, bigint, text[], integer[])
IS 'fetch foreign file from remote node and apply file';
CREATE FUNCTION worker_fetch_regular_table(text, bigint, text[], integer[])
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_fetch_regular_table$$;
COMMENT ON FUNCTION worker_fetch_regular_table(text, bigint, text[], integer[])
IS 'fetch PostgreSQL table from remote node';
CREATE FUNCTION worker_range_partition_table(bigint, integer, text, text, oid, anyarray)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_range_partition_table$$;
COMMENT ON FUNCTION worker_range_partition_table(bigint, integer, text, text, oid,
anyarray)
IS 'range partition query results';
CREATE FUNCTION worker_hash_partition_table(bigint, integer, text, text, oid, integer)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_hash_partition_table$$;
COMMENT ON FUNCTION worker_hash_partition_table(bigint, integer, text, text, oid,
integer)
IS 'hash partition query results';
CREATE FUNCTION worker_merge_files_into_table(bigint, integer, text[], text[])
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_merge_files_into_table$$;
COMMENT ON FUNCTION worker_merge_files_into_table(bigint, integer, text[], text[])
IS 'merge files into a table';
CREATE FUNCTION worker_merge_files_and_run_query(bigint, integer, text, text)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_merge_files_and_run_query$$;
COMMENT ON FUNCTION worker_merge_files_and_run_query(bigint, integer, text, text)
IS 'merge files and run a reduce query on merged files';
CREATE FUNCTION worker_cleanup_job_schema_cache()
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_cleanup_job_schema_cache$$;
COMMENT ON FUNCTION worker_cleanup_job_schema_cache()
IS 'cleanup all job schemas in current database';
CREATE FUNCTION worker_foreign_file_path(text)
RETURNS text
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_foreign_file_path$$;
COMMENT ON FUNCTION worker_foreign_file_path(text)
IS 'get a foreign table''s local file path';
CREATE FUNCTION worker_find_block_local_path(bigint, text[])
RETURNS text
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_find_block_local_path$$;
COMMENT ON FUNCTION worker_find_block_local_path(bigint, text[])
IS 'find an HDFS block''s local file path';
CREATE FUNCTION worker_apply_shard_ddl_command(bigint, text)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_apply_shard_ddl_command$$;
COMMENT ON FUNCTION worker_apply_shard_ddl_command(bigint, text)
IS 'extend ddl command with shardId and apply on database';
CREATE FUNCTION worker_append_table_to_shard(text, text, text, integer)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_append_table_to_shard$$;
COMMENT ON FUNCTION worker_append_table_to_shard(text, text, text, integer)
IS 'append a regular table''s contents to the shard';
/* trigger functions */
CREATE OR REPLACE FUNCTION citusdb_drop_trigger()
RETURNS event_trigger
LANGUAGE plpgsql
SET search_path = pg_catalog
AS $cdbdt$
DECLARE v_obj record;
BEGIN
FOR v_obj IN SELECT * FROM pg_event_trigger_dropped_objects() LOOP
IF v_obj.object_type <> 'table' THEN
CONTINUE;
END IF;
-- nothing to do if not a distributed table
IF NOT EXISTS(SELECT * FROM pg_dist_partition WHERE logicalrelid = v_obj.objid) THEN
CONTINUE;
END IF;
-- check if there are shards for the table, and error out if so
IF EXISTS(SELECT * FROM pg_dist_shard WHERE logicalrelid = v_obj.objid) THEN
RAISE EXCEPTION USING
MESSAGE = 'cannot drop distributed table with existing shards',
HINT = $$Delete shards first using: $$ ||
$$SELECT master_apply_delete_command('DELETE FROM $$ ||
v_obj.object_identity || $$')$$;
END IF;
-- delete partition entry
DELETE FROM pg_dist_partition WHERE logicalrelid = v_obj.objid;
IF NOT FOUND THEN
RAISE EXCEPTION 'could not find previously found pg_dist_partition entry';
END IF;
END LOOP;
END;
$cdbdt$;
COMMENT ON FUNCTION citusdb_drop_trigger()
IS 'perform checks and actions at the end of DROP actions';
CREATE FUNCTION master_dist_partition_cache_invalidate()
RETURNS trigger
LANGUAGE C
AS 'MODULE_PATHNAME', $$master_dist_partition_cache_invalidate$$;
COMMENT ON FUNCTION master_dist_partition_cache_invalidate()
IS 'register relcache invalidation for changed rows';
CREATE FUNCTION master_dist_shard_cache_invalidate()
RETURNS trigger
LANGUAGE C
AS 'MODULE_PATHNAME', $$master_dist_shard_cache_invalidate$$;
COMMENT ON FUNCTION master_dist_shard_cache_invalidate()
IS 'register relcache invalidation for changed rows';
/* internal functions, not user accessible */
CREATE FUNCTION citusdb_extradata_container(INTERNAL)
RETURNS void
LANGUAGE C
AS 'MODULE_PATHNAME', $$citusdb_extradata_container$$;
COMMENT ON FUNCTION pg_catalog.citusdb_extradata_container(INTERNAL)
IS 'placeholder function to store additional data in postgres node trees';
/*****************************************************************************
* CitusDB triggers
*****************************************************************************/
CREATE EVENT TRIGGER citusdb_cascade_to_partition
ON SQL_DROP
EXECUTE PROCEDURE citusdb_drop_trigger();
CREATE TRIGGER dist_partition_cache_invalidate
AFTER INSERT OR UPDATE OR DELETE
ON pg_catalog.pg_dist_partition
FOR EACH ROW EXECUTE PROCEDURE master_dist_partition_cache_invalidate();
CREATE TRIGGER dist_shard_cache_invalidate
AFTER INSERT OR UPDATE OR DELETE
ON pg_catalog.pg_dist_shard
FOR EACH ROW EXECUTE PROCEDURE master_dist_shard_cache_invalidate();
/*****************************************************************************
* CitusDB aggregates
*****************************************************************************/
CREATE AGGREGATE array_cat_agg(anyarray) (SFUNC = array_cat, STYPE = anyarray);
COMMENT ON AGGREGATE array_cat_agg(anyarray)
IS 'concatenate input arrays into a single array';
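/*
 * For illustration (hypothetical data): unlike array_agg, array_cat_agg
 * flattens its array inputs into one array, for example
 *
 *   SELECT array_cat_agg(a) FROM (VALUES (ARRAY[1,2]), (ARRAY[3])) AS t(a);
 *   -- returns {1,2,3}
 */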
/*
* Creates a temporary table exactly like the specified target table along with
* a trigger to redirect any INSERTed rows from the proxy to the underlying
* table. Users may optionally provide a sequence which will be incremented
* after each row that has been successfully proxied (useful for counting rows
* processed). Returns the name of the proxy table that was created.
*/
CREATE FUNCTION create_insert_proxy_for_table(target_table regclass,
sequence regclass DEFAULT NULL)
RETURNS text
AS $create_insert_proxy_for_table$
DECLARE
temp_table_name text;
attr_names text[];
attr_list text;
param_list text;
using_list text;
insert_command text;
-- templates to create dynamic functions, tables, and triggers
func_tmpl CONSTANT text := $$CREATE FUNCTION pg_temp.copy_to_insert()
RETURNS trigger
AS $copy_to_insert$
BEGIN
EXECUTE %L USING %s;
PERFORM nextval(%L);
RETURN NULL;
END;
$copy_to_insert$ LANGUAGE plpgsql;$$;
table_tmpl CONSTANT text := $$CREATE TEMPORARY TABLE %I
(LIKE %s INCLUDING DEFAULTS)$$;
trigger_tmpl CONSTANT text := $$CREATE TRIGGER copy_to_insert
BEFORE INSERT ON %s FOR EACH ROW
EXECUTE PROCEDURE pg_temp.copy_to_insert()$$;
BEGIN
-- create name of temporary table using unqualified input table name
SELECT format('%s_insert_proxy', relname)
INTO STRICT temp_table_name
FROM pg_class
WHERE oid = target_table;
-- get list of all attributes in table, we'll need shortly
SELECT array_agg(attname)
INTO STRICT attr_names
FROM pg_attribute
WHERE attrelid = target_table AND
attnum > 0 AND
NOT attisdropped;
-- build fully specified column list and USING clause from attr. names
SELECT string_agg(quote_ident(attr_name), ','),
string_agg(format('NEW.%I', attr_name), ',')
INTO STRICT attr_list,
using_list
FROM unnest(attr_names) AS attr_name;
-- build ($1, $2, $3)-style VALUE list to bind parameters
SELECT string_agg('$' || param_num, ',')
INTO STRICT param_list
FROM generate_series(1, array_length(attr_names, 1)) AS param_num;
-- use the above lists to generate appropriate INSERT command
insert_command = format('INSERT INTO %s (%s) VALUES (%s)', target_table,
attr_list, param_list);
-- use the command to make one-off trigger targeting specified table
EXECUTE format(func_tmpl, insert_command, using_list, sequence);
-- create a temporary table exactly like the target table...
EXECUTE format(table_tmpl, temp_table_name, target_table);
-- ... and install the trigger on that temporary table
EXECUTE format(trigger_tmpl, quote_ident(temp_table_name)::regclass);
RETURN temp_table_name;
END;
$create_insert_proxy_for_table$ LANGUAGE plpgsql SET search_path = 'pg_catalog';
COMMENT ON FUNCTION create_insert_proxy_for_table(regclass, regclass)
IS 'create a proxy table that redirects INSERTed rows to a target table';
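/*
 * Usage sketch (illustrative only; 'github_events' is hypothetical): create
 * the proxy, then direct INSERT or COPY traffic at the returned temporary
 * table; its trigger forwards each row to the target table.
 *
 *   SELECT create_insert_proxy_for_table('github_events');
 *   -- returns the proxy's name, here 'github_events_insert_proxy'
 */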
-- define shard repair function
CREATE FUNCTION master_copy_shard_placement(shard_id bigint,
source_node_name text,
source_node_port integer,
target_node_name text,
target_node_port integer)
RETURNS void
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
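/*
 * Usage sketch (illustrative only; the shard id and node names are
 * hypothetical): repair a stale placement of shard 102008 by copying it
 * from a node with a healthy placement.
 *
 *   SELECT master_copy_shard_placement(102008, 'good-node', 5432,
 *                                      'bad-node', 5432);
 */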
RESET search_path;

377
create_distributed_relation.c Normal file

@@ -0,0 +1,377 @@
/*-------------------------------------------------------------------------
*
* create_distributed_relation.c
* Routines related to the creation of distributed relations.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/hash.h"
#include "access/heapam.h"
#include "access/htup.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "catalog/dependency.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/pg_am.h"
#include "catalog/pg_enum.h"
#include "catalog/pg_extension.h"
#include "catalog/pg_opclass.h"
#include "commands/defrem.h"
#include "commands/extension.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/pg_dist_partition.h"
#include "nodes/execnodes.h"
#include "nodes/nodeFuncs.h"
#include "nodes/pg_list.h"
#include "parser/parse_expr.h"
#include "parser/parse_node.h"
#include "parser/parse_relation.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
#include "utils/inval.h"
/* local function forward declarations */
static char LookupDistributionMethod(Oid distributionMethodOid);
static void RecordDistributedRelationDependencies(Oid distributedRelationId,
Node *distributionKey);
static Oid SupportFunctionForColumn(Var *partitionColumn, Oid accessMethodId,
int16 supportFunctionNumber);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_create_distributed_table);
/*
* master_create_distributed_table accepts a table, distribution column and
* method and performs the corresponding catalog changes.
*
* XXX: We should perform more checks here to see if this table is fit for
* partitioning. At a minimum, we should validate the following: (i) this node
* runs as the master node, (ii) table does not make use of the inheritance
* mechanism, (iii) table does not own columns that are sequences, (iv)
* table does not have collated columns, and (v) table does not have
* preexisting content.
*/
Datum
master_create_distributed_table(PG_FUNCTION_ARGS)
{
Oid distributedRelationId = PG_GETARG_OID(0);
text *distributionColumnText = PG_GETARG_TEXT_P(1);
Oid distributionMethodOid = PG_GETARG_OID(2);
Relation distributedRelation = NULL;
char *distributedRelationName = NULL;
char relationKind = '\0';
Relation pgDistPartition = NULL;
char distributionMethod = LookupDistributionMethod(distributionMethodOid);
char *distributionColumnName = text_to_cstring(distributionColumnText);
Node *distributionKey = NULL;
Var *distributionColumn = NULL;
char *distributionKeyString = NULL;
List *indexOidList = NIL;
ListCell *indexOidCell = NULL;
HeapTuple newTuple = NULL;
Datum newValues[Natts_pg_dist_partition];
bool newNulls[Natts_pg_dist_partition];
/*
* Lock target relation with an access exclusive lock - there's no way to
* make sense of this table until we've committed, and we don't want
* multiple backends manipulating this relation.
*/
distributedRelation = relation_open(distributedRelationId, AccessExclusiveLock);
distributedRelationName = RelationGetRelationName(distributedRelation);
/* open system catalog and insert new tuple */
pgDistPartition = heap_open(DistPartitionRelationId(), RowExclusiveLock);
/* check that the relation is not already distributed */
if (IsDistributedTable(distributedRelationId))
{
ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
errmsg("table \"%s\" is already distributed",
distributedRelationName)));
}
/* verify target relation is either regular or foreign table */
relationKind = distributedRelation->rd_rel->relkind;
if (relationKind != RELKIND_RELATION && relationKind != RELKIND_FOREIGN_TABLE)
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot distribute relation: %s",
distributedRelationName),
errdetail("Distributed relations must be regular or "
"foreign tables.")));
}
distributionKey = BuildDistributionKeyFromColumnName(distributedRelation,
distributionColumnName);
distributionKeyString = nodeToString(distributionKey);
/* the distribution key should always be a Var for now */
Assert(IsA(distributionKey, Var));
distributionColumn = (Var *) distributionKey;
/* check for support function needed by specified partition method */
if (distributionMethod == DISTRIBUTE_BY_HASH)
{
Oid hashSupportFunction = SupportFunctionForColumn(distributionColumn,
HASH_AM_OID, HASHPROC);
if (hashSupportFunction == InvalidOid)
{
ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION),
errmsg("could not identify a hash function for type %s",
format_type_be(distributionColumn->vartype)),
errdatatype(distributionColumn->vartype),
errdetail("Partition column types must have a hash function "
"defined to use hash partitioning.")));
}
}
else if (distributionMethod == DISTRIBUTE_BY_RANGE)
{
Oid btreeSupportFunction = SupportFunctionForColumn(distributionColumn,
BTREE_AM_OID, BTORDER_PROC);
if (btreeSupportFunction == InvalidOid)
{
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_FUNCTION),
errmsg("could not identify a comparison function for type %s",
format_type_be(distributionColumn->vartype)),
errdatatype(distributionColumn->vartype),
errdetail("Partition column types must have a comparison function "
"defined to use range partitioning.")));
}
}
/*
* Do not allow UNIQUE constraint and/or PRIMARY KEY on append partitioned tables,
* since currently there is no way of enforcing uniqueness for overlapping shards.
*
* Similarly, do not allow UNIQUE constraint and/or PRIMARY KEY if it does not
* include partition column. This check is important for two reasons. First,
* currently CitusDB does not enforce uniqueness constraint on multiple shards.
* Second, INSERT INTO .. ON CONFLICT (i.e., UPSERT) queries can be executed with no
* further check for constraints.
*/
indexOidList = RelationGetIndexList(distributedRelation);
foreach(indexOidCell, indexOidList)
{
Oid indexOid = lfirst_oid(indexOidCell);
Relation indexDesc = index_open(indexOid, RowExclusiveLock);
IndexInfo *indexInfo = NULL;
AttrNumber *attributeNumberArray = NULL;
bool hasDistributionColumn = false;
int attributeCount = 0;
int attributeIndex = 0;
/* extract index key information from the index's pg_index info */
indexInfo = BuildIndexInfo(indexDesc);
/* only check unique indexes */
if (indexInfo->ii_Unique == false)
{
index_close(indexDesc, NoLock);
continue;
}
/*
* CitusDB cannot enforce uniqueness constraints with overlapping shards. Thus,
* emit a warning for unique indexes on append partitioned tables.
*/
if (distributionMethod == DISTRIBUTE_BY_APPEND)
{
ereport(WARNING, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("table \"%s\" has a unique constraint",
distributedRelationName),
errdetail("Unique constraints and primary keys on "
"append-partitioned tables cannot be enforced."),
errhint("Consider using hash partitioning.")));
}
attributeCount = indexInfo->ii_NumIndexAttrs;
attributeNumberArray = indexInfo->ii_KeyAttrNumbers;
for (attributeIndex = 0; attributeIndex < attributeCount; attributeIndex++)
{
AttrNumber attributeNumber = attributeNumberArray[attributeIndex];
if (distributionColumn->varattno == attributeNumber)
{
hasDistributionColumn = true;
break;
}
}
if (!hasDistributionColumn)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot distribute relation: \"%s\"",
distributedRelationName),
errdetail("Distributed relations cannot have "
"UNIQUE constraints or PRIMARY KEYs that do not "
"include the partition column.")));
}
index_close(indexDesc, NoLock);
}
/* form new tuple for pg_dist_partition */
memset(newValues, 0, sizeof(newValues));
memset(newNulls, false, sizeof(newNulls));
newValues[Anum_pg_dist_partition_logicalrelid - 1] =
ObjectIdGetDatum(distributedRelationId);
newValues[Anum_pg_dist_partition_partmethod - 1] =
CharGetDatum(distributionMethod);
newValues[Anum_pg_dist_partition_partkey - 1] =
CStringGetTextDatum(distributionKeyString);
newTuple = heap_form_tuple(RelationGetDescr(pgDistPartition), newValues, newNulls);
/* finally insert tuple, build index entries & register cache invalidation */
simple_heap_insert(pgDistPartition, newTuple);
CatalogUpdateIndexes(pgDistPartition, newTuple);
CacheInvalidateRelcacheByRelid(distributedRelationId);
RecordDistributedRelationDependencies(distributedRelationId, distributionKey);
heap_close(pgDistPartition, NoLock);
relation_close(distributedRelation, NoLock);
PG_RETURN_VOID();
}
/*
* RecordDistributedRelationDependencies creates the dependency entries
* necessary for a distributed relation in addition to the preexisting ones
* for a normal relation.
*
* We create one dependency from the (now distributed) relation to the citusdb
* extension to prevent the extension from being dropped while distributed
* tables exist. Furthermore a dependency from pg_dist_partition's
* distribution clause to the underlying columns is created, but it's marked
* as being owned by the relation itself. That means the entire table can be
* dropped, but the column itself can't. Neither can the type of the
* distribution column be changed (c.f. ATExecAlterColumnType).
*/
static void
RecordDistributedRelationDependencies(Oid distributedRelationId, Node *distributionKey)
{
ObjectAddress relationAddr = { 0, 0, 0 };
ObjectAddress citusExtensionAddr = { 0, 0, 0 };
relationAddr.classId = RelationRelationId;
relationAddr.objectId = distributedRelationId;
relationAddr.objectSubId = 0;
citusExtensionAddr.classId = ExtensionRelationId;
citusExtensionAddr.objectId = get_extension_oid("citusdb", false);
citusExtensionAddr.objectSubId = 0;
/* dependency from table entry to extension */
recordDependencyOn(&relationAddr, &citusExtensionAddr, DEPENDENCY_NORMAL);
/* make sure the distribution key column/expression does not just go away */
recordDependencyOnSingleRelExpr(&relationAddr, distributionKey, distributedRelationId,
DEPENDENCY_NORMAL, DEPENDENCY_NORMAL);
}
/*
* LookupDistributionMethod maps the oids of citusdb.distribution_type enum
* values to pg_dist_partition.partmethod values.
*
* The passed in oid has to belong to a value of citusdb.distribution_type.
*/
static char
LookupDistributionMethod(Oid distributionMethodOid)
{
HeapTuple enumTuple = NULL;
Form_pg_enum enumForm = NULL;
char distributionMethod = 0;
const char *enumLabel = NULL;
enumTuple = SearchSysCache1(ENUMOID, ObjectIdGetDatum(distributionMethodOid));
if (!HeapTupleIsValid(enumTuple))
{
ereport(ERROR, (errmsg("invalid internal value for enum: %u",
distributionMethodOid)));
}
enumForm = (Form_pg_enum) GETSTRUCT(enumTuple);
enumLabel = NameStr(enumForm->enumlabel);
if (strncmp(enumLabel, "append", NAMEDATALEN) == 0)
{
distributionMethod = DISTRIBUTE_BY_APPEND;
}
else if (strncmp(enumLabel, "hash", NAMEDATALEN) == 0)
{
distributionMethod = DISTRIBUTE_BY_HASH;
}
else if (strncmp(enumLabel, "range", NAMEDATALEN) == 0)
{
distributionMethod = DISTRIBUTE_BY_RANGE;
}
else
{
ereport(ERROR, (errmsg("invalid label for enum: %s", enumLabel)));
}
ReleaseSysCache(enumTuple);
return distributionMethod;
}
/*
* SupportFunctionForColumn locates a support function given a column, an access method,
* and an id of a support function. This function returns InvalidOid if there is no
* support function for the operator class family of the column, but if the data type
* of the column has no default operator class whatsoever, this function errors out.
*/
static Oid
SupportFunctionForColumn(Var *partitionColumn, Oid accessMethodId,
int16 supportFunctionNumber)
{
Oid operatorFamilyId = InvalidOid;
Oid supportFunctionOid = InvalidOid;
Oid operatorClassInputType = InvalidOid;
Oid columnOid = partitionColumn->vartype;
Oid operatorClassId = GetDefaultOpClass(columnOid, accessMethodId);
/* currently only support using the default operator class */
if (operatorClassId == InvalidOid)
{
ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("data type %s has no default operator class for specified"
" partition method", format_type_be(columnOid)),
errdatatype(columnOid),
errdetail("Partition column types must have a default operator"
" class defined.")));
}
operatorFamilyId = get_opclass_family(operatorClassId);
operatorClassInputType = get_opclass_input_type(operatorClassId);
supportFunctionOid = get_opfamily_proc(operatorFamilyId, operatorClassInputType,
operatorClassInputType,
supportFunctionNumber);
return supportFunctionOid;
}

300
transmit.c Normal file

@@ -0,0 +1,300 @@
/*-------------------------------------------------------------------------
*
* transmit.c
* Routines for transmitting regular files between two nodes.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include "distributed/relay_utility.h"
#include "distributed/transmit.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "storage/fd.h"
/* Local functions forward declarations */
static File FileOpenForTransmit(const char *filename, int fileFlags, int fileMode);
static void SendCopyInStart(void);
static void SendCopyOutStart(void);
static void SendCopyDone(void);
static void SendCopyData(StringInfo fileBuffer);
static bool ReceiveCopyData(StringInfo copyData);
/*
* ReceiveRegularFile receives data from stdin using the standard copy
* protocol. The function then creates or truncates a file with the given
* filename, and appends received data to this file.
*/
void
ReceiveRegularFile(const char *filename)
{
StringInfo copyData = makeStringInfo();
bool copyDone = false;
File fileDesc = -1;
const int fileFlags = (O_APPEND | O_CREAT | O_RDWR | O_TRUNC | PG_BINARY);
const int fileMode = (S_IRUSR | S_IWUSR);
fileDesc = FileOpenForTransmit(filename, fileFlags, fileMode);
SendCopyInStart();
copyDone = ReceiveCopyData(copyData);
while (!copyDone)
{
/* if received data has contents, append to regular file */
if (copyData->len > 0)
{
int appended = FileWrite(fileDesc, copyData->data, copyData->len);
if (appended != copyData->len)
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not append to received file: %m")));
}
}
resetStringInfo(copyData);
copyDone = ReceiveCopyData(copyData);
}
FreeStringInfo(copyData);
FileClose(fileDesc);
}
/*
* SendRegularFile reads data from the given file, and sends these data to
* stdout using the standard copy protocol. After all file data are sent, the
* function ends the copy protocol and closes the file.
*/
void
SendRegularFile(const char *filename)
{
File fileDesc = -1;
StringInfo fileBuffer = NULL;
int readBytes = -1;
const uint32 fileBufferSize = 32768; /* 32 KB */
const int fileFlags = (O_RDONLY | PG_BINARY);
const int fileMode = 0;
/* we currently do not check if the caller has permissions for this file */
fileDesc = FileOpenForTransmit(filename, fileFlags, fileMode);
/*
* We read the file's contents into buffers of 32 KB. This buffer size is twice
* as large as Hadoop's default buffer size, and may later be configurable.
*/
fileBuffer = makeStringInfo();
enlargeStringInfo(fileBuffer, fileBufferSize);
SendCopyOutStart();
readBytes = FileRead(fileDesc, fileBuffer->data, fileBufferSize);
while (readBytes > 0)
{
fileBuffer->len = readBytes;
SendCopyData(fileBuffer);
resetStringInfo(fileBuffer);
readBytes = FileRead(fileDesc, fileBuffer->data, fileBufferSize);
}
SendCopyDone();
FreeStringInfo(fileBuffer);
FileClose(fileDesc);
}
/* Helper function that deallocates string info object. */
void
FreeStringInfo(StringInfo stringInfo)
{
resetStringInfo(stringInfo);
pfree(stringInfo->data);
pfree(stringInfo);
}
/*
* FileOpenForTransmit opens file with the given filename and flags. On success,
* the function returns the internal file handle for the opened file. On failure
* the function errors out.
*/
static File
FileOpenForTransmit(const char *filename, int fileFlags, int fileMode)
{
File fileDesc = -1;
int fileStated = -1;
struct stat fileStat;
fileStated = stat(filename, &fileStat);
if (fileStated >= 0)
{
if (S_ISDIR(fileStat.st_mode))
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("\"%s\" is a directory", filename)));
}
}
fileDesc = PathNameOpenFile((char *) filename, fileFlags, fileMode);
if (fileDesc < 0)
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", filename)));
}
return fileDesc;
}
/*
* SendCopyInStart sends the start copy in message to initiate receiving data
* from stdin. The frontend should now send copy data.
*/
static void
SendCopyInStart(void)
{
StringInfoData copyInStart = { NULL, 0, 0, 0 };
const char copyFormat = 1; /* binary copy format */
int flushed = 0;
pq_beginmessage(&copyInStart, 'G');
pq_sendbyte(&copyInStart, copyFormat);
pq_sendint(&copyInStart, 0, 2);
pq_endmessage(&copyInStart);
/* flush here to ensure that FE knows it can send data */
flushed = pq_flush();
if (flushed != 0)
{
ereport(WARNING, (errmsg("could not flush copy start data")));
}
}
/*
* SendCopyOutStart sends the start copy out message to initiate sending data to
* stdout. After this message, the backend will continue by sending copy data.
*/
static void
SendCopyOutStart(void)
{
StringInfoData copyOutStart = { NULL, 0, 0, 0 };
const char copyFormat = 1; /* binary copy format */
pq_beginmessage(&copyOutStart, 'H');
pq_sendbyte(&copyOutStart, copyFormat);
pq_sendint(&copyOutStart, 0, 2);
pq_endmessage(&copyOutStart);
}
/* Sends the copy-complete message. */
static void
SendCopyDone(void)
{
StringInfoData copyDone = { NULL, 0, 0, 0 };
int flushed = 0;
pq_beginmessage(&copyDone, 'c');
pq_endmessage(&copyDone);
/* flush here to signal to FE that we are done */
flushed = pq_flush();
if (flushed != 0)
{
ereport(WARNING, (errmsg("could not flush copy start data")));
}
}
/* Sends the copy data message to stdout. */
static void
SendCopyData(StringInfo fileBuffer)
{
StringInfoData copyData = { NULL, 0, 0, 0 };
pq_beginmessage(&copyData, 'd');
pq_sendbytes(&copyData, fileBuffer->data, fileBuffer->len);
pq_endmessage(&copyData);
}
/*
* ReceiveCopyData receives one copy data message from stdin, and writes this
* message's contents into the given argument. The function then checks if the
* copy protocol has been completed, and if it has, the function returns true.
* If not, the function returns false indicating there are more data to read.
* If the received message does not conform to the copy protocol, the function
* mirrors copy.c's error behavior.
*/
static bool
ReceiveCopyData(StringInfo copyData)
{
int messageType = 0;
int messageCopied = 0;
bool copyDone = true;
const int unlimitedSize = 0;
HOLD_CANCEL_INTERRUPTS();
pq_startmsgread();
messageType = pq_getbyte();
if (messageType == EOF)
{
ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("unexpected EOF on client connection")));
}
/* consume the rest of message before checking for message type */
messageCopied = pq_getmessage(copyData, unlimitedSize);
if (messageCopied == EOF)
{
ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("unexpected EOF on client connection")));
}
RESUME_CANCEL_INTERRUPTS();
switch (messageType)
{
case 'd': /* CopyData */
copyDone = false;
break;
case 'c': /* CopyDone */
copyDone = true;
break;
case 'f': /* CopyFail */
ereport(ERROR, (errcode(ERRCODE_QUERY_CANCELED),
errmsg("COPY data failed: %s", pq_getmsgstring(copyData))));
break;
case 'H': /* Flush */
case 'S': /* Sync */
/*
* Ignore Flush/Sync for the convenience of client libraries (such
* as libpq) that may send those without noticing that the command
* they just sent was COPY.
*/
copyDone = false;
break;
default:
ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION),
errmsg("unexpected message type 0x%02X during COPY data",
messageType)));
break;
}
return copyDone;
}

861
multi_client_executor.c Normal file

@@ -0,0 +1,861 @@
/*-------------------------------------------------------------------------
*
* multi_client_executor.c
*
* This file contains the libpq-specific parts of executing queries on remote
* nodes.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "libpq-fe.h"
#include "distributed/multi_client_executor.h"
#include <errno.h>
#include <unistd.h>
#ifdef HAVE_POLL_H
#include <poll.h>
#endif
#ifdef HAVE_SYS_POLL_H
#include <sys/poll.h>
#endif
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
/* Local pool to track active connections */
static PGconn *ClientConnectionArray[MAX_CONNECTION_COUNT];
/*
* The value at any position on ClientPollingStatusArray is only defined when
* the corresponding ClientConnectionArray entry exists.
*/
static PostgresPollingStatusType ClientPollingStatusArray[MAX_CONNECTION_COUNT];
/* Local functions forward declarations */
static void ClearRemainingResults(PGconn *connection);
static bool ClientConnectionReady(PGconn *connection,
PostgresPollingStatusType pollingStatus);
static void ReportRemoteError(PGconn *connection, PGresult *result);
static void ReportConnectionError(PGconn *connection);
static char * ConnectionGetOptionValue(PGconn *connection, char *optionKeyword);
/* AllocateConnectionId returns a connection id from the connection pool. */
static int32
AllocateConnectionId(void)
{
int32 connectionId = INVALID_CONNECTION_ID;
int32 connIndex = 0;
/* allocate connectionId from connection pool */
for (connIndex = 0; connIndex < MAX_CONNECTION_COUNT; connIndex++)
{
PGconn *connection = ClientConnectionArray[connIndex];
if (connection == NULL)
{
connectionId = connIndex;
break;
}
}
return connectionId;
}
/*
* MultiClientConnect synchronously tries to establish a connection. If it
* succeeds, it returns the connection id. Otherwise, it reports connection
* error and returns INVALID_CONNECTION_ID.
*/
int32
MultiClientConnect(const char *nodeName, uint32 nodePort, const char *nodeDatabase)
{
PGconn *connection = NULL;
char connInfoString[STRING_BUFFER_SIZE];
ConnStatusType connStatusType = CONNECTION_OK;
int32 connectionId = AllocateConnectionId();
if (connectionId == INVALID_CONNECTION_ID)
{
ereport(WARNING, (errmsg("could not allocate connection in connection pool")));
return connectionId;
}
/* transcribe connection parameters to string */
snprintf(connInfoString, STRING_BUFFER_SIZE, CONN_INFO_TEMPLATE,
nodeName, nodePort, nodeDatabase, CLIENT_CONNECT_TIMEOUT);
/* establish synchronous connection to worker node */
connection = PQconnectdb(connInfoString);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_OK)
{
ClientConnectionArray[connectionId] = connection;
}
else
{
ReportConnectionError(connection);
PQfinish(connection);
connectionId = INVALID_CONNECTION_ID;
}
return connectionId;
}
/*
* MultiClientConnectStart asynchronously tries to establish a connection. If it
* succeeds, it returns the connection id. Otherwise, it reports connection
* error and returns INVALID_CONNECTION_ID.
*/
int32
MultiClientConnectStart(const char *nodeName, uint32 nodePort, const char *nodeDatabase)
{
PGconn *connection = NULL;
char connInfoString[STRING_BUFFER_SIZE];
ConnStatusType connStatusType = CONNECTION_BAD;
int32 connectionId = AllocateConnectionId();
if (connectionId == INVALID_CONNECTION_ID)
{
ereport(WARNING, (errmsg("could not allocate connection in connection pool")));
return connectionId;
}
/* transcribe connection parameters to string */
snprintf(connInfoString, STRING_BUFFER_SIZE, CONN_INFO_TEMPLATE,
nodeName, nodePort, nodeDatabase, CLIENT_CONNECT_TIMEOUT);
/* prepare asynchronous request for worker node connection */
connection = PQconnectStart(connInfoString);
connStatusType = PQstatus(connection);
/*
* If prepared, we save the connection, and set its initial polling status
* to PGRES_POLLING_WRITING as specified in "Database Connection Control
* Functions" section of the PostgreSQL documentation.
*/
if (connStatusType != CONNECTION_BAD)
{
ClientConnectionArray[connectionId] = connection;
ClientPollingStatusArray[connectionId] = PGRES_POLLING_WRITING;
}
else
{
ReportConnectionError(connection);
PQfinish(connection);
connectionId = INVALID_CONNECTION_ID;
}
return connectionId;
}
/* MultiClientConnectPoll returns the status of client connection. */
ConnectStatus
MultiClientConnectPoll(int32 connectionId)
{
PGconn *connection = NULL;
PostgresPollingStatusType pollingStatus = PGRES_POLLING_OK;
ConnectStatus connectStatus = CLIENT_INVALID_CONNECT;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
pollingStatus = ClientPollingStatusArray[connectionId];
if (pollingStatus == PGRES_POLLING_OK)
{
connectStatus = CLIENT_CONNECTION_READY;
}
else if (pollingStatus == PGRES_POLLING_READING)
{
bool readReady = ClientConnectionReady(connection, PGRES_POLLING_READING);
if (readReady)
{
ClientPollingStatusArray[connectionId] = PQconnectPoll(connection);
}
connectStatus = CLIENT_CONNECTION_BUSY;
}
else if (pollingStatus == PGRES_POLLING_WRITING)
{
bool writeReady = ClientConnectionReady(connection, PGRES_POLLING_WRITING);
if (writeReady)
{
ClientPollingStatusArray[connectionId] = PQconnectPoll(connection);
}
connectStatus = CLIENT_CONNECTION_BUSY;
}
else if (pollingStatus == PGRES_POLLING_FAILED)
{
ReportConnectionError(connection);
connectStatus = CLIENT_CONNECTION_BAD;
}
return connectStatus;
}
/* MultiClientDisconnect disconnects the connection. */
void
MultiClientDisconnect(int32 connectionId)
{
PGconn *connection = NULL;
const int InvalidPollingStatus = -1;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
PQfinish(connection);
ClientConnectionArray[connectionId] = NULL;
ClientPollingStatusArray[connectionId] = InvalidPollingStatus;
}
/*
* MultiClientConnectionUp checks if the connection is still up; in other
* words, that its status is not CONNECTION_BAD.
*/
bool
MultiClientConnectionUp(int32 connectionId)
{
PGconn *connection = NULL;
ConnStatusType connStatusType = CONNECTION_OK;
bool connectionUp = true;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
connectionUp = false;
}
return connectionUp;
}
/* MultiClientSendQuery sends the given query over the given connection. */
bool
MultiClientSendQuery(int32 connectionId, const char *query)
{
PGconn *connection = NULL;
bool success = true;
int querySent = 0;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
querySent = PQsendQuery(connection, query);
if (querySent == 0)
{
char *errorMessage = PQerrorMessage(connection);
ereport(WARNING, (errmsg("could not send remote query \"%s\"", query),
errdetail("Client error: %s", errorMessage)));
success = false;
}
return success;
}
/* MultiClientCancel cancels the running query on the given connection. */
bool
MultiClientCancel(int32 connectionId)
{
PGconn *connection = NULL;
PGcancel *cancelObject = NULL;
int cancelSent = 0;
bool canceled = true;
char errorBuffer[STRING_BUFFER_SIZE];
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
cancelObject = PQgetCancel(connection);
cancelSent = PQcancel(cancelObject, errorBuffer, sizeof(errorBuffer));
if (cancelSent == 0)
{
ereport(WARNING, (errmsg("could not issue cancel request"),
errdetail("Client error: %s", errorBuffer)));
canceled = false;
}
PQfreeCancel(cancelObject);
return canceled;
}
/* MultiClientResultStatus checks result status for an asynchronous query. */
ResultStatus
MultiClientResultStatus(int32 connectionId)
{
PGconn *connection = NULL;
int consumed = 0;
ConnStatusType connStatusType = CONNECTION_OK;
ResultStatus resultStatus = CLIENT_INVALID_RESULT_STATUS;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
ereport(WARNING, (errmsg("could not maintain connection to worker node")));
return CLIENT_RESULT_UNAVAILABLE;
}
/* consume input to allow status change */
consumed = PQconsumeInput(connection);
if (consumed != 0)
{
int connectionBusy = PQisBusy(connection);
if (connectionBusy == 0)
{
resultStatus = CLIENT_RESULT_READY;
}
else
{
resultStatus = CLIENT_RESULT_BUSY;
}
}
else
{
ereport(WARNING, (errmsg("could not consume data from worker node")));
resultStatus = CLIENT_RESULT_UNAVAILABLE;
}
return resultStatus;
}
/* MultiClientQueryResult gets results for an asynchronous query. */
bool
MultiClientQueryResult(int32 connectionId, void **queryResult, int *rowCount,
int *columnCount)
{
PGconn *connection = NULL;
PGresult *result = NULL;
ConnStatusType connStatusType = CONNECTION_OK;
ExecStatusType resultStatus = PGRES_COMMAND_OK;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
ereport(WARNING, (errmsg("could not maintain connection to worker node")));
return false;
}
result = PQgetResult(connection);
resultStatus = PQresultStatus(result);
if (resultStatus == PGRES_TUPLES_OK)
{
(*queryResult) = (void *) result;
(*rowCount) = PQntuples(result);
(*columnCount) = PQnfields(result);
}
else
{
ReportRemoteError(connection, result);
PQclear(result);
}
/* clear extra result objects */
ClearRemainingResults(connection);
return true;
}
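/*
 * A minimal sketch of a complete asynchronous round trip built from the
 * functions above; connectionId is assumed to come from an earlier successful
 * MultiClientConnectStart() / MultiClientConnectPoll() sequence, and the
 * query string is a placeholder:
 *
 *   void *queryResult = NULL;
 *   int rowCount = 0;
 *   int columnCount = 0;
 *
 *   if (MultiClientSendQuery(connectionId, "SELECT 1"))
 *   {
 *       while (MultiClientResultStatus(connectionId) == CLIENT_RESULT_BUSY)
 *       {
 *           pg_usleep(1000L);
 *       }
 *       if (MultiClientQueryResult(connectionId, &queryResult, &rowCount,
 *                                  &columnCount))
 *       {
 *           MultiClientClearResult(queryResult);
 *       }
 *   }
 *
 * Individual field values can be read with MultiClientGetValue() before the
 * result is cleared.
 */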
/*
* MultiClientBatchResult returns results for a "batch" of queries, meaning a
* string containing multiple select statements separated by semicolons. This
* function should be called multiple times to retrieve the results for all the
* queries, until CLIENT_BATCH_QUERY_DONE is returned (even if a failure occurs).
* If a query in the batch fails, the remaining queries will not be executed. On
* success, queryResult, rowCount and columnCount will be set to the appropriate
* values. After use, queryResult should be cleared using ClientClearResult.
*/
BatchQueryStatus
MultiClientBatchResult(int32 connectionId, void **queryResult, int *rowCount,
int *columnCount)
{
PGconn *connection = NULL;
PGresult *result = NULL;
ConnStatusType connStatusType = CONNECTION_OK;
ExecStatusType resultStatus = PGRES_COMMAND_OK;
BatchQueryStatus queryStatus = CLIENT_INVALID_BATCH_QUERY;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
/* set default result */
(*queryResult) = NULL;
(*rowCount) = -1;
(*columnCount) = -1;
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
ereport(WARNING, (errmsg("could not maintain connection to worker node")));
return CLIENT_BATCH_QUERY_FAILED;
}
result = PQgetResult(connection);
if (result == NULL)
{
return CLIENT_BATCH_QUERY_DONE;
}
resultStatus = PQresultStatus(result);
if (resultStatus == PGRES_TUPLES_OK)
{
(*queryResult) = (void *) result;
(*rowCount) = PQntuples(result);
(*columnCount) = PQnfields(result);
queryStatus = CLIENT_BATCH_QUERY_CONTINUE;
}
else if (resultStatus == PGRES_COMMAND_OK)
{
(*queryResult) = (void *) result;
queryStatus = CLIENT_BATCH_QUERY_CONTINUE;
}
else
{
ReportRemoteError(connection, result);
PQclear(result);
queryStatus = CLIENT_BATCH_QUERY_FAILED;
}
return queryStatus;
}
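/*
 * Per the contract described above, a caller keeps invoking the function
 * until CLIENT_BATCH_QUERY_DONE is returned. A minimal sketch, assuming
 * connectionId already carries a sent multi-statement query string:
 *
 *   BatchQueryStatus batchStatus = CLIENT_INVALID_BATCH_QUERY;
 *
 *   do
 *   {
 *       void *queryResult = NULL;
 *       int rowCount = 0;
 *       int columnCount = 0;
 *
 *       batchStatus = MultiClientBatchResult(connectionId, &queryResult,
 *                                            &rowCount, &columnCount);
 *       if (queryResult != NULL)
 *       {
 *           MultiClientClearResult(queryResult);
 *       }
 *   } while (batchStatus != CLIENT_BATCH_QUERY_DONE);
 */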
/* MultiClientGetValue returns the value of the field at the given position. */
char *
MultiClientGetValue(void *queryResult, int rowIndex, int columnIndex)
{
char *value = PQgetvalue((PGresult *) queryResult, rowIndex, columnIndex);
return value;
}
/* MultiClientClearResult frees the memory associated with a PGresult. */
void
MultiClientClearResult(void *queryResult)
{
PQclear((PGresult *) queryResult);
}
/* MultiClientQueryStatus returns the query status. */
QueryStatus
MultiClientQueryStatus(int32 connectionId)
{
PGconn *connection = NULL;
PGresult *result = NULL;
int tupleCount = 0;
bool copyResults = false;
ConnStatusType connStatusType = CONNECTION_OK;
ExecStatusType resultStatus = PGRES_COMMAND_OK;
QueryStatus queryStatus = CLIENT_INVALID_QUERY;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
ereport(WARNING, (errmsg("could not maintain connection to worker node")));
return CLIENT_QUERY_FAILED;
}
/*
* We now read the result object and check its status. If the result object
* isn't ready yet (the caller didn't wait for the connection to be ready),
* we will block on this call.
*/
result = PQgetResult(connection);
resultStatus = PQresultStatus(result);
if (resultStatus == PGRES_COMMAND_OK)
{
queryStatus = CLIENT_QUERY_DONE;
}
else if (resultStatus == PGRES_TUPLES_OK)
{
queryStatus = CLIENT_QUERY_DONE;
/*
* We use the client executor to only issue a select query that returns
* a void value. We therefore should not have more than one value here.
*/
tupleCount = PQntuples(result);
Assert(tupleCount <= 1);
}
else if (resultStatus == PGRES_COPY_OUT)
{
queryStatus = CLIENT_QUERY_COPY;
copyResults = true;
}
else
{
queryStatus = CLIENT_QUERY_FAILED;
if (resultStatus == PGRES_COPY_IN)
{
copyResults = true;
}
ReportRemoteError(connection, result);
}
/* clear the result object */
PQclear(result);
/*
* When using the async query mechanism, we need to keep reading results
* until we get null. The exception to this rule is the copy protocol.
*/
if (!copyResults)
{
ClearRemainingResults(connection);
}
return queryStatus;
}
/* MultiClientCopyData reads copy data from the connection and appends it to the given file. */
CopyStatus
MultiClientCopyData(int32 connectionId, int32 fileDescriptor)
{
PGconn *connection = NULL;
char *receiveBuffer = NULL;
int consumed = 0;
int receiveLength = 0;
const int asynchronous = 1;
CopyStatus copyStatus = CLIENT_INVALID_COPY;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
/*
 * Consume input to handle the case where the previous copy operation might have
* received zero bytes.
*/
consumed = PQconsumeInput(connection);
if (consumed == 0)
{
ereport(WARNING, (errmsg("could not read data from worker node")));
return CLIENT_COPY_FAILED;
}
/* receive copy data message in an asynchronous manner */
receiveLength = PQgetCopyData(connection, &receiveBuffer, asynchronous);
while (receiveLength > 0)
{
/* received copy data; append these data to file */
int appended = -1;
errno = 0;
appended = write(fileDescriptor, receiveBuffer, receiveLength);
if (appended != receiveLength)
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
{
errno = ENOSPC;
}
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not append to copied file: %m")));
}
PQfreemem(receiveBuffer);
receiveLength = PQgetCopyData(connection, &receiveBuffer, asynchronous);
}
/* we now check the last received length returned by copy data */
if (receiveLength == 0)
{
/* we cannot read more data without blocking */
copyStatus = CLIENT_COPY_MORE;
}
else if (receiveLength == -1)
{
/* received copy done message */
PGresult *result = PQgetResult(connection);
ExecStatusType resultStatus = PQresultStatus(result);
if (resultStatus == PGRES_COMMAND_OK)
{
copyStatus = CLIENT_COPY_DONE;
}
else
{
copyStatus = CLIENT_COPY_FAILED;
ReportRemoteError(connection, result);
}
PQclear(result);
}
else if (receiveLength == -2)
{
/* received an error */
copyStatus = CLIENT_COPY_FAILED;
ReportConnectionError(connection);
}
/* if copy out completed, make sure we drain all results from libpq */
if (receiveLength < 0)
{
ClearRemainingResults(connection);
}
return copyStatus;
}
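/*
 * A minimal sketch of how the function above is typically driven: the caller
 * keeps copying while more data may arrive, then checks the final status
 * (fileDescriptor is assumed to be an already opened, writable file):
 *
 *   CopyStatus copyStatus = CLIENT_COPY_MORE;
 *
 *   while (copyStatus == CLIENT_COPY_MORE)
 *   {
 *       copyStatus = MultiClientCopyData(connectionId, fileDescriptor);
 *   }
 *
 * On loop exit, copyStatus is either CLIENT_COPY_DONE or CLIENT_COPY_FAILED.
 * Rather than looping tightly like this, the real-time executor interleaves
 * these calls with its polling loop so other tasks can make progress.
 */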
/*
* ClearRemainingResults reads result objects from the connection until we get
* null, and clears these results. This is the last step in completing an async
* query.
*/
static void
ClearRemainingResults(PGconn *connection)
{
PGresult *result = PQgetResult(connection);
while (result != NULL)
{
PQclear(result);
result = PQgetResult(connection);
}
}
/*
* ClientConnectionReady checks if the given connection is ready for non-blocking
* reads or writes. This function is loosely based on pqSocketCheck() at fe-misc.c
* and libpq_select() at libpqwalreceiver.c.
*/
static bool
ClientConnectionReady(PGconn *connection, PostgresPollingStatusType pollingStatus)
{
bool clientConnectionReady = false;
int pollResult = 0;
/* we use poll(2) if available, otherwise select(2) */
#ifdef HAVE_POLL
int fileDescriptorCount = 1;
int immediateTimeout = 0;
int pollEventMask = 0;
struct pollfd pollFileDescriptor;
if (pollingStatus == PGRES_POLLING_READING)
{
pollEventMask = POLLERR | POLLIN;
}
else if (pollingStatus == PGRES_POLLING_WRITING)
{
pollEventMask = POLLERR | POLLOUT;
}
pollFileDescriptor.fd = PQsocket(connection);
pollFileDescriptor.events = pollEventMask;
pollFileDescriptor.revents = 0;
pollResult = poll(&pollFileDescriptor, fileDescriptorCount, immediateTimeout);
#else /* !HAVE_POLL */
fd_set readFileDescriptorSet;
fd_set writeFileDescriptorSet;
fd_set exceptionFileDescriptorSet;
struct timeval immediateTimeout = {0, 0};
int connectionFileDescriptor = PQsocket(connection);
FD_ZERO(&readFileDescriptorSet);
FD_ZERO(&writeFileDescriptorSet);
FD_ZERO(&exceptionFileDescriptorSet);
if (pollingStatus == PGRES_POLLING_READING)
{
FD_SET(connectionFileDescriptor, &exceptionFileDescriptorSet);
FD_SET(connectionFileDescriptor, &readFileDescriptorSet);
}
else if (pollingStatus == PGRES_POLLING_WRITING)
{
FD_SET(connectionFileDescriptor, &exceptionFileDescriptorSet);
FD_SET(connectionFileDescriptor, &writeFileDescriptorSet);
}
pollResult = select(connectionFileDescriptor + 1, &readFileDescriptorSet,
&writeFileDescriptorSet, &exceptionFileDescriptorSet,
&immediateTimeout);
#endif /* HAVE_POLL */
if (pollResult > 0)
{
clientConnectionReady = true;
}
else if (pollResult == 0)
{
clientConnectionReady = false;
}
else if (pollResult < 0)
{
if (errno == EINTR)
{
/*
* If a signal was caught, we return false so the caller polls the
* connection again.
*/
clientConnectionReady = false;
}
else
{
/*
* poll() or select() can set errno to EFAULT (when socket is not
* contained in the calling program's address space), EBADF (invalid
* file descriptor), EINVAL (invalid arguments to select or poll),
* and ENOMEM (no space to allocate file descriptor tables). Out of
* these, only ENOMEM is likely here, and it is a fatal error, so we
* error out.
*/
Assert(errno == ENOMEM);
ereport(ERROR, (errcode_for_socket_access(),
errmsg("select()/poll() failed: %m")));
}
}
return clientConnectionReady;
}
/*
 * ReportRemoteError retrieves various error fields from a remote result and
* produces an error report at the WARNING level.
*/
static void
ReportRemoteError(PGconn *connection, PGresult *result)
{
char *sqlStateString = PQresultErrorField(result, PG_DIAG_SQLSTATE);
char *remoteMessage = PQresultErrorField(result, PG_DIAG_MESSAGE_PRIMARY);
char *nodeName = ConnectionGetOptionValue(connection, "host");
char *nodePort = ConnectionGetOptionValue(connection, "port");
char *errorPrefix = "could not connect to node";
int sqlState = ERRCODE_CONNECTION_FAILURE;
if (sqlStateString != NULL)
{
sqlState = MAKE_SQLSTATE(sqlStateString[0], sqlStateString[1], sqlStateString[2],
sqlStateString[3], sqlStateString[4]);
/* use more specific error prefix for result failures */
if (sqlState != ERRCODE_CONNECTION_FAILURE)
{
errorPrefix = "could not receive query results from";
}
}
/*
* If the PGresult did not contain a message, the connection may provide a
* suitable top level one. At worst, this is an empty string.
*/
if (remoteMessage == NULL)
{
char *lastNewlineIndex = NULL;
remoteMessage = PQerrorMessage(connection);
lastNewlineIndex = strrchr(remoteMessage, '\n');
/* trim trailing newline, if any */
if (lastNewlineIndex != NULL)
{
*lastNewlineIndex = '\0';
}
}
ereport(WARNING, (errcode(sqlState),
errmsg("%s %s:%s", errorPrefix, nodeName, nodePort),
errdetail("Client error: %s", remoteMessage)));
}
/*
* ReportConnectionError raises a WARNING and reports that we could not
* establish the given connection.
*/
static void
ReportConnectionError(PGconn *connection)
{
char *nodeName = ConnectionGetOptionValue(connection, "host");
char *nodePort = ConnectionGetOptionValue(connection, "port");
char *errorMessage = PQerrorMessage(connection);
ereport(WARNING, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("could not connect to node %s:%s", nodeName, nodePort),
errdetail("Client error: %s", errorMessage)));
}
/*
* ConnectionGetOptionValue inspects the provided connection for an option with
 * a given keyword and returns a new palloc'd string with that option's value.
* The function returns NULL if the connection has no setting for an option with
* the provided keyword.
*/
static char *
ConnectionGetOptionValue(PGconn *connection, char *optionKeyword)
{
char *optionValue = NULL;
PQconninfoOption *option = NULL;
PQconninfoOption *conninfoOptions = PQconninfo(connection);
for (option = conninfoOptions; option->keyword != NULL; option++)
{
if (strncmp(option->keyword, optionKeyword, NAMEDATALEN) == 0)
{
optionValue = pstrdup(option->val);
}
}
PQconninfoFree(conninfoOptions);
return optionValue;
}

@ -0,0 +1,278 @@
/*-------------------------------------------------------------------------
*
* multi_executor.c
*
* Entrypoint into distributed query execution.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "access/xact.h"
#include "catalog/dependency.h"
#include "catalog/namespace.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_master_planner.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/multi_resowner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/multi_utility.h"
#include "distributed/worker_protocol.h"
#include "executor/execdebug.h"
#include "storage/lmgr.h"
#include "tcop/utility.h"
#include "utils/snapmgr.h"
/*
 * multi_ExecutorStart is a hook called at the beginning of any execution
* of any query plan.
*
* If a distributed relation is the target of the query, perform some validity
* checks. If a legal statement, start the distributed execution. After that
* the to-be-executed query is replaced with the portion executing solely on
* the master.
*/
void
multi_ExecutorStart(QueryDesc *queryDesc, int eflags)
{
PlannedStmt *planStatement = queryDesc->plannedstmt;
if (HasCitusToplevelNode(planStatement))
{
MultiPlan *multiPlan = GetMultiPlan(planStatement);
MultiExecutorType executorType = MULTI_EXECUTOR_INVALID_FIRST;
Job *workerJob = multiPlan->workerJob;
executorType = JobExecutorType(multiPlan);
if (executorType == MULTI_EXECUTOR_ROUTER)
{
Task *task = NULL;
List *taskList = workerJob->taskList;
List *dependedJobList PG_USED_FOR_ASSERTS_ONLY = workerJob->dependedJobList;
List *workerTargetList = multiPlan->workerJob->jobQuery->targetList;
TupleDesc tupleDescriptor = ExecCleanTypeFromTL(workerTargetList, false);
/* router executor can only execute distributed plans with a single task */
Assert(list_length(taskList) == 1);
Assert(dependedJobList == NIL);
task = (Task *) linitial(taskList);
/* we need to set tupleDesc in executorStart */
queryDesc->tupDesc = tupleDescriptor;
/* drop into the router executor */
RouterExecutorStart(queryDesc, eflags, task);
}
else
{
PlannedStmt *masterSelectPlan = MasterNodeSelectPlan(multiPlan);
CreateStmt *masterCreateStmt = MasterNodeCreateStatement(multiPlan);
List *masterCopyStmtList = MasterNodeCopyStatementList(multiPlan);
ListCell *masterCopyStmtCell = NULL;
RangeTblEntry *masterRangeTableEntry = NULL;
StringInfo jobDirectoryName = NULL;
/*
* We create a directory on the master node to keep task execution results.
* We also register this directory for automatic cleanup on portal delete.
*/
jobDirectoryName = JobDirectoryName(workerJob->jobId);
CreateDirectory(jobDirectoryName);
ResourceOwnerEnlargeJobDirectories(CurrentResourceOwner);
ResourceOwnerRememberJobDirectory(CurrentResourceOwner, workerJob->jobId);
/* pick distributed executor to use */
if (executorType == MULTI_EXECUTOR_REAL_TIME)
{
MultiRealTimeExecute(workerJob);
}
else if (executorType == MULTI_EXECUTOR_TASK_TRACKER)
{
MultiTaskTrackerExecute(workerJob);
}
/* then create the result relation */
ProcessUtility((Node *) masterCreateStmt,
"(temp table creation)",
PROCESS_UTILITY_QUERY,
NULL,
None_Receiver,
NULL);
/* make the temporary table visible */
CommandCounterIncrement();
/* now copy data from all the remote nodes into temp table */
foreach(masterCopyStmtCell, masterCopyStmtList)
{
Node *masterCopyStmt = (Node *) lfirst(masterCopyStmtCell);
Assert(IsA(masterCopyStmt, CopyStmt));
ProcessUtility(masterCopyStmt,
"(copy job)",
PROCESS_UTILITY_QUERY,
NULL,
None_Receiver,
NULL);
}
/* make the copied contents visible */
CommandCounterIncrement();
/*
* Update the QueryDesc's snapshot so it sees the table. That's not
* particularly pretty, but we don't have much of a choice. One might
* think we could unregister the snapshot, push a new active one,
* update it, register it, and be happy. That only works if it's only
* registered once though...
*/
queryDesc->snapshot->curcid = GetCurrentCommandId(false);
/*
* Set the OID of the RTE used in the master select statement to point
* to the now created (and filled) temporary table. The target
* relation's oid is only known now.
*/
masterRangeTableEntry =
(RangeTblEntry *) linitial(masterSelectPlan->rtable);
masterRangeTableEntry->relid =
RelnameGetRelid(masterRangeTableEntry->eref->aliasname);
/*
* Replace to-be-run query with the master select query. As the
* planned statement is now replaced we can't call GetMultiPlan() in
* the later hooks, so we set a flag marking this as a distributed
* statement running on the master. That e.g. allows us to drop the
* temp table later.
*/
queryDesc->plannedstmt = masterSelectPlan;
eflags |= EXEC_FLAG_CITUS_MASTER_SELECT;
}
}
/* if the execution is not done for router executor, drop into standard executor */
if (queryDesc->estate == NULL ||
!(queryDesc->estate->es_top_eflags & EXEC_FLAG_CITUS_ROUTER_EXECUTOR))
{
standard_ExecutorStart(queryDesc, eflags);
}
}
/* Execute query plan. */
void
multi_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
{
int eflags = queryDesc->estate->es_top_eflags;
if (eflags & EXEC_FLAG_CITUS_ROUTER_EXECUTOR)
{
Task *task = NULL;
PlannedStmt *planStatement = queryDesc->plannedstmt;
MultiPlan *multiPlan = GetMultiPlan(planStatement);
List *taskList = multiPlan->workerJob->taskList;
/* router executor can only execute distributed plans with a single task */
Assert(list_length(taskList) == 1);
task = (Task *) linitial(taskList);
/* drop into the router executor */
RouterExecutorRun(queryDesc, direction, count, task);
}
else
{
/* drop into the standard executor */
standard_ExecutorRun(queryDesc, direction, count);
}
}
/* Perform actions, such as firing triggers, after the query has run. */
void
multi_ExecutorFinish(QueryDesc *queryDesc)
{
int eflags = queryDesc->estate->es_top_eflags;
if (eflags & EXEC_FLAG_CITUS_ROUTER_EXECUTOR)
{
/* drop into the router executor */
RouterExecutorFinish(queryDesc);
}
else
{
/* drop into the standard executor */
standard_ExecutorFinish(queryDesc);
}
}
/*
* multi_ExecutorEnd is a hook called to deallocate resources used during
* query execution.
*
* If the query executed was the portion of a distributed query running on the
* master, remove the resources that were needed for distributed execution.
*/
void
multi_ExecutorEnd(QueryDesc *queryDesc)
{
int eflags = queryDesc->estate->es_top_eflags;
if (eflags & EXEC_FLAG_CITUS_ROUTER_EXECUTOR)
{
/* drop into the router executor */
RouterExecutorEnd(queryDesc);
}
else
{
/* drop into the standard executor */
standard_ExecutorEnd(queryDesc);
}
/*
 * The final step of a distributed query is executing the master node select
 * query. We clean up the temp table after executing it, if we created one.
*/
if (eflags & EXEC_FLAG_CITUS_MASTER_SELECT)
{
PlannedStmt *planStatement = queryDesc->plannedstmt;
int savedLogMinMessages = 0;
int savedClientMinMessages = 0;
RangeTblEntry *rangeTableEntry = linitial(planStatement->rtable);
Oid masterTableRelid = rangeTableEntry->relid;
ObjectAddress masterTableObject = {InvalidOid, InvalidOid, 0};
masterTableObject.classId = RelationRelationId;
masterTableObject.objectId = masterTableRelid;
masterTableObject.objectSubId = 0;
/*
* Temporarily change logging level to avoid DEBUG2 logging output by
* performDeletion. This avoids breaking the regression tests which
* use DEBUG2 logging.
*/
savedLogMinMessages = log_min_messages;
savedClientMinMessages = client_min_messages;
log_min_messages = INFO;
client_min_messages = INFO;
performDeletion(&masterTableObject, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
log_min_messages = savedLogMinMessages;
client_min_messages = savedClientMinMessages;
}
}
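/*
 * The four functions above follow the standard PostgreSQL executor hook
 * signatures. A minimal sketch of how such hooks would be installed at
 * library load time, assuming no previously installed hooks need to be
 * chained:
 *
 *   void
 *   _PG_init(void)
 *   {
 *       ExecutorStart_hook = multi_ExecutorStart;
 *       ExecutorRun_hook = multi_ExecutorRun;
 *       ExecutorFinish_hook = multi_ExecutorFinish;
 *       ExecutorEnd_hook = multi_ExecutorEnd;
 *   }
 */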

@ -0,0 +1,862 @@
/*-------------------------------------------------------------------------
*
* multi_real_time_executor.c
*
* Routines for executing remote tasks as part of a distributed execution plan
* in real-time. These routines open up a separate connection for each task they
* need to execute, and therefore return their results faster. However, they can
* only handle as many tasks as the number of file descriptors (connections)
* available. They also can't handle execution primitives that need to write
* their results to intermediate files.
*
* Copyright (c) 2013, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include <unistd.h>
#include "commands/dbcommands.h"
#include "distributed/multi_client_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/worker_protocol.h"
#include "storage/fd.h"
/* Local functions forward declarations */
static ConnectAction ManageTaskExecution(Task *task, TaskExecution *taskExecution);
static bool TaskExecutionReadyToStart(TaskExecution *taskExecution);
static bool TaskExecutionCompleted(TaskExecution *taskExecution);
static void CancelTaskExecutionIfActive(TaskExecution *taskExecution);
static void CancelRequestIfActive(TaskExecStatus taskStatus, int connectionId);
/* Worker node state hash functions */
static HTAB * WorkerHash(const char *workerHashName, List *workerNodeList);
static HTAB * WorkerHashCreate(const char *workerHashName, uint32 workerHashSize);
static WorkerNodeState * WorkerHashEnter(HTAB *workerHash,
char *nodeName, uint32 nodePort);
static WorkerNodeState * WorkerHashLookup(HTAB *workerHash,
const char *nodeName, uint32 nodePort);
static WorkerNodeState * LookupWorkerForTask(HTAB *workerHash, Task *task,
TaskExecution *taskExecution);
/* Throttling functions */
static bool WorkerConnectionsExhausted(WorkerNodeState *workerNodeState);
static bool MasterConnectionsExhausted(HTAB *workerHash);
static uint32 TotalOpenConnectionCount(HTAB *workerHash);
static void UpdateConnectionCounter(WorkerNodeState *workerNode,
ConnectAction connectAction);
/*
* MultiRealTimeExecute loops over the given tasks, and manages their execution
* until either one task permanently fails or all tasks successfully complete.
* The function opens up a connection for each task it needs to execute, and
* manages these tasks' execution in real-time.
*/
void
MultiRealTimeExecute(Job *job)
{
List *taskList = job->taskList;
List *taskExecutionList = NIL;
ListCell *taskExecutionCell = NULL;
ListCell *taskCell = NULL;
uint32 failedTaskId = 0;
bool allTasksCompleted = false;
bool taskCompleted = false;
bool taskFailed = false;
List *workerNodeList = NIL;
HTAB *workerHash = NULL;
const char *workerHashName = "Worker node hash";
workerNodeList = WorkerNodeList();
workerHash = WorkerHash(workerHashName, workerNodeList);
/* initialize task execution structures for remote execution */
foreach(taskCell, taskList)
{
Task *task = (Task *) lfirst(taskCell);
TaskExecution *taskExecution = InitTaskExecution(task, EXEC_TASK_CONNECT_START);
taskExecutionList = lappend(taskExecutionList, taskExecution);
}
/* loop around until all tasks complete, one task fails, or user cancels */
while (!(allTasksCompleted || taskFailed || QueryCancelPending))
{
uint32 taskCount = list_length(taskList);
uint32 completedTaskCount = 0;
/* loop around all tasks and manage them */
ListCell *taskCell = NULL;
ListCell *taskExecutionCell = NULL;
forboth(taskCell, taskList, taskExecutionCell, taskExecutionList)
{
Task *task = (Task *) lfirst(taskCell);
TaskExecution *taskExecution = (TaskExecution *) lfirst(taskExecutionCell);
ConnectAction connectAction = CONNECT_ACTION_NONE;
WorkerNodeState *workerNodeState = NULL;
workerNodeState = LookupWorkerForTask(workerHash, task, taskExecution);
/* in case the task is about to start, throttle if necessary */
if (TaskExecutionReadyToStart(taskExecution) &&
(WorkerConnectionsExhausted(workerNodeState) ||
MasterConnectionsExhausted(workerHash)))
{
continue;
}
/* call the function that performs the core task execution logic */
connectAction = ManageTaskExecution(task, taskExecution);
/* update the connection counter for throttling */
UpdateConnectionCounter(workerNodeState, connectAction);
/*
* If this task failed, we need to iterate over task executions, and
* manually clean out their client-side resources. Hence, we record
* the failure here instead of immediately erroring out.
*/
taskFailed = TaskExecutionFailed(taskExecution);
if (taskFailed)
{
failedTaskId = taskExecution->taskId;
break;
}
taskCompleted = TaskExecutionCompleted(taskExecution);
if (taskCompleted)
{
completedTaskCount++;
}
}
/* check if all tasks completed; otherwise sleep to avoid tight loop */
if (completedTaskCount == taskCount)
{
allTasksCompleted = true;
}
else
{
long sleepIntervalPerCycle = RemoteTaskCheckInterval * 1000L;
pg_usleep(sleepIntervalPerCycle);
}
}
/*
* We prevent cancel/die interrupts until we clean up connections to worker
* nodes. Note that for the above while loop, if the user Ctrl+C's a query
* and we emit a warning before looping to the beginning of the while loop,
 * we would get canceled before we could hold any interrupts.
*/
HOLD_INTERRUPTS();
/* cancel any active task executions */
taskExecutionCell = NULL;
foreach(taskExecutionCell, taskExecutionList)
{
TaskExecution *taskExecution = (TaskExecution *) lfirst(taskExecutionCell);
CancelTaskExecutionIfActive(taskExecution);
}
/*
* If cancel might have been sent, give remote backends some time to flush
* their responses. This avoids some broken pipe logs on the backend-side.
*/
if (taskFailed || QueryCancelPending)
{
long sleepInterval = RemoteTaskCheckInterval * 1000L;
pg_usleep(sleepInterval);
}
/* close connections and any open file descriptors */
taskExecutionCell = NULL;
foreach(taskExecutionCell, taskExecutionList)
{
TaskExecution *taskExecution = (TaskExecution *) lfirst(taskExecutionCell);
CleanupTaskExecution(taskExecution);
}
RESUME_INTERRUPTS();
/*
* If we previously broke out of the execution loop due to a task failure or
* user cancellation request, we can now safely emit an error message (all
* client-side resources have been cleared).
*/
if (taskFailed)
{
ereport(ERROR, (errmsg("failed to execute job " UINT64_FORMAT, job->jobId),
errdetail("Failure due to failed task %u", failedTaskId)));
}
else if (QueryCancelPending)
{
CHECK_FOR_INTERRUPTS();
}
}
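/*
 * An overview of the per-task state machine driven by ManageTaskExecution
 * below. On failure at any stage, the task moves to EXEC_TASK_FAILED, which
 * closes the connection and restarts at EXEC_TASK_CONNECT_START on the next
 * placement:
 *
 *   EXEC_TASK_CONNECT_START   -> EXEC_TASK_CONNECT_POLL
 *   EXEC_TASK_CONNECT_POLL    -> EXEC_FETCH_TASK_LOOP
 *   EXEC_FETCH_TASK_LOOP      -> EXEC_FETCH_TASK_START   (fetch tasks remain)
 *                             -> EXEC_COMPUTE_TASK_START (all fetches done)
 *   EXEC_FETCH_TASK_START     -> EXEC_FETCH_TASK_RUNNING
 *   EXEC_FETCH_TASK_RUNNING   -> EXEC_FETCH_TASK_LOOP
 *   EXEC_COMPUTE_TASK_START   -> EXEC_COMPUTE_TASK_RUNNING
 *   EXEC_COMPUTE_TASK_RUNNING -> EXEC_COMPUTE_TASK_COPYING
 *   EXEC_COMPUTE_TASK_COPYING -> EXEC_TASK_DONE
 */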
/*
* ManageTaskExecution manages all execution logic for the given task. For this,
* the function starts a new "execution" on a node, and tracks this execution's
* progress. On failure, the function restarts this execution on another node.
* Note that this function directly manages a task's execution by opening up a
* separate connection to the worker node for each execution. The function
* returns a ConnectAction enum indicating whether a connection has been opened
* or closed in this call.
*/
static ConnectAction
ManageTaskExecution(Task *task, TaskExecution *taskExecution)
{
TaskExecStatus *taskStatusArray = taskExecution->taskStatusArray;
int32 *connectionIdArray = taskExecution->connectionIdArray;
int32 *fileDescriptorArray = taskExecution->fileDescriptorArray;
uint32 currentIndex = taskExecution->currentNodeIndex;
TaskExecStatus currentStatus = taskStatusArray[currentIndex];
List *taskPlacementList = task->taskPlacementList;
ShardPlacement *taskPlacement = list_nth(taskPlacementList, currentIndex);
char *nodeName = taskPlacement->nodeName;
uint32 nodePort = taskPlacement->nodePort;
ConnectAction connectAction = CONNECT_ACTION_NONE;
switch (currentStatus)
{
case EXEC_TASK_CONNECT_START:
{
int32 connectionId = INVALID_CONNECTION_ID;
char *nodeDatabase = NULL;
/* we use the same database name on the master and worker nodes */
nodeDatabase = get_database_name(MyDatabaseId);
connectionId = MultiClientConnectStart(nodeName, nodePort, nodeDatabase);
connectionIdArray[currentIndex] = connectionId;
/* if valid, poll the connection until the connection is initiated */
if (connectionId != INVALID_CONNECTION_ID)
{
taskStatusArray[currentIndex] = EXEC_TASK_CONNECT_POLL;
taskExecution->connectPollCount = 0;
connectAction = CONNECT_ACTION_OPENED;
}
else
{
AdjustStateForFailure(taskExecution);
}
break;
}
case EXEC_TASK_CONNECT_POLL:
{
int32 connectionId = connectionIdArray[currentIndex];
ConnectStatus pollStatus = MultiClientConnectPoll(connectionId);
/*
* If the connection is established, we reset the data fetch counter and
* change our status to data fetching.
*/
if (pollStatus == CLIENT_CONNECTION_READY)
{
taskExecution->dataFetchTaskIndex = -1;
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_LOOP;
}
else if (pollStatus == CLIENT_CONNECTION_BUSY)
{
taskStatusArray[currentIndex] = EXEC_TASK_CONNECT_POLL;
}
else if (pollStatus == CLIENT_CONNECTION_BAD)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
/* now check if we have been trying to connect for too long */
taskExecution->connectPollCount++;
if (pollStatus == CLIENT_CONNECTION_BUSY)
{
uint32 maxCount = REMOTE_NODE_CONNECT_TIMEOUT / RemoteTaskCheckInterval;
uint32 currentCount = taskExecution->connectPollCount;
if (currentCount >= maxCount)
{
ereport(WARNING, (errmsg("could not establish asynchronous connection "
"after %u ms", REMOTE_NODE_CONNECT_TIMEOUT)));
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
}
break;
}
case EXEC_TASK_FAILED:
{
/*
* On task failure, we close the connection. We also reset our execution
* status assuming that we might fail on all other worker nodes and come
* back to this failed node. In that case, we will retry the same fetch
* and compute task(s) on this node again.
*/
int32 connectionId = connectionIdArray[currentIndex];
MultiClientDisconnect(connectionId);
connectionIdArray[currentIndex] = INVALID_CONNECTION_ID;
connectAction = CONNECT_ACTION_CLOSED;
taskStatusArray[currentIndex] = EXEC_TASK_CONNECT_START;
/* try next worker node */
AdjustStateForFailure(taskExecution);
break;
}
case EXEC_FETCH_TASK_LOOP:
{
List *dataFetchTaskList = task->dependedTaskList;
int32 dataFetchTaskCount = list_length(dataFetchTaskList);
/* move to the next data fetch task */
taskExecution->dataFetchTaskIndex++;
if (taskExecution->dataFetchTaskIndex < dataFetchTaskCount)
{
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_START;
}
else
{
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_START;
}
break;
}
case EXEC_FETCH_TASK_START:
{
List *dataFetchTaskList = task->dependedTaskList;
int32 dataFetchTaskIndex = taskExecution->dataFetchTaskIndex;
Task *dataFetchTask = (Task *) list_nth(dataFetchTaskList, dataFetchTaskIndex);
char *dataFetchQuery = dataFetchTask->queryString;
int32 connectionId = connectionIdArray[currentIndex];
bool querySent = MultiClientSendQuery(connectionId, dataFetchQuery);
if (querySent)
{
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_RUNNING;
}
else
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
break;
}
case EXEC_FETCH_TASK_RUNNING:
{
int32 connectionId = connectionIdArray[currentIndex];
ResultStatus resultStatus = MultiClientResultStatus(connectionId);
QueryStatus queryStatus = CLIENT_INVALID_QUERY;
/* check if query results are in progress or unavailable */
if (resultStatus == CLIENT_RESULT_BUSY)
{
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_RUNNING;
break;
}
else if (resultStatus == CLIENT_RESULT_UNAVAILABLE)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
break;
}
Assert(resultStatus == CLIENT_RESULT_READY);
/*
* If the query executed successfully, loop onto the next data fetch
* task. Else if the query failed, try data fetching on another node.
*/
queryStatus = MultiClientQueryStatus(connectionId);
if (queryStatus == CLIENT_QUERY_DONE)
{
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_LOOP;
}
else if (queryStatus == CLIENT_QUERY_FAILED)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
else
{
ereport(FATAL, (errmsg("invalid query status: %d", queryStatus)));
}
break;
}
case EXEC_COMPUTE_TASK_START:
{
int32 connectionId = connectionIdArray[currentIndex];
bool querySent = false;
/* construct new query to copy query results to stdout */
char *queryString = task->queryString;
StringInfo computeTaskQuery = makeStringInfo();
if (BinaryMasterCopyFormat)
{
appendStringInfo(computeTaskQuery, COPY_QUERY_TO_STDOUT_BINARY, queryString);
}
else
{
appendStringInfo(computeTaskQuery, COPY_QUERY_TO_STDOUT_TEXT, queryString);
}
querySent = MultiClientSendQuery(connectionId, computeTaskQuery->data);
if (querySent)
{
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_RUNNING;
}
else
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
break;
}
case EXEC_COMPUTE_TASK_RUNNING:
{
int32 connectionId = connectionIdArray[currentIndex];
ResultStatus resultStatus = MultiClientResultStatus(connectionId);
QueryStatus queryStatus = CLIENT_INVALID_QUERY;
/* check if query results are in progress or unavailable */
if (resultStatus == CLIENT_RESULT_BUSY)
{
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_RUNNING;
break;
}
else if (resultStatus == CLIENT_RESULT_UNAVAILABLE)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
break;
}
Assert(resultStatus == CLIENT_RESULT_READY);
/* check if our request to copy query results has been acknowledged */
queryStatus = MultiClientQueryStatus(connectionId);
if (queryStatus == CLIENT_QUERY_COPY)
{
StringInfo jobDirectoryName = JobDirectoryName(task->jobId);
StringInfo taskFilename = TaskFilename(jobDirectoryName, task->taskId);
char *filename = taskFilename->data;
int fileFlags = (O_APPEND | O_CREAT | O_RDWR | O_TRUNC | PG_BINARY);
int fileMode = (S_IRUSR | S_IWUSR);
int32 fileDescriptor = BasicOpenFile(filename, fileFlags, fileMode);
if (fileDescriptor >= 0)
{
/*
* All files inside the job directory get automatically cleaned
* up on transaction commit or abort.
*/
fileDescriptorArray[currentIndex] = fileDescriptor;
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_COPYING;
}
else
{
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", filename)));
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
}
else if (queryStatus == CLIENT_QUERY_FAILED)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
else
{
ereport(FATAL, (errmsg("invalid query status: %d", queryStatus)));
}
break;
}
case EXEC_COMPUTE_TASK_COPYING:
{
int32 connectionId = connectionIdArray[currentIndex];
int32 fileDesc = fileDescriptorArray[currentIndex];
int closed = -1;
/* copy data from worker node, and write to local file */
CopyStatus copyStatus = MultiClientCopyData(connectionId, fileDesc);
/* if worker node will continue to send more data, keep reading */
if (copyStatus == CLIENT_COPY_MORE)
{
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_COPYING;
}
else if (copyStatus == CLIENT_COPY_DONE)
{
closed = close(fileDesc);
fileDescriptorArray[currentIndex] = -1;
if (closed >= 0)
{
taskStatusArray[currentIndex] = EXEC_TASK_DONE;
/* we are done executing; we no longer need the connection */
MultiClientDisconnect(connectionId);
connectionIdArray[currentIndex] = INVALID_CONNECTION_ID;
connectAction = CONNECT_ACTION_CLOSED;
}
else
{
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not close copied file: %m")));
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
}
else if (copyStatus == CLIENT_COPY_FAILED)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
closed = close(fileDesc);
fileDescriptorArray[currentIndex] = -1;
if (closed < 0)
{
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not close copy file: %m")));
}
}
break;
}
case EXEC_TASK_DONE:
{
/* we are done with this task's execution */
break;
}
default:
{
/* we fatal here to avoid leaking client-side resources */
ereport(FATAL, (errmsg("invalid execution status: %d", currentStatus)));
break;
}
}
return connectAction;
}
/* Determines if the given task is ready to start. */
static bool
TaskExecutionReadyToStart(TaskExecution *taskExecution)
{
bool readyToStart = false;
TaskExecStatus *taskStatusArray = taskExecution->taskStatusArray;
uint32 currentIndex = taskExecution->currentNodeIndex;
TaskExecStatus taskStatus = taskStatusArray[currentIndex];
if (taskStatus == EXEC_TASK_CONNECT_START)
{
readyToStart = true;
}
return readyToStart;
}
/* Determines if the given task successfully completed executing. */
static bool
TaskExecutionCompleted(TaskExecution *taskExecution)
{
bool completed = false;
uint32 nodeIndex = 0;
for (nodeIndex = 0; nodeIndex < taskExecution->nodeCount; nodeIndex++)
{
TaskExecStatus taskStatus = taskExecution->taskStatusArray[nodeIndex];
if (taskStatus == EXEC_TASK_DONE)
{
completed = true;
break;
}
}
return completed;
}
/* Iterates over all open connections, and cancels any active requests. */
static void
CancelTaskExecutionIfActive(TaskExecution *taskExecution)
{
uint32 nodeIndex = 0;
for (nodeIndex = 0; nodeIndex < taskExecution->nodeCount; nodeIndex++)
{
int32 connectionId = taskExecution->connectionIdArray[nodeIndex];
if (connectionId != INVALID_CONNECTION_ID)
{
TaskExecStatus *taskStatusArray = taskExecution->taskStatusArray;
TaskExecStatus taskStatus = taskStatusArray[nodeIndex];
CancelRequestIfActive(taskStatus, connectionId);
}
}
}
/* Helper function to cancel an ongoing request, if any. */
static void
CancelRequestIfActive(TaskExecStatus taskStatus, int connectionId)
{
/*
* We use the task status to determine if we have an active request being
* processed by the worker node. If we do, we send a cancellation request.
* Note that we don't cancel data fetch tasks, and allow them to complete.
*/
if (taskStatus == EXEC_COMPUTE_TASK_RUNNING)
{
ResultStatus resultStatus = MultiClientResultStatus(connectionId);
if (resultStatus == CLIENT_RESULT_BUSY)
{
MultiClientCancel(connectionId);
}
}
else if (taskStatus == EXEC_COMPUTE_TASK_COPYING)
{
MultiClientCancel(connectionId);
}
}
/*
* WorkerHash creates a worker node hash with the given name. The function
* then inserts one entry for each worker node in the given worker node
* list.
*/
static HTAB *
WorkerHash(const char *workerHashName, List *workerNodeList)
{
uint32 workerHashSize = list_length(workerNodeList);
HTAB *workerHash = WorkerHashCreate(workerHashName, workerHashSize);
ListCell *workerNodeCell = NULL;
foreach(workerNodeCell, workerNodeList)
{
WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell);
char *nodeName = workerNode->workerName;
uint32 nodePort = workerNode->workerPort;
WorkerHashEnter(workerHash, nodeName, nodePort);
}
return workerHash;
}
/*
* WorkerHashCreate allocates memory for a worker node hash, initializes an
* empty hash, and returns this hash.
*/
static HTAB *
WorkerHashCreate(const char *workerHashName, uint32 workerHashSize)
{
HASHCTL info;
int hashFlags = 0;
HTAB *workerHash = NULL;
memset(&info, 0, sizeof(info));
info.keysize = WORKER_LENGTH + sizeof(uint32);
info.entrysize = sizeof(WorkerNodeState);
info.hash = tag_hash;
info.hcxt = CurrentMemoryContext;
hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
workerHash = hash_create(workerHashName, workerHashSize, &info, hashFlags);
if (workerHash == NULL)
{
ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("could not initialize worker node hash")));
}
return workerHash;
}
/*
* WorkerHashEnter creates a new worker node entry in the given worker node
* hash, and checks that the worker node entry has been properly created.
*/
static WorkerNodeState *
WorkerHashEnter(HTAB *workerHash, char *nodeName, uint32 nodePort)
{
bool handleFound = false;
WorkerNodeState *workerNodeState = NULL;
WorkerNodeState workerNodeKey;
memset(&workerNodeKey, 0, sizeof(WorkerNodeState));
strlcpy(workerNodeKey.workerName, nodeName, WORKER_LENGTH);
workerNodeKey.workerPort = nodePort;
workerNodeState = (WorkerNodeState *) hash_search(workerHash, (void *) &workerNodeKey,
HASH_ENTER, &handleFound);
if (handleFound)
{
ereport(WARNING, (errmsg("multiple worker node state entries for node: \"%s:%u\"",
nodeName, nodePort)));
}
memcpy(workerNodeState, &workerNodeKey, sizeof(WorkerNodeState));
workerNodeState->openConnectionCount = 0;
return workerNodeState;
}
/*
* WorkerHashLookup looks for the worker node state that corresponds to the given
* node name and port number, and returns the found worker node state if any.
*/
static WorkerNodeState *
WorkerHashLookup(HTAB *workerHash, const char *nodeName, uint32 nodePort)
{
bool handleFound = false;
WorkerNodeState *workerNodeState = NULL;
WorkerNodeState workerNodeKey;
memset(&workerNodeKey, 0, sizeof(WorkerNodeState));
strlcpy(workerNodeKey.workerName, nodeName, WORKER_LENGTH);
workerNodeKey.workerPort = nodePort;
workerNodeState = (WorkerNodeState *) hash_search(workerHash, (void *) &workerNodeKey,
HASH_FIND, &handleFound);
if (workerNodeState == NULL)
{
ereport(ERROR, (errmsg("could not find worker node state for node \"%s:%u\"",
nodeName, nodePort)));
}
return workerNodeState;
}
/*
* LookupWorkerForTask looks for the worker node state of the current worker
* node of a task execution.
*/
static WorkerNodeState *
LookupWorkerForTask(HTAB *workerHash, Task *task, TaskExecution *taskExecution)
{
uint32 currentIndex = taskExecution->currentNodeIndex;
List *taskPlacementList = task->taskPlacementList;
ShardPlacement *taskPlacement = list_nth(taskPlacementList, currentIndex);
char *nodeName = taskPlacement->nodeName;
uint32 nodePort = taskPlacement->nodePort;
WorkerNodeState *workerNodeState = WorkerHashLookup(workerHash, nodeName, nodePort);
return workerNodeState;
}
/*
* WorkerConnectionsExhausted determines if the current query has exhausted the
* maximum number of open connections that can be made to a worker.
*/
static bool
WorkerConnectionsExhausted(WorkerNodeState *workerNodeState)
{
bool reachedLimit = false;
/*
* A worker cannot accept more than max_connections connections. If we have a
* small number of workers with many shards, then a single query could exhaust
* max_connections unless we throttle here. We use the value of max_connections
* on the master as a proxy for the worker configuration to avoid introducing a
* new configuration value.
*/
if (workerNodeState->openConnectionCount >= MaxConnections)
{
reachedLimit = true;
}
return reachedLimit;
}
/*
* MasterConnectionsExhausted determines if the current query has exhausted
* the maximum number of connections the master process can make.
*/
static bool
MasterConnectionsExhausted(HTAB *workerHash)
{
bool reachedLimit = false;
uint32 maxConnectionCount = MaxMasterConnectionCount();
uint32 totalConnectionCount = TotalOpenConnectionCount(workerHash);
if (totalConnectionCount >= maxConnectionCount)
{
reachedLimit = true;
}
return reachedLimit;
}
/*
* TotalOpenConnectionCount counts the total number of open connections across all the
* workers.
*/
static uint32
TotalOpenConnectionCount(HTAB *workerHash)
{
uint32 connectionCount = 0;
WorkerNodeState *workerNodeState = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, workerHash);
workerNodeState = (WorkerNodeState *) hash_seq_search(&status);
while (workerNodeState != NULL)
{
connectionCount += workerNodeState->openConnectionCount;
workerNodeState = (WorkerNodeState *) hash_seq_search(&status);
}
return connectionCount;
}
/*
* UpdateConnectionCounter updates the connection counter for a given worker
* node based on the specified connect action.
*/
static void
UpdateConnectionCounter(WorkerNodeState *workerNode, ConnectAction connectAction)
{
if (connectAction == CONNECT_ACTION_OPENED)
{
workerNode->openConnectionCount++;
}
else if (connectAction == CONNECT_ACTION_CLOSED)
{
workerNode->openConnectionCount--;
}
}

@ -0,0 +1,563 @@
/*
* multi_router_executor.c
*
* Routines for executing remote tasks as part of a distributed execution plan
* with synchronous connections. The routines utilize the connection cache.
* Therefore, only a single connection is opened for each worker. Also, router
* executor does not require a master table and a master query. In other words,
 * the results that are fetched from a single worker are sent to the output console
* directly. Lastly, router executor can only execute a single task.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include "funcapi.h"
#include "libpq-fe.h"
#include "miscadmin.h"
#include "access/xact.h"
#include "distributed/connection_cache.h"
#include "distributed/listutils.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/resource_lock.h"
#include "executor/executor.h"
#include "nodes/pg_list.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/memutils.h"
#include "utils/palloc.h"
/* controls use of locks to enforce safe commutativity */
bool AllModificationsCommutative = false;
static LOCKMODE CommutativityRuleToLockMode(CmdType commandType, bool upsertQuery);
static void AcquireExecutorShardLock(Task *task, LOCKMODE lockMode);
static int32 ExecuteDistributedModify(Task *task);
static void ExecuteSingleShardSelect(Task *task, EState *executorState,
TupleDesc tupleDescriptor,
DestReceiver *destination);
static bool SendQueryInSingleRowMode(PGconn *connection, char *query);
static bool StoreQueryResult(PGconn *connection, TupleDesc tupleDescriptor,
Tuplestorestate *tupleStore);
/*
* RouterExecutorStart sets up the executor state and queryDesc for router
* execution.
*/
void
RouterExecutorStart(QueryDesc *queryDesc, int eflags, Task *task)
{
bool topLevel = true;
LOCKMODE lockMode = NoLock;
EState *executorState = NULL;
CmdType commandType = queryDesc->operation;
/* ensure that the task is not NULL */
Assert(task != NULL);
/* disallow transactions and triggers during distributed commands */
PreventTransactionChain(topLevel, "distributed commands");
eflags |= EXEC_FLAG_SKIP_TRIGGERS;
/* signal that it is a router execution */
eflags |= EXEC_FLAG_CITUS_ROUTER_EXECUTOR;
/* build empty executor state to obtain per-query memory context */
executorState = CreateExecutorState();
executorState->es_top_eflags = eflags;
executorState->es_instrument = queryDesc->instrument_options;
queryDesc->estate = executorState;
#if (PG_VERSION_NUM < 90500)
/* make sure that upsertQuery is false for versions in which UPSERT is not available */
Assert(task->upsertQuery == false);
#endif
lockMode = CommutativityRuleToLockMode(commandType, task->upsertQuery);
if (lockMode != NoLock)
{
AcquireExecutorShardLock(task, lockMode);
}
}
/*
* CommutativityRuleToLockMode determines the commutativity rule for the given
* command and returns the appropriate lock mode to enforce that rule. The
* function assumes a SELECT doesn't modify state and therefore is commutative
* with all other commands. The function also assumes that an INSERT commutes
* with another INSERT, but not with an UPDATE/DELETE/UPSERT; and an
* UPDATE/DELETE/UPSERT doesn't commute with an INSERT, UPDATE, DELETE or UPSERT.
*
* Note that the above comment defines INSERT INTO ... ON CONFLICT type of queries
* as an UPSERT. Since UPSERT is not defined as a separate command type in postgres,
* we have to pass it as a second parameter to the function.
*
* The above mapping is overridden entirely when all_modifications_commutative
* is set to true. In that case, all commands just claim a shared lock. This
* allows the shard repair logic to lock out modifications while permitting all
* commands to otherwise commute.
*/
static LOCKMODE
CommutativityRuleToLockMode(CmdType commandType, bool upsertQuery)
{
LOCKMODE lockMode = NoLock;
/* bypass commutativity checks when flag enabled */
if (AllModificationsCommutative)
{
return ShareLock;
}
if (commandType == CMD_SELECT)
{
lockMode = NoLock;
}
else if (upsertQuery)
{
lockMode = ExclusiveLock;
}
else if (commandType == CMD_INSERT)
{
lockMode = ShareLock;
}
else if (commandType == CMD_UPDATE || commandType == CMD_DELETE)
{
lockMode = ExclusiveLock;
}
else
{
ereport(ERROR, (errmsg("unrecognized operation code: %d", (int) commandType)));
}
return lockMode;
}
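/*
 * The mapping implemented above, in summary:
 *
 *   SELECT                   -> NoLock
 *   INSERT                   -> ShareLock
 *   UPDATE / DELETE / UPSERT -> ExclusiveLock
 *   any command, when all_modifications_commutative is set -> ShareLock
 *
 * For example, two concurrent INSERTs into the same shard both take
 * ShareLock and therefore proceed in parallel, whereas an UPDATE takes
 * ExclusiveLock and is serialized against all other modifications of that
 * shard.
 */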
/*
 * AcquireExecutorShardLock acquires the shard lock needed for the execution
 * of a single task within a distributed plan.
*/
static void
AcquireExecutorShardLock(Task *task, LOCKMODE lockMode)
{
int64 shardId = task->shardId;
LockShardResource(shardId, lockMode);
}
/*
* RouterExecutorRun actually executes a single task on a worker.
*/
void
RouterExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count, Task *task)
{
EState *estate = queryDesc->estate;
CmdType operation = queryDesc->operation;
MemoryContext oldcontext = NULL;
Assert(estate != NULL);
Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY));
Assert(task != NULL);
/* we only support default scan direction and row fetch count */
if (!ScanDirectionIsForward(direction))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("scan directions other than forward scans "
"are unsupported")));
}
if (count != 0)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("fetching rows from a query using a cursor "
"is unsupported")));
}
oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
if (queryDesc->totaltime != NULL)
{
InstrStartNode(queryDesc->totaltime);
}
if (operation == CMD_INSERT || operation == CMD_UPDATE ||
operation == CMD_DELETE)
{
int32 affectedRowCount = ExecuteDistributedModify(task);
estate->es_processed = affectedRowCount;
}
else if (operation == CMD_SELECT)
{
DestReceiver *destination = queryDesc->dest;
TupleDesc resultTupleDescriptor = queryDesc->tupDesc;
ExecuteSingleShardSelect(task, estate, resultTupleDescriptor, destination);
}
else
{
ereport(ERROR, (errmsg("unrecognized operation code: %d",
(int) operation)));
}
if (queryDesc->totaltime != NULL)
{
InstrStopNode(queryDesc->totaltime, estate->es_processed);
}
MemoryContextSwitchTo(oldcontext);
}
/*
* ExecuteDistributedModify is the main entry point for modifying distributed
* tables. A distributed modification is successful if any placement of the
* distributed table is successful. ExecuteDistributedModify returns the number
* of modified rows in that case and errors in all others. This function will
* also generate warnings for individual placement failures.
*/
static int32
ExecuteDistributedModify(Task *task)
{
int32 affectedTupleCount = -1;
ListCell *taskPlacementCell = NULL;
List *failedPlacementList = NIL;
ListCell *failedPlacementCell = NULL;
foreach(taskPlacementCell, task->taskPlacementList)
{
ShardPlacement *taskPlacement = (ShardPlacement *) lfirst(taskPlacementCell);
char *nodeName = taskPlacement->nodeName;
int32 nodePort = taskPlacement->nodePort;
PGconn *connection = NULL;
PGresult *result = NULL;
char *currentAffectedTupleString = NULL;
int32 currentAffectedTupleCount = -1;
Assert(taskPlacement->shardState == FILE_FINALIZED);
connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
failedPlacementList = lappend(failedPlacementList, taskPlacement);
continue;
}
result = PQexec(connection, task->queryString);
if (PQresultStatus(result) != PGRES_COMMAND_OK)
{
ReportRemoteError(connection, result);
PQclear(result);
failedPlacementList = lappend(failedPlacementList, taskPlacement);
continue;
}
currentAffectedTupleString = PQcmdTuples(result);
currentAffectedTupleCount = pg_atoi(currentAffectedTupleString, sizeof(int32), 0);
if ((affectedTupleCount == -1) ||
(affectedTupleCount == currentAffectedTupleCount))
{
affectedTupleCount = currentAffectedTupleCount;
}
else
{
ereport(WARNING, (errmsg("modified %d tuples, but expected to modify %d",
currentAffectedTupleCount, affectedTupleCount),
errdetail("modified placement on %s:%d",
nodeName, nodePort)));
}
PQclear(result);
}
/* if all placements failed, error out */
if (list_length(failedPlacementList) == list_length(task->taskPlacementList))
{
ereport(ERROR, (errmsg("could not modify any active placements")));
}
/* otherwise, mark failed placements as inactive: they're stale */
foreach(failedPlacementCell, failedPlacementList)
{
ShardPlacement *failedPlacement = (ShardPlacement *) lfirst(failedPlacementCell);
uint64 shardLength = 0;
DeleteShardPlacementRow(failedPlacement->shardId, failedPlacement->nodeName,
failedPlacement->nodePort);
InsertShardPlacementRow(failedPlacement->shardId, FILE_INACTIVE, shardLength,
failedPlacement->nodeName, failedPlacement->nodePort);
}
return affectedTupleCount;
}
/*
* ExecuteSingleShardSelect executes the remote select query and sends the
* resultant tuples to the given destination receiver. If the query fails on a
* given placement, the function attempts it on its replica.
*/
static void
ExecuteSingleShardSelect(Task *task, EState *executorState,
TupleDesc tupleDescriptor, DestReceiver *destination)
{
Tuplestorestate *tupleStore = NULL;
bool resultsOK = false;
TupleTableSlot *tupleTableSlot = NULL;
tupleStore = tuplestore_begin_heap(false, false, work_mem);
resultsOK = ExecuteTaskAndStoreResults(task, tupleDescriptor, tupleStore);
if (!resultsOK)
{
ereport(ERROR, (errmsg("could not receive query results")));
}
tupleTableSlot = MakeSingleTupleTableSlot(tupleDescriptor);
/* startup the tuple receiver */
(*destination->rStartup)(destination, CMD_SELECT, tupleDescriptor);
/* iterate over tuples in tuple store, and send them to destination */
for (;;)
{
bool nextTuple = tuplestore_gettupleslot(tupleStore, true, false, tupleTableSlot);
if (!nextTuple)
{
break;
}
(*destination->receiveSlot)(tupleTableSlot, destination);
executorState->es_processed++;
ExecClearTuple(tupleTableSlot);
}
/* shutdown the tuple receiver */
(*destination->rShutdown)(destination);
ExecDropSingleTupleTableSlot(tupleTableSlot);
tuplestore_end(tupleStore);
}
/*
* ExecuteTaskAndStoreResults executes the task on the remote node, retrieves
* the results and stores them in the given tuple store. If the task fails on
* one of the placements, the function retries it on other placements.
*/
bool
ExecuteTaskAndStoreResults(Task *task, TupleDesc tupleDescriptor,
Tuplestorestate *tupleStore)
{
bool resultsOK = false;
List *taskPlacementList = task->taskPlacementList;
ListCell *taskPlacementCell = NULL;
/*
 * Try to run the query to completion on one placement. If the query fails,
 * attempt the query on the next placement.
*/
foreach(taskPlacementCell, taskPlacementList)
{
ShardPlacement *taskPlacement = (ShardPlacement *) lfirst(taskPlacementCell);
char *nodeName = taskPlacement->nodeName;
int32 nodePort = taskPlacement->nodePort;
bool queryOK = false;
bool storedOK = false;
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
continue;
}
queryOK = SendQueryInSingleRowMode(connection, task->queryString);
if (!queryOK)
{
PurgeConnection(connection);
continue;
}
storedOK = StoreQueryResult(connection, tupleDescriptor, tupleStore);
if (storedOK)
{
resultsOK = true;
break;
}
else
{
tuplestore_clear(tupleStore);
PurgeConnection(connection);
}
}
return resultsOK;
}
/*
* SendQueryInSingleRowMode sends the given query on the connection in an
* asynchronous way. The function also sets the single-row mode on the
* connection so that we receive results a row at a time.
*/
static bool
SendQueryInSingleRowMode(PGconn *connection, char *query)
{
int querySent = 0;
int singleRowMode = 0;
querySent = PQsendQuery(connection, query);
if (querySent == 0)
{
ReportRemoteError(connection, NULL);
return false;
}
singleRowMode = PQsetSingleRowMode(connection);
if (singleRowMode == 0)
{
ReportRemoteError(connection, NULL);
return false;
}
return true;
}
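/*
* A minimal sketch (illustrative only, not used by the executor) of the
* consumption loop that single-row mode implies; StoreQueryResult below
* implements the full version. After PQsetSingleRowMode() succeeds,
* PQgetResult() returns one PGRES_SINGLE_TUPLE result per row, then a final
* PGRES_TUPLES_OK result with zero rows, and finally NULL.
*/
#ifdef NOT_USED
static void
ConsumeSingleRowResults(PGconn *connection)
{
PGresult *result = NULL;
while ((result = PQgetResult(connection)) != NULL)
{
if (PQresultStatus(result) == PGRES_SINGLE_TUPLE)
{
/* one row is available here via PQgetvalue(result, 0, columnIndex) */
}
PQclear(result);
}
}
#endif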
/*
* StoreQueryResult gets the query results from the given connection, builds
* tuples from the results and stores them in the given tuple-store. If the
* function can't receive query results, it returns false. Note that this
* function assumes the query has already been sent on the connection and the
* tuplestore has earlier been initialized.
*/
static bool
StoreQueryResult(PGconn *connection, TupleDesc tupleDescriptor,
Tuplestorestate *tupleStore)
{
AttInMetadata *attributeInputMetadata = TupleDescGetAttInMetadata(tupleDescriptor);
uint32 expectedColumnCount = tupleDescriptor->natts;
char **columnArray = (char **) palloc0(expectedColumnCount * sizeof(char *));
MemoryContext ioContext = AllocSetContextCreate(CurrentMemoryContext,
"StoreQueryResult",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
Assert(tupleStore != NULL);
for (;;)
{
uint32 rowIndex = 0;
uint32 columnIndex = 0;
uint32 rowCount = 0;
uint32 columnCount = 0;
ExecStatusType resultStatus = 0;
PGresult *result = PQgetResult(connection);
if (result == NULL)
{
break;
}
resultStatus = PQresultStatus(result);
if ((resultStatus != PGRES_SINGLE_TUPLE) && (resultStatus != PGRES_TUPLES_OK))
{
ReportRemoteError(connection, result);
PQclear(result);
return false;
}
rowCount = PQntuples(result);
columnCount = PQnfields(result);
Assert(columnCount == expectedColumnCount);
for (rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
HeapTuple heapTuple = NULL;
MemoryContext oldContext = NULL;
memset(columnArray, 0, columnCount * sizeof(char *));
for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
if (PQgetisnull(result, rowIndex, columnIndex))
{
columnArray[columnIndex] = NULL;
}
else
{
columnArray[columnIndex] = PQgetvalue(result, rowIndex, columnIndex);
}
}
/*
* Switch to a temporary memory context that we reset after each tuple. This
* protects us from any memory leaks that might be present in I/O functions
* called by BuildTupleFromCStrings.
*/
oldContext = MemoryContextSwitchTo(ioContext);
heapTuple = BuildTupleFromCStrings(attributeInputMetadata, columnArray);
MemoryContextSwitchTo(oldContext);
tuplestore_puttuple(tupleStore, heapTuple);
MemoryContextReset(ioContext);
}
PQclear(result);
}
pfree(columnArray);
return true;
}
/*
* RouterExecutorFinish cleans up after a distributed execution.
*/
void
RouterExecutorFinish(QueryDesc *queryDesc)
{
EState *estate = queryDesc->estate;
Assert(estate != NULL);
estate->es_finished = true;
}
/*
* RouterExecutorEnd cleans up the executor state after a distributed
* execution.
*/
void
RouterExecutorEnd(QueryDesc *queryDesc)
{
EState *estate = queryDesc->estate;
Assert(estate != NULL);
Assert(estate->es_finished);
FreeExecutorState(estate);
queryDesc->estate = NULL;
queryDesc->totaltime = NULL;
}

View File

@ -0,0 +1,315 @@
/*-------------------------------------------------------------------------
*
* multi_server_executor.c
*
* Function definitions for distributed task execution for real-time
* and task-tracker executors, and routines common to both. The common
* routines implement backend-side logic, and they trigger executions
* on the client side via function hooks that they load.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include <unistd.h>
#include "distributed/multi_client_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_resowner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/worker_protocol.h"
int RemoteTaskCheckInterval = 100; /* per cycle sleep interval in millisecs */
int TaskExecutorType = MULTI_EXECUTOR_REAL_TIME; /* distributed executor type */
bool BinaryMasterCopyFormat = false; /* copy data from workers in binary format */
/*
* JobExecutorType selects the executor type for the given multiPlan using the task
* executor type config value. The function then checks if the given multiPlan needs
* more resources than those provided to it by other config values, and issues
* warnings accordingly. If the selected executor type cannot execute the given
* multiPlan, the function errors out.
*/
MultiExecutorType
JobExecutorType(MultiPlan *multiPlan)
{
Job *job = multiPlan->workerJob;
Query *masterQuery = multiPlan->masterQuery;
List *workerTaskList = job->taskList;
List *workerNodeList = WorkerNodeList();
int taskCount = list_length(workerTaskList);
int workerNodeCount = list_length(workerNodeList);
double tasksPerNode = taskCount / ((double) workerNodeCount);
int dependedJobCount = list_length(job->dependedJobList);
MultiExecutorType executorType = TaskExecutorType;
/* check if the first task is a modify task, short-circuit if so */
if (taskCount > 0)
{
Task *firstTask = (Task *) linitial(workerTaskList);
if (firstTask->taskType == MODIFY_TASK)
{
return MULTI_EXECUTOR_ROUTER;
}
}
if (executorType == MULTI_EXECUTOR_REAL_TIME)
{
double reasonableConnectionCount = 0;
/* if we need to open too many connections per worker, warn the user */
if (tasksPerNode >= MaxConnections)
{
ereport(WARNING, (errmsg("this query uses more connections than the "
"configured max_connections limit"),
errhint("Consider increasing max_connections or setting "
"citusdb.task_executor_type to "
"\"task-tracker\".")));
}
/*
* If we need to open too many outgoing connections, warn the user.
* The real-time executor caps the number of tasks it starts at the same limit,
* but we still issue this warning because it degrades performance.
*/
reasonableConnectionCount = MaxMasterConnectionCount();
if (taskCount >= reasonableConnectionCount)
{
ereport(WARNING, (errmsg("this query uses more file descriptors than the "
"configured max_files_per_process limit"),
errhint("Consider increasing max_files_per_process or "
"setting citusdb.task_executor_type to "
"\"task-tracker\".")));
}
/* if we have repartition jobs with real time executor, error out */
if (dependedJobCount > 0)
{
ereport(ERROR, (errmsg("cannot use real time executor with repartition jobs"),
errhint("Set citusdb.task_executor_type to "
"\"task-tracker\".")));
}
}
else if (executorType == MULTI_EXECUTOR_TASK_TRACKER)
{
/* if we have more tasks per node than what can be tracked, warn the user */
if (tasksPerNode >= MaxTrackedTasksPerNode)
{
ereport(WARNING, (errmsg("this query assigns more tasks per node than the "
"configured max_tracked_tasks_per_node limit")));
}
}
else if (executorType == MULTI_EXECUTOR_ROUTER)
{
Task *workerTask = NULL;
List *workerDependentTaskList = NIL;
bool masterQueryHasAggregates = false;
/* if we have repartition jobs with router executor, error out */
if (dependedJobCount > 0)
{
ereport(ERROR, (errmsg("cannot use router executor with repartition jobs"),
errhint("Set citusdb.task_executor_type to "
"\"task-tracker\".")));
}
/* if the query hits more than one shard, error out */
if (taskCount != 1)
{
ereport(ERROR, (errmsg("cannot use router executor with queries that "
"hit multiple shards"),
errhint("Set citusdb.task_executor_type to \"real-time\" or "
"\"task-tracker\".")));
}
/* if the query has dependent data fetch tasks, error out */
workerTask = list_nth(workerTaskList, 0);
workerDependentTaskList = workerTask->dependedTaskList;
if (list_length(workerDependentTaskList) > 0)
{
ereport(ERROR, (errmsg("cannot use router executor with JOINs"),
errhint("Set citusdb.task_executor_type to \"real-time\" or "
"\"task-tracker\".")));
}
/* ORDER BY is always applied on the master table with the current planner */
if (masterQuery != NULL && list_length(masterQuery->sortClause) > 0)
{
ereport(ERROR, (errmsg("cannot use router executor with ORDER BY clauses"),
errhint("Set citusdb.task_executor_type to \"real-time\" or "
"\"task-tracker\".")));
}
/*
* Note that if the worker query has an aggregate, the master query must have
* either an aggregate or a function expression that needs to be executed for
* correct results.
*/
masterQueryHasAggregates = job->jobQuery->hasAggs;
if (masterQueryHasAggregates)
{
ereport(ERROR, (errmsg("cannot use router executor with aggregates"),
errhint("Set citusdb.task_executor_type to \"real-time\" or "
"\"task-tracker\".")));
}
}
return executorType;
}
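/*
* A worked example of the checks above (with assumed configuration values):
* a query with 64 tasks on an 8-worker cluster yields tasksPerNode = 8.0,
* which stays below a max_connections setting of 100 and raises no warning;
* the same cluster running a 1024-task query has tasksPerNode = 128.0 and
* would trigger the real-time executor's connection warning.
*/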
/*
* MaxMasterConnectionCount returns the number of connections a master can open.
* A master cannot create more than a certain number of file descriptors (FDs).
* Every task requires 2 FDs, one file and one connection. Some FDs are taken by
* the VFD pool and there is currently no way to reclaim these before opening a
* connection. We therefore assume some FDs to be reserved for VFDs, based on
* observing a typical size of the pool on a CitusDB master.
*/
int
MaxMasterConnectionCount(void)
{
return Max((max_files_per_process - RESERVED_FD_COUNT) / 2, 1);
}
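/*
* For example, assuming max_files_per_process = 1000 and a RESERVED_FD_COUNT
* of 64 (a hypothetical value for illustration), the master caps itself at
* (1000 - 64) / 2 = 468 concurrent task connections.
*/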
/*
* RemoveJobDirectory gets automatically called at portal drop (end of query) or
* at transaction abort. The function removes the job directory and releases the
* associated job resource from the resource manager.
*/
void
RemoveJobDirectory(uint64 jobId)
{
StringInfo jobDirectoryName = JobDirectoryName(jobId);
RemoveDirectory(jobDirectoryName);
ResourceOwnerForgetJobDirectory(CurrentResourceOwner, jobId);
}
/*
* InitTaskExecution creates a task execution structure for the given task, and
* initializes execution related fields.
*/
TaskExecution *
InitTaskExecution(Task *task, TaskExecStatus initialTaskExecStatus)
{
/* each task placement (assignment) corresponds to one worker node */
uint32 nodeCount = list_length(task->taskPlacementList);
uint32 nodeIndex = 0;
TaskExecution *taskExecution = palloc0(sizeof(TaskExecution));
taskExecution->jobId = task->jobId;
taskExecution->taskId = task->taskId;
taskExecution->nodeCount = nodeCount;
taskExecution->connectPollCount = 0;
taskExecution->currentNodeIndex = 0;
taskExecution->dataFetchTaskIndex = -1;
taskExecution->failureCount = 0;
taskExecution->taskStatusArray = palloc0(nodeCount * sizeof(TaskExecStatus));
taskExecution->transmitStatusArray = palloc0(nodeCount * sizeof(TransmitExecStatus));
taskExecution->connectionIdArray = palloc0(nodeCount * sizeof(int32));
taskExecution->fileDescriptorArray = palloc0(nodeCount * sizeof(int32));
for (nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
{
taskExecution->taskStatusArray[nodeIndex] = initialTaskExecStatus;
taskExecution->transmitStatusArray[nodeIndex] = EXEC_TRANSMIT_UNASSIGNED;
taskExecution->connectionIdArray[nodeIndex] = INVALID_CONNECTION_ID;
taskExecution->fileDescriptorArray[nodeIndex] = -1;
}
return taskExecution;
}
/*
* CleanupTaskExecution iterates over all connections and file descriptors for
* the given task execution. The function first closes all open connections and
* file descriptors, and then frees memory allocated for the task execution.
*/
void
CleanupTaskExecution(TaskExecution *taskExecution)
{
uint32 nodeIndex = 0;
for (nodeIndex = 0; nodeIndex < taskExecution->nodeCount; nodeIndex++)
{
int32 connectionId = taskExecution->connectionIdArray[nodeIndex];
int32 fileDescriptor = taskExecution->fileDescriptorArray[nodeIndex];
/* close open connection */
if (connectionId != INVALID_CONNECTION_ID)
{
MultiClientDisconnect(connectionId);
taskExecution->connectionIdArray[nodeIndex] = INVALID_CONNECTION_ID;
}
/* close open file */
if (fileDescriptor >= 0)
{
int closed = close(fileDescriptor);
taskExecution->fileDescriptorArray[nodeIndex] = -1;
if (closed < 0)
{
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not close copy file: %m")));
}
}
}
/* deallocate memory and reset all fields */
pfree(taskExecution->taskStatusArray);
pfree(taskExecution->transmitStatusArray);
pfree(taskExecution->connectionIdArray);
pfree(taskExecution->fileDescriptorArray);
memset(taskExecution, 0, sizeof(TaskExecution));
}
/* Determines if the given task exceeded its failure threshold. */
bool
TaskExecutionFailed(TaskExecution *taskExecution)
{
if (taskExecution->failureCount >= MAX_TASK_EXECUTION_FAILURES)
{
return true;
}
return false;
}
/*
* AdjustStateForFailure increments the failure count for given task execution.
* The function also determines the next worker node that should be contacted
* for remote execution.
*/
void
AdjustStateForFailure(TaskExecution *taskExecution)
{
int maxNodeIndex = taskExecution->nodeCount - 1;
Assert(maxNodeIndex >= 0);
if (taskExecution->currentNodeIndex < maxNodeIndex)
{
taskExecution->currentNodeIndex++; /* try next worker node */
}
else
{
taskExecution->currentNodeIndex = 0; /* go back to the first worker node */
}
taskExecution->dataFetchTaskIndex = -1; /* reset data fetch counter */
taskExecution->failureCount++; /* record failure */
}
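/*
* For example, with nodeCount = 3 a repeatedly failing task walks its
* placements as 0 -> 1 -> 2 -> 0 -> ..., wrapping around until the failure
* count reaches MAX_TASK_EXECUTION_FAILURES and TaskExecutionFailed() above
* reports the task as failed.
*/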

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,234 @@
/*-------------------------------------------------------------------------
*
* master_create_shards.c
*
* This file contains functions to distribute a table by creating shards for it
* across a set of worker nodes.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include "libpq-fe.h"
#include "miscadmin.h"
#include "port.h"
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/errno.h>
#include "catalog/namespace.h"
#include "catalog/pg_class.h"
#include "distributed/connection_cache.h"
#include "distributed/listutils.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/master_protocol.h"
#include "distributed/multi_join_order.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/resource_lock.h"
#include "distributed/worker_manager.h"
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/lsyscache.h"
#include "utils/palloc.h"
/* local function forward declarations */
static void CheckHashPartitionedTable(Oid distributedTableId);
static text * IntegerToText(int32 value);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(master_create_worker_shards);
/*
* master_create_worker_shards creates empty shards for the given table based
* on the specified number of initial shards. The function first gets a list of
* candidate nodes and issues DDL commands on the nodes to create empty shard
* placements on those nodes. The function then updates metadata on the master
* node to make these shards (and their placements) visible. Note that the function
* assumes the table is hash partitioned and calculates the min/max hash token
* ranges for each shard, giving them an equal split of the hash space.
*/
Datum
master_create_worker_shards(PG_FUNCTION_ARGS)
{
text *tableNameText = PG_GETARG_TEXT_P(0);
int32 shardCount = PG_GETARG_INT32(1);
int32 replicationFactor = PG_GETARG_INT32(2);
Oid distributedTableId = ResolveRelationId(tableNameText);
char relationKind = get_rel_relkind(distributedTableId);
char *tableName = text_to_cstring(tableNameText);
char shardStorageType = '\0';
List *workerNodeList = NIL;
List *ddlCommandList = NIL;
int32 workerNodeCount = 0;
uint32 placementAttemptCount = 0;
uint64 hashTokenIncrement = 0;
List *existingShardList = NIL;
int64 shardIndex = 0;
/* make sure table is hash partitioned */
CheckHashPartitionedTable(distributedTableId);
/* we plan to add shards: get an exclusive metadata lock */
LockRelationDistributionMetadata(distributedTableId, ExclusiveLock);
/* validate that shards haven't already been created for this table */
existingShardList = LoadShardList(distributedTableId);
if (existingShardList != NIL)
{
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("table \"%s\" has already had shards created for it",
tableName)));
}
/* make sure that at least one shard is specified */
if (shardCount <= 0)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("shard_count must be positive")));
}
/* make sure that at least one replica is specified */
if (replicationFactor <= 0)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("replication_factor must be positive")));
}
/* calculate the split of the hash space */
hashTokenIncrement = HASH_TOKEN_COUNT / shardCount;
/* load and sort the worker node list for deterministic placement */
workerNodeList = WorkerNodeList();
workerNodeList = SortList(workerNodeList, CompareWorkerNodes);
/* make sure we don't process cancel signals until all shards are created */
HOLD_INTERRUPTS();
/* retrieve the DDL commands for the table */
ddlCommandList = GetTableDDLEvents(distributedTableId);
workerNodeCount = list_length(workerNodeList);
if (replicationFactor > workerNodeCount)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("replication_factor (%d) exceeds number of worker nodes "
"(%d)", replicationFactor, workerNodeCount),
errhint("Add more worker nodes or try again with a lower "
"replication factor.")));
}
/* if we have enough nodes, add an extra placement attempt for backup */
placementAttemptCount = (uint32) replicationFactor;
if (workerNodeCount > replicationFactor)
{
placementAttemptCount++;
}
/* set shard storage type according to relation type */
if (relationKind == RELKIND_FOREIGN_TABLE)
{
shardStorageType = SHARD_STORAGE_FOREIGN;
}
else
{
shardStorageType = SHARD_STORAGE_TABLE;
}
for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
{
uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;
/* initialize the hash token space for this shard */
text *minHashTokenText = NULL;
text *maxHashTokenText = NULL;
int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement);
int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);
Datum shardIdDatum = master_get_new_shardid(NULL);
int64 shardId = DatumGetInt64(shardIdDatum);
/* if we are at the last shard, make sure the max token value is INT32_MAX */
if (shardIndex == (shardCount - 1))
{
shardMaxHashToken = INT32_MAX;
}
/* insert the shard metadata row along with its min/max values */
minHashTokenText = IntegerToText(shardMinHashToken);
maxHashTokenText = IntegerToText(shardMaxHashToken);
/*
* Grabbing the shard metadata lock isn't technically necessary since
* we already hold an exclusive lock on the partition table, but we'll
* acquire it for the sake of completeness. As we're adding new active
* placements, the mode must be exclusive.
*/
LockShardDistributionMetadata(shardId, ExclusiveLock);
CreateShardPlacements(shardId, ddlCommandList, workerNodeList,
roundRobinNodeIndex, replicationFactor);
InsertShardRow(distributedTableId, shardId, shardStorageType,
minHashTokenText, maxHashTokenText);
}
if (QueryCancelPending)
{
ereport(WARNING, (errmsg("cancel requests are ignored during shard creation")));
QueryCancelPending = false;
}
RESUME_INTERRUPTS();
PG_RETURN_VOID();
}
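/*
* A worked example (assuming HASH_TOKEN_COUNT covers the full 2^32 range of
* int32 hash values): with shard_count = 4, hashTokenIncrement is
* 4294967296 / 4 = 1073741824, and the shards receive the hash token ranges
* [-2147483648, -1073741825], [-1073741824, -1], [0, 1073741823], and
* [1073741824, 2147483647]. A typical invocation, with illustrative argument
* values, might look like:
*
* SELECT master_create_worker_shards('github_events', 4, 2);
*/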
/*
* CheckHashPartitionedTable looks up the partition information for the given
* tableId and checks if the table is hash partitioned. If not, the function
* throws an error.
*/
static void
CheckHashPartitionedTable(Oid distributedTableId)
{
char partitionType = PartitionMethod(distributedTableId);
if (partitionType != DISTRIBUTE_BY_HASH)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("unsupported table partition type: %c", partitionType)));
}
}
/* Helper function to convert an integer value to a text type */
static text *
IntegerToText(int32 value)
{
text *valueText = NULL;
StringInfo valueString = makeStringInfo();
appendStringInfo(valueString, "%d", value);
valueText = cstring_to_text(valueString->data);
return valueText;
}

View File

@ -0,0 +1,446 @@
/*-------------------------------------------------------------------------
*
* master_delete_protocol.c
*
* Routine for deleting shards in the distributed cluster. This function takes
* in a delete command and deletes a shard if and only if all rows in the shard
* satisfy the conditions in the delete command.
*
* Copyright (c) 2014, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "catalog/pg_class.h"
#include "commands/dbcommands.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_client_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/worker_protocol.h"
#include "optimizer/clauses.h"
#include "optimizer/predtest.h"
#include "optimizer/restrictinfo.h"
#include "optimizer/var.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
/* Local functions forward declarations */
static void CheckTableCount(Query *deleteQuery);
static void CheckDeleteCriteria(Node *deleteCriteria);
static void CheckPartitionColumn(Oid relationId, Node *whereClause);
static List * ShardsMatchingDeleteCriteria(Oid relationId, List *shardList,
Node *deleteCriteria);
static bool ExecuteRemoteCommand(const char *nodeName, uint32 nodePort,
StringInfo queryString);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_apply_delete_command);
/*
* master_apply_delete_command takes in a delete command, finds shards that
* match the criteria defined in the delete command, drops the found shards from
* the worker nodes, and updates the corresponding metadata on the master node.
* This function drops a shard if and only if all rows in the shard satisfy
* the conditions in the delete command. Note that this function only accepts
* conditions on the partition key; if no condition is provided, all shards
* are deleted.
*
* We mark shard placements that we couldn't drop as to be deleted later. If a
* shard satisfies the given conditions, we delete it from the shard metadata
* table even though related shard placements are not deleted.
*/
Datum
master_apply_delete_command(PG_FUNCTION_ARGS)
{
text *queryText = PG_GETARG_TEXT_P(0);
char *queryString = text_to_cstring(queryText);
char *relationName = NULL;
text *relationNameText = NULL;
Oid relationId = InvalidOid;
List *shardIntervalList = NIL;
ListCell *shardIntervalCell = NULL;
List *deletableShardIntervalList = NIL;
List *queryTreeList = NIL;
Query *deleteQuery = NULL;
Node *whereClause = NULL;
Node *deleteCriteria = NULL;
Node *queryTreeNode = NULL;
DeleteStmt *deleteStatement = NULL;
int32 deleteCriteriaShardCount = 0;
LOCKTAG lockTag;
bool sessionLock = false;
bool dontWait = false;
char partitionMethod = 0;
queryTreeNode = ParseTreeNode(queryString);
if (!IsA(queryTreeNode, DeleteStmt))
{
ereport(ERROR, (errmsg("query \"%s\" is not a delete statement",
queryString)));
}
deleteStatement = (DeleteStmt *) queryTreeNode;
relationName = deleteStatement->relation->relname;
relationNameText = cstring_to_text(relationName);
relationId = ResolveRelationId(relationNameText);
CheckDistributedTable(relationId);
queryTreeList = pg_analyze_and_rewrite(queryTreeNode, queryString, NULL, 0);
deleteQuery = (Query *) linitial(queryTreeList);
CheckTableCount(deleteQuery);
/* get where clause and flatten it */
whereClause = (Node *) deleteQuery->jointree->quals;
deleteCriteria = eval_const_expressions(NULL, whereClause);
partitionMethod = PartitionMethod(relationId);
if ((partitionMethod == DISTRIBUTE_BY_HASH) && (deleteCriteria != NULL))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot delete from distributed table"),
errdetail("Delete statements on hash-partitioned tables "
"with where clause is not supported")));
}
CheckDeleteCriteria(deleteCriteria);
CheckPartitionColumn(relationId, deleteCriteria);
/* acquire lock */
SET_LOCKTAG_ADVISORY(lockTag, MyDatabaseId, relationId, 0, 0);
LockAcquire(&lockTag, ExclusiveLock, sessionLock, dontWait);
shardIntervalList = LoadShardIntervalList(relationId);
/* drop all shards if where clause is not present */
if (deleteCriteria == NULL)
{
deletableShardIntervalList = shardIntervalList;
ereport(DEBUG2, (errmsg("dropping all shards for \"%s\"", relationName)));
}
else
{
deletableShardIntervalList = ShardsMatchingDeleteCriteria(relationId,
shardIntervalList,
deleteCriteria);
}
foreach(shardIntervalCell, deletableShardIntervalList)
{
List *shardPlacementList = NIL;
List *droppedPlacementList = NIL;
List *lingeringPlacementList = NIL;
ListCell *shardPlacementCell = NULL;
ListCell *droppedPlacementCell = NULL;
ListCell *lingeringPlacementCell = NULL;
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
uint64 shardId = shardInterval->shardId;
char *quotedShardName = NULL;
/* if shard doesn't have an alias, extend regular table name */
char *shardName = LoadShardAlias(relationId, shardId);
if (shardName == NULL)
{
shardName = get_rel_name(relationId);
AppendShardIdToName(&shardName, shardId);
}
quotedShardName = quote_qualified_identifier(NULL, shardName);
shardPlacementList = ShardPlacementList(shardId);
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell);
char *workerName = shardPlacement->nodeName;
uint32 workerPort = shardPlacement->nodePort;
bool dropSuccessful = false;
StringInfo workerDropQuery = makeStringInfo();
char tableType = get_rel_relkind(relationId);
if (tableType == RELKIND_RELATION)
{
appendStringInfo(workerDropQuery, DROP_REGULAR_TABLE_COMMAND, quotedShardName);
}
else if (tableType == RELKIND_FOREIGN_TABLE)
{
appendStringInfo(workerDropQuery, DROP_FOREIGN_TABLE_COMMAND, quotedShardName);
}
dropSuccessful = ExecuteRemoteCommand(workerName, workerPort, workerDropQuery);
if (dropSuccessful)
{
droppedPlacementList = lappend(droppedPlacementList, shardPlacement);
}
else
{
lingeringPlacementList = lappend(lingeringPlacementList, shardPlacement);
}
}
/* make sure we don't process cancel signals */
HOLD_INTERRUPTS();
foreach(droppedPlacementCell, droppedPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(droppedPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
DeleteShardPlacementRow(shardId, workerName, workerPort);
}
/* mark shard placements that we couldn't drop as to be deleted */
foreach(lingeringPlacementCell, lingeringPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(lingeringPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
uint64 oldShardLength = placement->shardLength;
DeleteShardPlacementRow(shardId, workerName, workerPort);
InsertShardPlacementRow(shardId, FILE_TO_DELETE, oldShardLength,
workerName, workerPort);
ereport(WARNING, (errmsg("could not delete shard \"%s\" on node "
"\"%s:%u\"", shardName, workerName, workerPort),
errdetail("Marking this shard placement for deletion")));
}
DeleteShardRow(shardId);
if (QueryCancelPending)
{
ereport(WARNING, (errmsg("cancel requests are ignored during shard deletion")));
QueryCancelPending = false;
}
RESUME_INTERRUPTS();
}
deleteCriteriaShardCount = list_length(deletableShardIntervalList);
PG_RETURN_INT32(deleteCriteriaShardCount);
}
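/*
* A typical invocation (with an illustrative table and column name) might be:
*
* SELECT master_apply_delete_command(
* 'DELETE FROM github_events WHERE created_at < ''2014-01-01''');
*
* The call drops every shard whose rows all fall under the criteria and
* returns the number of shards dropped.
*/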
/* Checks that delete is only on one table. */
static void
CheckTableCount(Query *deleteQuery)
{
int rangeTableCount = list_length(deleteQuery->rtable);
if (rangeTableCount > 1)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot delete from distributed table"),
errdetail("Delete on multiple tables is not supported")));
}
}
/* Checks that delete criteria only consists of simple operator expressions. */
static void
CheckDeleteCriteria(Node *deleteCriteria)
{
bool simpleOpExpression = true;
if (deleteCriteria == NULL)
{
return;
}
if (is_opclause(deleteCriteria))
{
simpleOpExpression = SimpleOpExpression((Expr *) deleteCriteria);
}
else if (IsA(deleteCriteria, BoolExpr))
{
ListCell *opExpressionCell = NULL;
BoolExpr *deleteCriteriaExpression = (BoolExpr *) deleteCriteria;
List *opExpressionList = deleteCriteriaExpression->args;
foreach(opExpressionCell, opExpressionList)
{
Expr *opExpression = (Expr *) lfirst(opExpressionCell);
if (!SimpleOpExpression(opExpression))
{
simpleOpExpression = false;
break;
}
}
}
else
{
simpleOpExpression = false;
}
if (!simpleOpExpression)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot delete from distributed table"),
errdetail("Delete query has a complex operator expression")));
}
}
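/*
* For example (illustrative column names; SimpleOpExpression is defined
* elsewhere): criteria such as "key > 10 AND key <= 20" pass this check
* because both arguments of the AND are simple operator expressions, whereas
* criteria such as "key % 2 = 0" or "abs(key) < 10" would be rejected as
* complex operator expressions.
*/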
/*
* CheckPartitionColumn checks that the given where clause is based only on the
* partition key of the given relation id.
*/
static void
CheckPartitionColumn(Oid relationId, Node *whereClause)
{
Var *partitionColumn = PartitionKey(relationId);
ListCell *columnCell = NULL;
List *columnList = pull_var_clause_default(whereClause);
foreach(columnCell, columnList)
{
Var *var = (Var *) lfirst(columnCell);
if (var->varattno != partitionColumn->varattno)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot delete from distributed table"),
errdetail("Where clause includes a column other than "
"partition column")));
}
}
}
/*
* ShardsMatchingDeleteCriteria selects shards to be deleted from the shard
* interval list based on the delete criteria, and returns selected shards in
* another list. We add a shard to the list if and only if all rows in the shard
* satisfy the delete criteria. Note that this function does not expect
* deleteCriteria to be NULL.
*/
static List *
ShardsMatchingDeleteCriteria(Oid relationId, List *shardIntervalList,
Node *deleteCriteria)
{
List *dropShardIntervalList = NIL;
List *deleteCriteriaList = NIL;
ListCell *shardIntervalCell = NULL;
/* build the base expression for constraint */
Index rangeTableIndex = 1;
Var *partitionColumn = PartitionColumn(relationId, rangeTableIndex);
Node *baseConstraint = BuildBaseConstraint(partitionColumn);
Assert(deleteCriteria != NULL);
deleteCriteriaList = list_make1(deleteCriteria);
/* walk over shard list and check if shards can be dropped */
foreach(shardIntervalCell, shardIntervalList)
{
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
if (shardInterval->minValueExists && shardInterval->maxValueExists)
{
List *restrictInfoList = NIL;
bool dropShard = false;
BoolExpr *andExpr = NULL;
Expr *lessThanExpr = NULL;
Expr *greaterThanExpr = NULL;
RestrictInfo *lessThanRestrictInfo = NULL;
RestrictInfo *greaterThanRestrictInfo = NULL;
/* set the min/max values in the base constraint */
UpdateConstraint(baseConstraint, shardInterval);
andExpr = (BoolExpr *) baseConstraint;
lessThanExpr = (Expr *) linitial(andExpr->args);
greaterThanExpr = (Expr *) lsecond(andExpr->args);
lessThanRestrictInfo = make_simple_restrictinfo(lessThanExpr);
greaterThanRestrictInfo = make_simple_restrictinfo(greaterThanExpr);
restrictInfoList = lappend(restrictInfoList, lessThanRestrictInfo);
restrictInfoList = lappend(restrictInfoList, greaterThanRestrictInfo);
dropShard = predicate_implied_by(deleteCriteriaList, restrictInfoList);
if (dropShard)
{
dropShardIntervalList = lappend(dropShardIntervalList, shardInterval);
ereport(DEBUG2, (errmsg("delete criteria includes shardId "
UINT64_FORMAT, shardInterval->shardId)));
}
}
}
return dropShardIntervalList;
}
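/*
* For example, a shard whose interval is [0, 100] yields the base constraint
* "partitionColumn >= 0 AND partitionColumn <= 100". A delete criteria of
* "partitionColumn <= 200" is implied by that constraint, so the shard is
* dropped; a criteria of "partitionColumn <= 50" is not implied (rows in
* (50, 100] may violate it), so the shard is kept.
*/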
/*
* ExecuteRemoteCommand executes the given SQL command. This command could be an
* Insert, Update, or Delete statement, or a utility command that returns
* nothing. If the query is successfully executed, the function returns true.
* Otherwise, it returns false.
*/
static bool
ExecuteRemoteCommand(const char *nodeName, uint32 nodePort, StringInfo queryString)
{
char *nodeDatabase = get_database_name(MyDatabaseId);
int32 connectionId = -1;
QueryStatus queryStatus = CLIENT_INVALID_QUERY;
bool querySent = false;
bool queryReady = false;
bool queryDone = false;
connectionId = MultiClientConnect(nodeName, nodePort, nodeDatabase);
if (connectionId == INVALID_CONNECTION_ID)
{
return false;
}
querySent = MultiClientSendQuery(connectionId, queryString->data);
if (!querySent)
{
MultiClientDisconnect(connectionId);
return false;
}
while (!queryReady)
{
ResultStatus resultStatus = MultiClientResultStatus(connectionId);
if (resultStatus == CLIENT_RESULT_READY)
{
queryReady = true;
}
else if (resultStatus == CLIENT_RESULT_BUSY)
{
long sleepIntervalPerCycle = RemoteTaskCheckInterval * 1000L;
pg_usleep(sleepIntervalPerCycle);
}
else
{
MultiClientDisconnect(connectionId);
return false;
}
}
queryStatus = MultiClientQueryStatus(connectionId);
if (queryStatus == CLIENT_QUERY_DONE)
{
queryDone = true;
}
MultiClientDisconnect(connectionId);
return queryDone;
}
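/*
* With the default RemoteTaskCheckInterval of 100 milliseconds, the polling
* loop above sleeps pg_usleep(100 * 1000L) = 100,000 microseconds between
* consecutive status checks.
*/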

View File

@ -0,0 +1,587 @@
/*-------------------------------------------------------------------------
*
* master_metadata_utility.c
* Routines for reading and modifying master node's metadata.
*
* Copyright (c) 2014, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "access/htup_details.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "catalog/pg_type.h"
#include "distributed/citus_nodes.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/pg_dist_shard_placement.h"
#include "distributed/worker_manager.h"
#include "nodes/makefuncs.h"
#include "parser/scansup.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
/* Local functions forward declarations */
static uint64 * AllocateUint64(uint64 value);
/*
* LoadShardIntervalList returns a list of shard intervals for a given
* distributed table. The function returns an empty list if no shards can be
* found for the given relation.
*/
List *
LoadShardIntervalList(Oid relationId)
{
DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(relationId);
List *shardList = NIL;
int i = 0;
for (i = 0; i < cacheEntry->shardIntervalArrayLength; i++)
{
ShardInterval *newShardInterval = NULL;
newShardInterval = (ShardInterval *) palloc0(sizeof(ShardInterval));
CopyShardInterval(&cacheEntry->shardIntervalArray[i], newShardInterval);
shardList = lappend(shardList, newShardInterval);
}
return shardList;
}
/*
* LoadShardList reads the list of shards for the given relationId from
* pg_dist_shard, and returns the list of found shardIds.
*/
List *
LoadShardList(Oid relationId)
{
DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(relationId);
List *shardList = NIL;
int i = 0;
for (i = 0; i < cacheEntry->shardIntervalArrayLength; i++)
{
ShardInterval *currentShardInterval = &cacheEntry->shardIntervalArray[i];
uint64 *shardIdPointer = AllocateUint64(currentShardInterval->shardId);
shardList = lappend(shardList, shardIdPointer);
}
return shardList;
}
/* Allocates eight bytes, and copies the given value's contents into those bytes. */
static uint64 *
AllocateUint64(uint64 value)
{
uint64 *allocatedValue = (uint64 *) palloc0(sizeof(uint64));
Assert(sizeof(uint64) >= 8);
(*allocatedValue) = value;
return allocatedValue;
}
/*
* LoadShardAlias finds the row for the given relation and shardId in pg_dist_shard,
* finds the shard alias in this row if any, and then deep copies this alias.
*/
char *
LoadShardAlias(Oid relationId, uint64 shardId)
{
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
Datum shardAliasDatum = 0;
bool shardAliasNull = false;
char *shardAlias = NULL;
Relation pgDistShard = heap_open(DistShardRelationId(), AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(pgDistShard);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgDistShard,
DistShardShardidIndexId(), true,
NULL, scanKeyCount, scanKey);
/*
* Normally, we should have at most one tuple here as we have a unique index
* on shardId. However, if users want to drop this uniqueness constraint,
* and look up the shardalias based on the relation and shardId pair, we
* still allow that. We don't have any users relying on this feature; thus,
* we may consider removing this check.
*/
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_dist_shard pgDistShardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
if (pgDistShardForm->logicalrelid == relationId)
{
break;
}
heapTuple = systable_getnext(scanDescriptor);
}
/* if no tuple found, error out */
if (!HeapTupleIsValid(heapTuple))
{
ereport(ERROR, (errmsg("could not find valid entry for relationId: %u "
"and shard " UINT64_FORMAT, relationId, shardId)));
}
/* if shard alias exists, deep copy cstring */
shardAliasDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardalias,
tupleDescriptor, &shardAliasNull);
if (!shardAliasNull)
{
shardAlias = TextDatumGetCString(shardAliasDatum);
}
systable_endscan(scanDescriptor);
heap_close(pgDistShard, AccessShareLock);
return shardAlias;
}
/*
* CopyShardInterval copies fields from the specified source ShardInterval
* into the fields of the provided destination ShardInterval.
*/
void
CopyShardInterval(ShardInterval *srcInterval, ShardInterval *destInterval)
{
destInterval->type = srcInterval->type;
destInterval->relationId = srcInterval->relationId;
destInterval->storageType = srcInterval->storageType;
destInterval->valueTypeId = srcInterval->valueTypeId;
destInterval->valueTypeLen = srcInterval->valueTypeLen;
destInterval->valueByVal = srcInterval->valueByVal;
destInterval->minValueExists = srcInterval->minValueExists;
destInterval->maxValueExists = srcInterval->maxValueExists;
destInterval->shardId = srcInterval->shardId;
destInterval->minValue = 0;
if (destInterval->minValueExists)
{
destInterval->minValue = datumCopy(srcInterval->minValue,
srcInterval->valueByVal,
srcInterval->valueTypeLen);
}
destInterval->maxValue = 0;
if (destInterval->maxValueExists)
{
destInterval->maxValue = datumCopy(srcInterval->maxValue,
srcInterval->valueByVal,
srcInterval->valueTypeLen);
}
}
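/*
* Note on datumCopy semantics: for pass-by-value types (e.g. int4) it simply
* returns the datum itself, while for pass-by-reference types (e.g. text) it
* palloc's a copy of the referenced data. The deep copy lets the returned
* interval outlive the cache entry that owned the source datums.
*/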
/*
* ShardLength finds shard placements for the given shardId, extracts the length
* of a finalized shard, and returns the shard's length. This function errors
* out if we cannot find any finalized shard placements for the given shardId.
*/
uint64
ShardLength(uint64 shardId)
{
uint64 shardLength = 0;
List *shardPlacementList = FinalizedShardPlacementList(shardId);
if (shardPlacementList == NIL)
{
ereport(ERROR, (errmsg("could not find length of shard " UINT64_FORMAT, shardId),
errdetail("Could not find any shard placements for the shard.")));
}
else
{
ShardPlacement *shardPlacement = (ShardPlacement *) linitial(shardPlacementList);
shardLength = shardPlacement->shardLength;
}
return shardLength;
}
/*
* FinalizedShardPlacementList finds shard placements for the given shardId from
* system catalogs, chooses placements that are in finalized state, and returns
* these shard placements in a new list.
*/
List *
FinalizedShardPlacementList(uint64 shardId)
{
List *finalizedPlacementList = NIL;
List *shardPlacementList = ShardPlacementList(shardId);
ListCell *shardPlacementCell = NULL;
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell);
if (shardPlacement->shardState == FILE_FINALIZED)
{
finalizedPlacementList = lappend(finalizedPlacementList, shardPlacement);
}
}
return finalizedPlacementList;
}
/*
* ShardPlacementList finds shard placements for the given shardId from system
* catalogs, converts these placements to their in-memory representation, and
* returns the converted shard placements in a new list.
*/
List *
ShardPlacementList(uint64 shardId)
{
List *shardPlacementList = NIL;
Relation pgShardPlacement = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
bool indexOK = true;
HeapTuple heapTuple = NULL;
pgShardPlacement = heap_open(DistShardPlacementRelationId(), AccessShareLock);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_placement_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgShardPlacement,
DistShardPlacementShardidIndexId(), indexOK,
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
TupleDesc tupleDescriptor = RelationGetDescr(pgShardPlacement);
ShardPlacement *placement = TupleToShardPlacement(tupleDescriptor, heapTuple);
shardPlacementList = lappend(shardPlacementList, placement);
heapTuple = systable_getnext(scanDescriptor);
}
systable_endscan(scanDescriptor);
heap_close(pgShardPlacement, AccessShareLock);
/* if no shard placements are found, warn the user */
if (shardPlacementList == NIL)
{
ereport(WARNING, (errmsg("could not find any shard placements for shardId "
UINT64_FORMAT, shardId)));
}
return shardPlacementList;
}
/*
* TupleToShardPlacement takes in a heap tuple from pg_dist_shard_placement, and
* converts this tuple to an equivalent struct in memory. The function assumes
* the caller already has locks on the tuple, and doesn't perform any locking.
*/
ShardPlacement *
TupleToShardPlacement(TupleDesc tupleDescriptor, HeapTuple heapTuple)
{
ShardPlacement *shardPlacement = NULL;
bool isNull = false;
Oid tupleOid = HeapTupleGetOid(heapTuple);
Datum shardId = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_shardid,
tupleDescriptor, &isNull);
Datum shardLength = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_shardlength,
tupleDescriptor, &isNull);
Datum shardState = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_shardstate,
tupleDescriptor, &isNull);
Datum nodeName = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_nodename,
tupleDescriptor, &isNull);
Datum nodePort = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_nodeport,
tupleDescriptor, &isNull);
Assert(!HeapTupleHasNulls(heapTuple));
shardPlacement = CitusMakeNode(ShardPlacement);
shardPlacement->tupleOid = tupleOid;
shardPlacement->shardId = DatumGetInt64(shardId);
shardPlacement->shardLength = DatumGetInt64(shardLength);
shardPlacement->shardState = DatumGetUInt32(shardState);
shardPlacement->nodeName = TextDatumGetCString(nodeName);
shardPlacement->nodePort = DatumGetUInt32(nodePort);
return shardPlacement;
}
/*
* InsertShardRow opens the shard system catalog, and inserts a new row with the
* given values into that system catalog. Note that we allow the user to pass in
* null min/max values in case they are creating an empty shard.
*/
void
InsertShardRow(Oid relationId, uint64 shardId, char storageType,
text *shardMinValue, text *shardMaxValue)
{
Relation pgDistShard = NULL;
TupleDesc tupleDescriptor = NULL;
HeapTuple heapTuple = NULL;
Datum values[Natts_pg_dist_shard];
bool isNulls[Natts_pg_dist_shard];
/* form new shard tuple */
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
values[Anum_pg_dist_shard_logicalrelid - 1] = ObjectIdGetDatum(relationId);
values[Anum_pg_dist_shard_shardid - 1] = Int64GetDatum(shardId);
values[Anum_pg_dist_shard_shardstorage - 1] = CharGetDatum(storageType);
/* check if shard min/max values are null */
if (shardMinValue != NULL && shardMaxValue != NULL)
{
values[Anum_pg_dist_shard_shardminvalue - 1] = PointerGetDatum(shardMinValue);
values[Anum_pg_dist_shard_shardmaxvalue - 1] = PointerGetDatum(shardMaxValue);
/* we always set shard alias to null */
isNulls[Anum_pg_dist_shard_shardalias - 1] = true;
}
else
{
isNulls[Anum_pg_dist_shard_shardminvalue - 1] = true;
isNulls[Anum_pg_dist_shard_shardmaxvalue - 1] = true;
isNulls[Anum_pg_dist_shard_shardalias - 1] = true;
}
/* open shard relation and insert new tuple */
pgDistShard = heap_open(DistShardRelationId(), RowExclusiveLock);
tupleDescriptor = RelationGetDescr(pgDistShard);
heapTuple = heap_form_tuple(tupleDescriptor, values, isNulls);
simple_heap_insert(pgDistShard, heapTuple);
CatalogUpdateIndexes(pgDistShard, heapTuple);
CommandCounterIncrement();
/* close relation and invalidate previous cache entry */
heap_close(pgDistShard, RowExclusiveLock);
CacheInvalidateRelcacheByRelid(relationId);
}
/*
* InsertShardPlacementRow opens the shard placement system catalog, and inserts
* a new row with the given values into that system catalog.
*/
void
InsertShardPlacementRow(uint64 shardId, char shardState, uint64 shardLength,
char *nodeName, uint32 nodePort)
{
Relation pgDistShardPlacement = NULL;
TupleDesc tupleDescriptor = NULL;
HeapTuple heapTuple = NULL;
Datum values[Natts_pg_dist_shard_placement];
bool isNulls[Natts_pg_dist_shard_placement];
/* form new shard placement tuple */
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
values[Anum_pg_dist_shard_placement_shardid - 1] = Int64GetDatum(shardId);
values[Anum_pg_dist_shard_placement_shardstate - 1] = CharGetDatum(shardState);
values[Anum_pg_dist_shard_placement_shardlength - 1] = Int64GetDatum(shardLength);
values[Anum_pg_dist_shard_placement_nodename - 1] = CStringGetTextDatum(nodeName);
values[Anum_pg_dist_shard_placement_nodeport - 1] = UInt32GetDatum(nodePort);
/* open shard placement relation and insert new tuple */
pgDistShardPlacement = heap_open(DistShardPlacementRelationId(), RowExclusiveLock);
tupleDescriptor = RelationGetDescr(pgDistShardPlacement);
heapTuple = heap_form_tuple(tupleDescriptor, values, isNulls);
simple_heap_insert(pgDistShardPlacement, heapTuple);
CatalogUpdateIndexes(pgDistShardPlacement, heapTuple);
CommandCounterIncrement();
/* close relation */
heap_close(pgDistShardPlacement, RowExclusiveLock);
}
/*
* DeleteShardRow opens the shard system catalog, finds the unique row that has
* the given shardId, and deletes this row.
*/
void
DeleteShardRow(uint64 shardId)
{
Relation pgDistShard = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
bool indexOK = true;
HeapTuple heapTuple = NULL;
Form_pg_dist_shard pgDistShardForm = NULL;
Oid distributedRelationId = InvalidOid;
pgDistShard = heap_open(DistShardRelationId(), RowExclusiveLock);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgDistShard,
DistShardShardidIndexId(), indexOK,
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
if (!HeapTupleIsValid(heapTuple))
{
ereport(ERROR, (errmsg("could not find valid entry for shard "
UINT64_FORMAT, shardId)));
}
pgDistShardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
distributedRelationId = pgDistShardForm->logicalrelid;
simple_heap_delete(pgDistShard, &heapTuple->t_self);
CommandCounterIncrement();
systable_endscan(scanDescriptor);
heap_close(pgDistShard, RowExclusiveLock);
/* invalidate previous cache entry */
CacheInvalidateRelcacheByRelid(distributedRelationId);
}
/*
* DeleteShardPlacementRow opens the shard placement system catalog, finds the
* first (unique) row that corresponds to the given shardId and worker node, and
* deletes this row.
*/
void
DeleteShardPlacementRow(uint64 shardId, char *workerName, uint32 workerPort)
{
Relation pgDistShardPlacement = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
bool indexOK = true;
HeapTuple heapTuple = NULL;
bool heapTupleFound = false;
pgDistShardPlacement = heap_open(DistShardPlacementRelationId(), RowExclusiveLock);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_placement_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgDistShardPlacement,
DistShardPlacementShardidIndexId(), indexOK,
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
TupleDesc tupleDescriptor = RelationGetDescr(pgDistShardPlacement);
ShardPlacement *placement = TupleToShardPlacement(tupleDescriptor, heapTuple);
if (strncmp(placement->nodeName, workerName, WORKER_LENGTH) == 0 &&
placement->nodePort == workerPort)
{
heapTupleFound = true;
break;
}
heapTuple = systable_getnext(scanDescriptor);
}
/* if we couldn't find the shard placement to delete, error out */
if (!heapTupleFound)
{
ereport(ERROR, (errmsg("could not find valid entry for shard placement "
UINT64_FORMAT " on node \"%s:%u\"",
shardId, workerName, workerPort)));
}
simple_heap_delete(pgDistShardPlacement, &heapTuple->t_self);
CommandCounterIncrement();
systable_endscan(scanDescriptor);
heap_close(pgDistShardPlacement, RowExclusiveLock);
}
/*
* BuildDistributionKeyFromColumnName builds a simple distribution key consisting
* only of a reference to the column named columnName. Errors out if the
* specified column does not exist or is not suitable to be used as a
* distribution column.
*/
Node *
BuildDistributionKeyFromColumnName(Relation distributedRelation, char *columnName)
{
HeapTuple columnTuple = NULL;
Form_pg_attribute columnForm = NULL;
Var *column = NULL;
char *tableName = RelationGetRelationName(distributedRelation);
/* it'd probably be better to downcase identifiers consistently with SQL case folding */
truncate_identifier(columnName, strlen(columnName), true);
/* lookup column definition */
columnTuple = SearchSysCacheAttName(RelationGetRelid(distributedRelation),
columnName);
if (!HeapTupleIsValid(columnTuple))
{
ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("column \"%s\" of relation \"%s\" does not exist",
columnName, tableName)));
}
columnForm = (Form_pg_attribute) GETSTRUCT(columnTuple);
/* check if the column may be referenced in the distribution key */
if (columnForm->attnum <= 0)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot reference system column \"%s\" in relation \"%s\"",
columnName, tableName)));
}
/* build Var referencing only the chosen distribution column */
column = makeVar(1, columnForm->attnum, columnForm->atttypid,
columnForm->atttypmod, columnForm->attcollation, 0);
ReleaseSysCache(columnTuple);
return (Node *) column;
}
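/*
* For example (illustrative values): if "repo_id" is the third user column of
* the relation and has type int4, the returned Var has varno = 1,
* varattno = 3, vartype = INT4OID, and varlevelsup = 0.
*/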

View File

@ -0,0 +1,756 @@
/*-------------------------------------------------------------------------
*
* master_node_protocol.c
* Routines for requesting information from the master node for creating or
* updating shards.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "access/htup_details.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_index.h"
#include "catalog/pg_type.h"
#include "commands/sequence.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/listutils.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/worker_manager.h"
#include "foreign/foreign.h"
#include "libpq/ip.h"
#include "libpq/libpq-be.h"
#include "nodes/pg_list.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#if (PG_VERSION_NUM >= 90500)
#include "utils/ruleutils.h"
#endif
#include "utils/syscache.h"
#include "utils/tqual.h"
/* Shard related configuration */
int ShardReplicationFactor = 2; /* desired replication factor for shards */
int ShardMaxSize = 1048576; /* maximum size in KB one shard can grow to */
int ShardPlacementPolicy = SHARD_PLACEMENT_ROUND_ROBIN;
static char * hostname_client_addr(void);
static Datum WorkerNodeGetDatum(WorkerNode *workerNode, TupleDesc tupleDescriptor);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_get_table_metadata);
PG_FUNCTION_INFO_V1(master_get_table_ddl_events);
PG_FUNCTION_INFO_V1(master_get_new_shardid);
PG_FUNCTION_INFO_V1(master_get_local_first_candidate_nodes);
PG_FUNCTION_INFO_V1(master_get_round_robin_candidate_nodes);
PG_FUNCTION_INFO_V1(master_get_active_worker_nodes);
/*
* master_get_table_metadata takes in a relation name, and returns partition
* related metadata for the relation. This metadata is grouped and returned in
* a tuple, and is used by the caller when creating new shards. The function
* errors out if the given relation does not exist, or is not partitioned.
*/
Datum
master_get_table_metadata(PG_FUNCTION_ARGS)
{
text *relationName = PG_GETARG_TEXT_P(0);
Oid relationId = ResolveRelationId(relationName);
DistTableCacheEntry *partitionEntry = NULL;
TypeFuncClass resultTypeClass = 0;
Datum partitionKeyExpr = 0;
Datum partitionKey = 0;
Datum metadataDatum = 0;
HeapTuple metadataTuple = NULL;
TupleDesc metadataDescriptor = NULL;
uint64 shardMaxSizeInBytes = 0;
char relationType = 0;
char storageType = 0;
Datum values[TABLE_METADATA_FIELDS];
bool isNulls[TABLE_METADATA_FIELDS];
/* find partition tuple for partitioned relation */
partitionEntry = DistributedTableCacheEntry(relationId);
/* create tuple descriptor for return value */
resultTypeClass = get_call_result_type(fcinfo, NULL, &metadataDescriptor);
if (resultTypeClass != TYPEFUNC_COMPOSITE)
{
ereport(ERROR, (errmsg("return type must be a row type")));
}
/* get decompiled expression tree for partition key */
partitionKeyExpr =
PointerGetDatum(cstring_to_text(partitionEntry->partitionKeyString));
partitionKey = DirectFunctionCall2(pg_get_expr, partitionKeyExpr,
ObjectIdGetDatum(relationId));
/* form heap tuple for table metadata */
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
shardMaxSizeInBytes = (int64) ShardMaxSize * 1024L;
/* get storage type */
relationType = get_rel_relkind(relationId);
if (relationType == RELKIND_RELATION)
{
storageType = SHARD_STORAGE_TABLE;
}
else if (relationType == RELKIND_FOREIGN_TABLE)
{
bool cstoreTable = CStoreTable(relationId);
if (cstoreTable)
{
storageType = SHARD_STORAGE_COLUMNAR;
}
else
{
storageType = SHARD_STORAGE_FOREIGN;
}
}
values[0] = ObjectIdGetDatum(relationId);
values[1] = storageType;
values[2] = partitionEntry->partitionMethod;
values[3] = partitionKey;
values[4] = Int32GetDatum(ShardReplicationFactor);
values[5] = Int64GetDatum(shardMaxSizeInBytes);
values[6] = Int32GetDatum(ShardPlacementPolicy);
metadataTuple = heap_form_tuple(metadataDescriptor, values, isNulls);
metadataDatum = HeapTupleGetDatum(metadataTuple);
PG_RETURN_DATUM(metadataDatum);
}
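/*
* A typical invocation (with an illustrative table name) might be:
*
* SELECT * FROM master_get_table_metadata('github_events');
*
* which returns a single row containing the relation id, storage type,
* partition method, partition key, replication factor, maximum shard size in
* bytes, and shard placement policy.
*/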
/*
* CStoreTable returns true if the given relationId belongs to a foreign cstore
* table, otherwise it returns false.
*/
bool
CStoreTable(Oid relationId)
{
bool cstoreTable = false;
char relationKind = get_rel_relkind(relationId);
if (relationKind == RELKIND_FOREIGN_TABLE)
{
ForeignTable *foreignTable = GetForeignTable(relationId);
ForeignServer *server = GetForeignServer(foreignTable->serverid);
ForeignDataWrapper *foreignDataWrapper = GetForeignDataWrapper(server->fdwid);
if (strncmp(foreignDataWrapper->fdwname, CSTORE_FDW_NAME, NAMEDATALEN) == 0)
{
cstoreTable = true;
}
}
return cstoreTable;
}
/*
* master_get_table_ddl_events takes in a relation name, and returns the set of
* DDL commands needed to reconstruct the relation. The returned DDL commands
* are similar in flavor to the schema definitions that pg_dump returns. The
* function errors out if the given relation does not exist.
*/
Datum
master_get_table_ddl_events(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
ListCell *tableDDLEventCell = NULL;
/*
* On the very first call to this function, we first use the given relation
* name to get to the relation. We then recreate the list of DDL statements
* issued for this relation, and save the first statement's position in the
* function context.
*/
if (SRF_IS_FIRSTCALL())
{
text *relationName = PG_GETARG_TEXT_P(0);
Oid relationId = ResolveRelationId(relationName);
MemoryContext oldContext = NULL;
List *tableDDLEventList = NIL;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
/* allocate DDL statements, and then save position in DDL statements */
tableDDLEventList = GetTableDDLEvents(relationId);
tableDDLEventCell = list_head(tableDDLEventList);
functionContext->user_fctx = tableDDLEventCell;
MemoryContextSwitchTo(oldContext);
}
/*
* On every call to this function, we get the current position in the
* statement list. We then iterate to the next position in the list and
* return the current statement, if we have not yet reached the end of
* list.
*/
functionContext = SRF_PERCALL_SETUP();
tableDDLEventCell = (ListCell *) functionContext->user_fctx;
if (tableDDLEventCell != NULL)
{
char *ddlStatement = (char *) lfirst(tableDDLEventCell);
text *ddlStatementText = cstring_to_text(ddlStatement);
functionContext->user_fctx = lnext(tableDDLEventCell);
SRF_RETURN_NEXT(functionContext, PointerGetDatum(ddlStatementText));
}
else
{
SRF_RETURN_DONE(functionContext);
}
}
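/*
 * Illustrative usage (a minimal sketch; the table name "github_events" is
 * hypothetical): each returned row is one DDL statement needed to recreate
 * the table.
 *
 *   SELECT master_get_table_ddl_events('github_events');
 */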
/*
* master_get_new_shardid allocates and returns a unique shardId for the shard
* to be created. This allocation occurs both in shared memory and in write
* ahead logs; writing to logs avoids the risk of having shardId collisions.
*
* Please note that the caller is still responsible for finalizing shard data
* and the shardId with the master node. Further note that this function relies
* on an internal sequence created in initdb to generate unique identifiers.
*/
Datum
master_get_new_shardid(PG_FUNCTION_ARGS)
{
text *sequenceName = cstring_to_text(SHARDID_SEQUENCE_NAME);
Oid sequenceId = ResolveRelationId(sequenceName);
Datum sequenceIdDatum = ObjectIdGetDatum(sequenceId);
/* generate new and unique shardId from sequence */
Datum shardIdDatum = DirectFunctionCall1(nextval_oid, sequenceIdDatum);
int64 shardId = DatumGetInt64(shardIdDatum);
PG_RETURN_INT64(shardId);
}
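/*
 * Illustrative usage: each call draws the next value from the internal
 * shardId sequence, so consecutive calls return strictly increasing values.
 *
 *   SELECT master_get_new_shardid();
 */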
/*
* master_get_local_first_candidate_nodes returns a set of candidate host names
* and port numbers on which to place new shards. The function makes sure to
* always allocate the first candidate node as the node the caller is connecting
 * from, and allocates additional nodes until the shard replication factor is
* met. The function errors if the caller's remote node name is not found in the
* membership list, or if the number of available nodes falls short of the
* replication factor.
*/
Datum
master_get_local_first_candidate_nodes(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
uint32 desiredNodeCount = 0;
uint32 currentNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
TupleDesc tupleDescriptor = NULL;
uint32 liveNodeCount = 0;
bool hasOid = false;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
functionContext->user_fctx = NIL;
functionContext->max_calls = ShardReplicationFactor;
/* if enough live nodes, return an extra candidate node as backup */
liveNodeCount = WorkerGetLiveNodeCount();
if (liveNodeCount > ShardReplicationFactor)
{
functionContext->max_calls = ShardReplicationFactor + 1;
}
/*
* This tuple descriptor must match the output parameters declared for
* the function in pg_proc.
*/
tupleDescriptor = CreateTemplateTupleDesc(CANDIDATE_NODE_FIELDS, hasOid);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
TEXTOID, -1, 0);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
INT8OID, -1, 0);
functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
desiredNodeCount = functionContext->max_calls;
currentNodeCount = functionContext->call_cntr;
if (currentNodeCount < desiredNodeCount)
{
MemoryContext oldContext = NULL;
List *currentNodeList = NIL;
WorkerNode *candidateNode = NULL;
Datum candidateDatum = 0;
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
currentNodeList = functionContext->user_fctx;
if (currentNodeCount == 0)
{
/* choose first candidate node to be the client's host */
char *remoteHostname = hostname_client_addr();
/* if hostname is localhost.localdomain, change it to localhost */
int nameCompare = strncmp(remoteHostname, "localhost.localdomain",
WORKER_LENGTH);
if (nameCompare == 0)
{
remoteHostname = pstrdup("localhost");
}
candidateNode = WorkerGetNodeWithName(remoteHostname);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could not find worker node for hostname: %s",
remoteHostname)));
}
}
else
{
/* find a candidate node different from those already selected */
candidateNode = WorkerGetCandidateNode(currentNodeList);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could only find %u of %u required nodes",
currentNodeCount, desiredNodeCount)));
}
}
currentNodeList = lappend(currentNodeList, candidateNode);
functionContext->user_fctx = currentNodeList;
MemoryContextSwitchTo(oldContext);
candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, candidateDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}
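/*
 * Illustrative usage (a sketch; the worker name is hypothetical): when invoked
 * over a TCP connection from a host named "worker-101", the first returned row
 * names "worker-101" itself, followed by other live nodes up to the
 * replication factor.
 *
 *   SELECT * FROM master_get_local_first_candidate_nodes();
 */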
/*
* master_get_round_robin_candidate_nodes returns a set of candidate host names
* and port numbers on which to place new shards. The function uses the round
* robin policy to choose the nodes and tries to ensure that there is an even
* distribution of shards across the worker nodes. This function errors out if
* the number of available nodes falls short of the replication factor.
*/
Datum
master_get_round_robin_candidate_nodes(PG_FUNCTION_ARGS)
{
uint64 shardId = PG_GETARG_INT64(0);
FuncCallContext *functionContext = NULL;
uint32 desiredNodeCount = 0;
uint32 currentNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
TupleDesc tupleDescriptor = NULL;
List *workerNodeList = NIL;
TypeFuncClass resultTypeClass = 0;
uint32 workerNodeCount = 0;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
/* get the worker node list and sort it for determinism */
workerNodeList = WorkerNodeList();
workerNodeList = SortList(workerNodeList, CompareWorkerNodes);
functionContext->user_fctx = workerNodeList;
functionContext->max_calls = ShardReplicationFactor;
		/* if we have enough live nodes, return an extra candidate node as backup */
workerNodeCount = (uint32) list_length(workerNodeList);
if (workerNodeCount > ShardReplicationFactor)
{
functionContext->max_calls = ShardReplicationFactor + 1;
}
/* create tuple descriptor for return value */
resultTypeClass = get_call_result_type(fcinfo, NULL, &tupleDescriptor);
if (resultTypeClass != TYPEFUNC_COMPOSITE)
{
ereport(ERROR, (errmsg("return type must be a row type")));
}
functionContext->tuple_desc = tupleDescriptor;
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
desiredNodeCount = functionContext->max_calls;
currentNodeCount = functionContext->call_cntr;
if (currentNodeCount < desiredNodeCount)
{
List *workerNodeList = functionContext->user_fctx;
WorkerNode *candidateNode = NULL;
Datum candidateDatum = 0;
candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
currentNodeCount);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could only find %u of %u required nodes",
currentNodeCount, desiredNodeCount)));
}
candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, candidateDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}
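/*
 * Illustrative usage (hypothetical shardId): with four sorted worker nodes,
 * candidates for shardId 102042 start at list index (102042 % 4) = 2 and
 * proceed round robin from there.
 *
 *   SELECT * FROM master_get_round_robin_candidate_nodes(102042);
 */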
/*
* master_get_active_worker_nodes returns a set of active worker host names and
* port numbers in deterministic order. Currently we assume that all worker
* nodes in pg_worker_list.conf are active.
*/
Datum
master_get_active_worker_nodes(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
uint32 workerNodeIndex = 0;
uint32 workerNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
List *workerNodeList = NIL;
uint32 workerNodeCount = 0;
TupleDesc tupleDescriptor = NULL;
bool hasOid = false;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
workerNodeList = WorkerNodeList();
workerNodeCount = (uint32) list_length(workerNodeList);
functionContext->user_fctx = workerNodeList;
functionContext->max_calls = workerNodeCount;
/*
* This tuple descriptor must match the output parameters declared for
* the function in pg_proc.
*/
tupleDescriptor = CreateTemplateTupleDesc(WORKER_NODE_FIELDS, hasOid);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
TEXTOID, -1, 0);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
INT8OID, -1, 0);
functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
workerNodeIndex = functionContext->call_cntr;
workerNodeCount = functionContext->max_calls;
if (workerNodeIndex < workerNodeCount)
{
List *workerNodeList = functionContext->user_fctx;
WorkerNode *workerNode = list_nth(workerNodeList, workerNodeIndex);
Datum workerNodeDatum = WorkerNodeGetDatum(workerNode,
functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, workerNodeDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}
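/*
 * Illustrative usage: returns one (node_name, node_port) row per worker node
 * read from the membership file.
 *
 *   SELECT * FROM master_get_active_worker_nodes();
 */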
/* Finds the relationId from a potentially qualified relation name. */
Oid
ResolveRelationId(text *relationName)
{
List *relationNameList = NIL;
RangeVar *relation = NULL;
Oid relationId = InvalidOid;
bool failOK = false; /* error if relation cannot be found */
/* resolve relationId from passed in schema and relation name */
relationNameList = textToQualifiedNameList(relationName);
relation = makeRangeVarFromNameList(relationNameList);
relationId = RangeVarGetRelid(relation, NoLock, failOK);
return relationId;
}
/*
* GetTableDDLEvents takes in a relationId, and returns the list of DDL commands
 * needed to reconstruct the relation. These DDL commands are all palloc'd, and
 * include the table's schema definition, optional column storage and statistics
 * definitions, and index and constraint definitions.
*/
List *
GetTableDDLEvents(Oid relationId)
{
List *tableDDLEventList = NIL;
char tableType = 0;
char *tableSchemaDef = NULL;
char *tableColumnOptionsDef = NULL;
char *schemaName = NULL;
Oid schemaId = InvalidOid;
Relation pgIndex = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
/* if foreign table, fetch extension and server definitions */
tableType = get_rel_relkind(relationId);
if (tableType == RELKIND_FOREIGN_TABLE)
{
char *extensionDef = pg_get_extensiondef_string(relationId);
char *serverDef = pg_get_serverdef_string(relationId);
if (extensionDef != NULL)
{
tableDDLEventList = lappend(tableDDLEventList, extensionDef);
}
tableDDLEventList = lappend(tableDDLEventList, serverDef);
}
/* create schema if the table is not in the default namespace (public) */
schemaId = get_rel_namespace(relationId);
schemaName = get_namespace_name(schemaId);
if (strncmp(schemaName, "public", NAMEDATALEN) != 0)
{
StringInfo schemaNameDef = makeStringInfo();
appendStringInfo(schemaNameDef, CREATE_SCHEMA_COMMAND, schemaName);
tableDDLEventList = lappend(tableDDLEventList, schemaNameDef->data);
}
/* fetch table schema and column option definitions */
tableSchemaDef = pg_get_tableschemadef_string(relationId);
tableColumnOptionsDef = pg_get_tablecolumnoptionsdef_string(relationId);
tableDDLEventList = lappend(tableDDLEventList, tableSchemaDef);
if (tableColumnOptionsDef != NULL)
{
tableDDLEventList = lappend(tableDDLEventList, tableColumnOptionsDef);
}
/* open system catalog and scan all indexes that belong to this table */
pgIndex = heap_open(IndexRelationId, AccessShareLock);
ScanKeyInit(&scanKey[0], Anum_pg_index_indrelid,
BTEqualStrategyNumber, F_OIDEQ, relationId);
scanDescriptor = systable_beginscan(pgIndex,
IndexIndrelidIndexId, true, /* indexOK */
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_index indexForm = (Form_pg_index) GETSTRUCT(heapTuple);
Oid indexId = indexForm->indexrelid;
bool isConstraint = false;
char *statementDef = NULL;
/*
* A primary key index is always created by a constraint statement.
* A unique key index is created by a constraint if and only if the
* index has a corresponding constraint entry in pg_depend. Any other
* index form is never associated with a constraint.
*/
if (indexForm->indisprimary)
{
isConstraint = true;
}
else if (indexForm->indisunique)
{
Oid constraintId = get_index_constraint(indexId);
isConstraint = OidIsValid(constraintId);
}
else
{
isConstraint = false;
}
/* get the corresponding constraint or index statement */
if (isConstraint)
{
Oid constraintId = get_index_constraint(indexId);
Assert(constraintId != InvalidOid);
#if (PG_VERSION_NUM >= 90500)
statementDef = pg_get_constraintdef_command(constraintId);
#else
statementDef = pg_get_constraintdef_string(constraintId);
#endif
}
else
{
statementDef = pg_get_indexdef_string(indexId);
}
/* append found constraint or index definition to the list */
tableDDLEventList = lappend(tableDDLEventList, statementDef);
/* if table is clustered on this index, append definition to the list */
if (indexForm->indisclustered)
{
char *clusteredDef = pg_get_indexclusterdef_string(indexId);
Assert(clusteredDef != NULL);
tableDDLEventList = lappend(tableDDLEventList, clusteredDef);
}
heapTuple = systable_getnext(scanDescriptor);
}
/* clean up scan and close system catalog */
systable_endscan(scanDescriptor);
heap_close(pgIndex, AccessShareLock);
return tableDDLEventList;
}
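/*
 * For illustration (a hypothetical table): for a relation created in the
 * public schema as "CREATE TABLE events (id bigint PRIMARY KEY)", the returned
 * list would contain statements along the lines of:
 *
 *   CREATE TABLE events (id bigint NOT NULL)
 *   ALTER TABLE public.events ADD CONSTRAINT events_pkey PRIMARY KEY (id)
 */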
/*
* hostname_client_addr allocates memory for the connecting client's fully
 * qualified hostname, and returns this name. If there is no such connection,
 * or the connection is over a Unix domain socket, the function errors out.
*/
static char *
hostname_client_addr(void)
{
Port *port = MyProcPort;
char *remoteHost = NULL;
int remoteHostLen = NI_MAXHOST;
int flags = NI_NAMEREQD; /* require fully qualified hostname */
int nameFound = 0;
if (port == NULL)
{
ereport(ERROR, (errmsg("cannot find tcp/ip connection to client")));
}
switch (port->raddr.addr.ss_family)
{
case AF_INET:
#ifdef HAVE_IPV6
case AF_INET6:
#endif
break;
default:
ereport(ERROR, (errmsg("invalid address family in connection")));
break;
}
remoteHost = palloc0(remoteHostLen);
nameFound = pg_getnameinfo_all(&port->raddr.addr, port->raddr.salen,
remoteHost, remoteHostLen, NULL, 0, flags);
if (nameFound != 0)
{
ereport(ERROR, (errmsg("could not resolve client hostname: %s",
gai_strerror(nameFound))));
}
return remoteHost;
}
/*
* WorkerNodeGetDatum converts the worker node passed to it into its datum
* representation. To do this, the function first creates the heap tuple from
* the worker node name and port. Then, the function converts the heap tuple
* into a datum and returns it.
*/
static Datum
WorkerNodeGetDatum(WorkerNode *workerNode, TupleDesc tupleDescriptor)
{
Datum values[WORKER_NODE_FIELDS];
bool isNulls[WORKER_NODE_FIELDS];
HeapTuple workerNodeTuple = NULL;
Datum workerNodeDatum = 0;
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
values[0] = CStringGetTextDatum(workerNode->workerName);
values[1] = Int64GetDatum((int64) workerNode->workerPort);
workerNodeTuple = heap_form_tuple(tupleDescriptor, values, isNulls);
workerNodeDatum = HeapTupleGetDatum(workerNodeTuple);
return workerNodeDatum;
}

View File

@ -0,0 +1,264 @@
/*-------------------------------------------------------------------------
*
* master_repair_shards.c
*
* This file contains functions to repair unhealthy shard placements using data
* from healthy ones.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include "miscadmin.h"
#include <string.h>
#include "catalog/pg_class.h"
#include "distributed/connection_cache.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_router_executor.h"
#include "distributed/resource_lock.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_protocol.h"
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/lsyscache.h"
#include "utils/palloc.h"
/* local function forward declarations */
static ShardPlacement * SearchShardPlacementInList(List *shardPlacementList,
text *nodeName, uint32 nodePort);
static List * RecreateTableDDLCommandList(Oid relationId);
static bool CopyDataFromFinalizedPlacement(Oid distributedTableId, int64 shardId,
ShardPlacement *healthyPlacement,
ShardPlacement *placementToRepair);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(master_copy_shard_placement);
/*
* master_copy_shard_placement implements a user-facing UDF to copy data from
* a healthy (source) node to an inactive (target) node. To accomplish this it
* entirely recreates the table structure before copying all data. During this
 * time all modifications to the shard are paused. After a successful repair, the
* inactive placement is marked healthy and modifications may continue. If the
* repair fails at any point, this function throws an error, leaving the node
* in an unhealthy state.
*/
Datum
master_copy_shard_placement(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
text *sourceNodeName = PG_GETARG_TEXT_P(1);
int32 sourceNodePort = PG_GETARG_INT32(2);
text *targetNodeName = PG_GETARG_TEXT_P(3);
int32 targetNodePort = PG_GETARG_INT32(4);
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid distributedTableId = shardInterval->relationId;
List *shardPlacementList = NIL;
ShardPlacement *sourcePlacement = NULL;
ShardPlacement *targetPlacement = NULL;
WorkerNode *targetNode = NULL;
List *ddlCommandList = NIL;
bool dataCopied = false;
char relationKind = '\0';
/*
* By taking an exclusive lock on the shard, we both stop all modifications
* (INSERT, UPDATE, or DELETE) and prevent concurrent repair operations from
* being able to operate on this shard.
*/
LockShardResource(shardId, ExclusiveLock);
/*
* We've stopped data modifications of this shard, but we plan to move
* a placement to the healthy state, so we need to grab a shard metadata
* lock (in exclusive mode) as well.
*/
LockShardDistributionMetadata(shardId, ExclusiveLock);
shardPlacementList = ShardPlacementList(shardId);
sourcePlacement = SearchShardPlacementInList(shardPlacementList, sourceNodeName,
sourceNodePort);
if (sourcePlacement->shardState != FILE_FINALIZED)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("source placement must be in finalized state")));
}
targetPlacement = SearchShardPlacementInList(shardPlacementList, targetNodeName,
targetNodePort);
if (targetPlacement->shardState != FILE_INACTIVE)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("target placement must be in inactive state")));
}
relationKind = get_rel_relkind(distributedTableId);
if (relationKind == RELKIND_FOREIGN_TABLE)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot repair shard"),
errdetail("Repairing shards backed by foreign tables is "
"not supported.")));
}
targetNode = palloc0(sizeof(WorkerNode));
targetNode->inWorkerFile = true;
strlcpy(targetNode->workerName, targetPlacement->nodeName, WORKER_LENGTH);
targetNode->workerPort = targetPlacement->nodePort;
	/* retrieve DDL commands needed to drop and recreate the table */
ddlCommandList = RecreateTableDDLCommandList(distributedTableId);
/* remove existing (unhealthy) placement row; CreateShardPlacements will recreate */
DeleteShardPlacementRow(targetPlacement->shardId, targetPlacement->nodeName,
targetPlacement->nodePort);
/* finally, drop/recreate remote table and add back row (in healthy state) */
CreateShardPlacements(shardId, ddlCommandList, list_make1(targetNode), 0, 1);
HOLD_INTERRUPTS();
dataCopied = CopyDataFromFinalizedPlacement(distributedTableId, shardId,
sourcePlacement, targetPlacement);
if (!dataCopied)
{
ereport(ERROR, (errmsg("could not copy shard data"),
errhint("Consult recent messages in the server logs for "
"details.")));
}
RESUME_INTERRUPTS();
PG_RETURN_VOID();
}
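/*
 * Illustrative usage (hypothetical shardId, hostnames, and ports): repair the
 * inactive placement of shard 102042 on worker-102 by copying from the healthy
 * placement on worker-101.
 *
 *   SELECT master_copy_shard_placement(102042, 'worker-101', 5432,
 *                                      'worker-102', 5432);
 */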
/*
* SearchShardPlacementInList searches a provided list for a shard placement
* with the specified node name and port. This function throws an error if no
* such placement exists in the provided list.
*/
static ShardPlacement *
SearchShardPlacementInList(List *shardPlacementList, text *nodeNameText, uint32 nodePort)
{
ListCell *shardPlacementCell = NULL;
ShardPlacement *matchingPlacement = NULL;
char *nodeName = text_to_cstring(nodeNameText);
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = lfirst(shardPlacementCell);
if (strncmp(nodeName, shardPlacement->nodeName, MAX_NODE_LENGTH) == 0 &&
nodePort == shardPlacement->nodePort)
{
matchingPlacement = shardPlacement;
break;
}
}
if (matchingPlacement == NULL)
{
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION),
errmsg("could not find placement matching \"%s:%d\"",
nodeName, nodePort),
errhint("Confirm the placement still exists and try again.")));
}
return matchingPlacement;
}
/*
 * RecreateTableDDLCommandList returns a list of DDL statements similar to the one
 * returned by GetTableDDLEvents, except that the list begins with a "DROP TABLE"
 * or "DROP FOREIGN TABLE" statement to facilitate total recreation of a placement.
*/
static List *
RecreateTableDDLCommandList(Oid relationId)
{
char *relationName = get_rel_name(relationId);
StringInfo dropCommand = makeStringInfo();
List *createCommandList = NIL;
List *dropCommandList = NIL;
List *recreateCommandList = NIL;
char relationKind = get_rel_relkind(relationId);
/* build appropriate DROP command based on relation kind */
if (relationKind == RELKIND_RELATION)
{
appendStringInfo(dropCommand, DROP_REGULAR_TABLE_COMMAND,
quote_identifier(relationName));
}
else if (relationKind == RELKIND_FOREIGN_TABLE)
{
appendStringInfo(dropCommand, DROP_FOREIGN_TABLE_COMMAND,
quote_identifier(relationName));
}
else
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("repair target is not a regular or foreign table")));
}
dropCommandList = list_make1(dropCommand->data);
createCommandList = GetTableDDLEvents(relationId);
recreateCommandList = list_concat(dropCommandList, createCommandList);
return recreateCommandList;
}
/*
 * CopyDataFromFinalizedPlacement copies the data for a shard (identified by
* a relation and shard identifier) from a healthy placement to one needing
* repair. The unhealthy placement must already have an empty relation in place
* to receive rows from the healthy placement. This function returns a boolean
* indicating success or failure.
*/
static bool
CopyDataFromFinalizedPlacement(Oid distributedTableId, int64 shardId,
ShardPlacement *healthyPlacement,
ShardPlacement *placementToRepair)
{
char *relationName = get_rel_name(distributedTableId);
const char *shardName = NULL;
StringInfo copyRelationQuery = makeStringInfo();
List *queryResultList = NIL;
bool copySuccessful = false;
AppendShardIdToName(&relationName, shardId);
shardName = quote_identifier(relationName);
appendStringInfo(copyRelationQuery, WORKER_APPEND_TABLE_TO_SHARD,
quote_literal_cstr(shardName), /* table to append */
quote_literal_cstr(shardName), /* remote table name */
quote_literal_cstr(healthyPlacement->nodeName), /* remote host */
healthyPlacement->nodePort); /* remote port */
queryResultList = ExecuteRemoteQuery(placementToRepair->nodeName,
placementToRepair->nodePort, copyRelationQuery);
if (queryResultList != NIL)
{
copySuccessful = true;
}
return copySuccessful;
}

View File

@ -0,0 +1,550 @@
/*-------------------------------------------------------------------------
*
* master_stage_protocol.c
*
* Routines for staging PostgreSQL table data as shards into the distributed
* cluster. These user-defined functions are similar to the psql-side \stage
 * command, but differ from it in that users stage data from tables rather than
 * files, and in that they can also append to existing shards.
*
* Copyright (c) 2013, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "access/htup_details.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_join_order.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/resource_lock.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_protocol.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "utils/rel.h"
#include "utils/tqual.h"
/* Local functions forward declarations */
static bool WorkerCreateShard(char *nodeName, uint32 nodePort,
uint64 shardId, List *ddlCommandList);
static bool WorkerShardStats(char *nodeName, uint32 nodePort, Oid relationId,
char *shardName, uint64 *shardLength,
text **shardMinValue, text **shardMaxValue);
static uint64 WorkerTableSize(char *nodeName, uint32 nodePort, char *tableName);
static StringInfo WorkerPartitionValue(char *nodeName, uint32 nodePort, Oid relationId,
char *shardName, char *selectQuery);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_create_empty_shard);
PG_FUNCTION_INFO_V1(master_append_table_to_shard);
/*
* master_create_empty_shard creates an empty shard for the given distributed
* table. For this, the function first gets a list of candidate nodes, connects
* to these nodes, and issues DDL commands on the nodes to create empty shard
* placements. The function then updates metadata on the master node to make
* this shard (and its placements) visible.
*/
Datum
master_create_empty_shard(PG_FUNCTION_ARGS)
{
text *relationNameText = PG_GETARG_TEXT_P(0);
char *relationName = text_to_cstring(relationNameText);
Datum shardIdDatum = 0;
int64 shardId = INVALID_SHARD_ID;
	List *ddlEventList = NIL;
uint32 attemptableNodeCount = 0;
uint32 liveNodeCount = 0;
uint32 candidateNodeCount = 0;
List *candidateNodeList = NIL;
text *nullMinValue = NULL;
text *nullMaxValue = NULL;
char tableType = 0;
char partitionMethod = 0;
Oid relationId = ResolveRelationId(relationNameText);
CheckDistributedTable(relationId);
tableType = get_rel_relkind(relationId);
if (tableType != RELKIND_RELATION)
{
ereport(ERROR, (errmsg("relation \"%s\" is not a regular table", relationName)));
}
partitionMethod = PartitionMethod(relationId);
if (partitionMethod == DISTRIBUTE_BY_HASH)
{
ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table",
relationName),
errdetail("We currently don't support creating shards "
"on hash-partitioned tables")));
}
/* generate new and unique shardId from sequence */
shardIdDatum = master_get_new_shardid(NULL);
shardId = DatumGetInt64(shardIdDatum);
/* get table DDL commands to replay on the worker node */
ddlEventList = GetTableDDLEvents(relationId);
/* if enough live nodes, add an extra candidate node as backup */
attemptableNodeCount = ShardReplicationFactor;
liveNodeCount = WorkerGetLiveNodeCount();
if (liveNodeCount > ShardReplicationFactor)
{
attemptableNodeCount = ShardReplicationFactor + 1;
}
/* first retrieve a list of random nodes for shard placements */
while (candidateNodeCount < attemptableNodeCount)
{
WorkerNode *candidateNode = WorkerGetCandidateNode(candidateNodeList);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could only find %u of %u possible nodes",
candidateNodeCount, attemptableNodeCount)));
}
candidateNodeList = lappend(candidateNodeList, candidateNode);
candidateNodeCount++;
}
CreateShardPlacements(shardId, ddlEventList, candidateNodeList, 0,
ShardReplicationFactor);
InsertShardRow(relationId, shardId, SHARD_STORAGE_TABLE, nullMinValue, nullMaxValue);
PG_RETURN_INT64(shardId);
}
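/*
 * Illustrative usage (hypothetical table name): create an empty shard for an
 * append-partitioned table; the newly assigned shardId is returned.
 *
 *   SELECT master_create_empty_shard('github_events');
 */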
/*
* master_append_table_to_shard appends the given table's contents to the given
* shard, and updates shard metadata on the master node. If the function fails
* to append table data to all shard placements, it doesn't update any metadata
* and errors out. Else if the function fails to append table data to some of
* the shard placements, it marks those placements as invalid. These invalid
* placements will get cleaned up during shard rebalancing.
*/
Datum
master_append_table_to_shard(PG_FUNCTION_ARGS)
{
uint64 shardId = PG_GETARG_INT64(0);
text *sourceTableNameText = PG_GETARG_TEXT_P(1);
text *sourceNodeNameText = PG_GETARG_TEXT_P(2);
uint32 sourceNodePort = PG_GETARG_UINT32(3);
char *sourceTableName = text_to_cstring(sourceTableNameText);
char *sourceNodeName = text_to_cstring(sourceNodeNameText);
char *shardName = NULL;
List *shardPlacementList = NIL;
List *succeededPlacementList = NIL;
List *failedPlacementList = NIL;
ListCell *shardPlacementCell = NULL;
ListCell *succeededPlacementCell = NULL;
ListCell *failedPlacementCell = NULL;
bool statsOK = false;
uint64 newShardLength = 0;
uint64 shardMaxSizeInBytes = 0;
float4 shardFillLevel = 0.0;
text *newMinValue = NULL;
text *newMaxValue = NULL;
char partitionMethod = 0;
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid relationId = shardInterval->relationId;
char storageType = shardInterval->storageType;
if (storageType != SHARD_STORAGE_TABLE)
{
ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId),
errdetail("The underlying shard is not a regular table")));
}
partitionMethod = PartitionMethod(relationId);
if (partitionMethod == DISTRIBUTE_BY_HASH)
{
ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId),
errdetail("We currently don't support appending to shards "
"in hash-partitioned tables")));
}
/*
* We lock on the shardId, but do not unlock. When the function returns, and
* the transaction for this function commits, this lock will automatically
* be released. This ensures appends to a shard happen in a serial manner.
*/
LockShardResource(shardId, AccessExclusiveLock);
/* if shard doesn't have an alias, extend regular table name */
shardName = LoadShardAlias(relationId, shardId);
if (shardName == NULL)
{
shardName = get_rel_name(relationId);
AppendShardIdToName(&shardName, shardId);
}
shardPlacementList = FinalizedShardPlacementList(shardId);
if (shardPlacementList == NIL)
{
ereport(ERROR, (errmsg("could not find any shard placements for shardId "
UINT64_FORMAT, shardId),
errhint("Try running master_create_empty_shard() first")));
}
/* issue command to append table to each shard placement */
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell);
char *workerName = shardPlacement->nodeName;
uint32 workerPort = shardPlacement->nodePort;
List *queryResultList = NIL;
StringInfo workerAppendQuery = makeStringInfo();
appendStringInfo(workerAppendQuery, WORKER_APPEND_TABLE_TO_SHARD,
quote_literal_cstr(shardName),
quote_literal_cstr(sourceTableName),
quote_literal_cstr(sourceNodeName), sourceNodePort);
queryResultList = ExecuteRemoteQuery(workerName, workerPort, workerAppendQuery);
if (queryResultList != NIL)
{
succeededPlacementList = lappend(succeededPlacementList, shardPlacement);
}
else
{
failedPlacementList = lappend(failedPlacementList, shardPlacement);
}
}
/* before updating metadata, check that we appended to at least one shard */
if (succeededPlacementList == NIL)
{
ereport(ERROR, (errmsg("could not append table to any shard placement")));
}
/* make sure we don't process cancel signals */
HOLD_INTERRUPTS();
/* mark shard placements that we couldn't append to as inactive */
foreach(failedPlacementCell, failedPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(failedPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
uint64 oldShardLength = placement->shardLength;
DeleteShardPlacementRow(shardId, workerName, workerPort);
InsertShardPlacementRow(shardId, FILE_INACTIVE, oldShardLength,
workerName, workerPort);
ereport(WARNING, (errmsg("could not append table to shard \"%s\" on node "
"\"%s:%u\"", shardName, workerName, workerPort),
errdetail("Marking this shard placement as inactive")));
}
RESUME_INTERRUPTS();
/* get appended shard's statistics from a shard placement */
foreach(succeededPlacementCell, succeededPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(succeededPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
statsOK = WorkerShardStats(workerName, workerPort, relationId, shardName,
&newShardLength, &newMinValue, &newMaxValue);
if (statsOK)
{
break;
}
}
/*
* If for some reason we appended data to a shard, but failed to retrieve
 * statistics, we just WARN here to avoid losing shard-state updates. Note
* that this means we will return 0 as the shard fill-factor, and this shard
* also won't be pruned as the statistics will be empty. If the failure was
* transient, a subsequent append call will fetch the correct statistics.
*/
if (!statsOK)
{
ereport(WARNING, (errmsg("could not get statistics for shard placement"),
errdetail("Setting shard statistics to NULL")));
}
/* make sure we don't process cancel signals */
HOLD_INTERRUPTS();
/* update metadata for each shard placement we appended to */
succeededPlacementCell = NULL;
foreach(succeededPlacementCell, succeededPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(succeededPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
DeleteShardPlacementRow(shardId, workerName, workerPort);
InsertShardPlacementRow(shardId, FILE_FINALIZED, newShardLength,
workerName, workerPort);
}
DeleteShardRow(shardId);
InsertShardRow(relationId, shardId, storageType, newMinValue, newMaxValue);
if (QueryCancelPending)
{
ereport(WARNING, (errmsg("cancel requests are ignored during table appends")));
QueryCancelPending = false;
}
RESUME_INTERRUPTS();
/* calculate ratio of current shard size compared to shard max size */
shardMaxSizeInBytes = (int64) ShardMaxSize * 1024L;
shardFillLevel = ((float4) newShardLength / (float4) shardMaxSizeInBytes);
PG_RETURN_FLOAT4(shardFillLevel);
}
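/*
 * Illustrative usage (hypothetical names and shardId): append the rows of the
 * staging table "github_events_staging", located on worker-101, to shard
 * 102042; the return value is the shard's fill level relative to the
 * configured maximum shard size.
 *
 *   SELECT master_append_table_to_shard(102042, 'github_events_staging',
 *                                       'worker-101', 5432);
 */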
/*
* CheckDistributedTable checks if the given relationId corresponds to a
* distributed table. If it does not, the function errors out.
*/
void
CheckDistributedTable(Oid relationId)
{
char *relationName = get_rel_name(relationId);
/* check that the relationId belongs to a table */
char tableType = get_rel_relkind(relationId);
if (!(tableType == RELKIND_RELATION || tableType == RELKIND_FOREIGN_TABLE))
{
ereport(ERROR, (errmsg("relation \"%s\" is not a table", relationName)));
}
if (!IsDistributedTable(relationId))
{
ereport(ERROR, (errmsg("relation \"%s\" is not a distributed table",
relationName)));
}
}
/*
* CreateShardPlacements attempts to create a certain number of placements
* (provided by the replicationFactor argument) on the provided list of worker
* nodes. Beginning at the provided start index, DDL commands are attempted on
* worker nodes (via WorkerCreateShards). If there are more worker nodes than
* required for replication, one remote failure is tolerated. If the provided
* replication factor is not attained, an error is raised (placements remain on
* nodes if some DDL commands had been successful).
*/
void
CreateShardPlacements(int64 shardId, List *ddlEventList, List *workerNodeList,
int workerStartIndex, int replicationFactor)
{
int attemptCount = replicationFactor;
int workerNodeCount = list_length(workerNodeList);
int placementsCreated = 0;
int attemptNumber = 0;
/* if we have enough nodes, add an extra placement attempt for backup */
if (workerNodeCount > replicationFactor)
{
attemptCount++;
}
for (attemptNumber = 0; attemptNumber < attemptCount; attemptNumber++)
{
int workerNodeIndex = (workerStartIndex + attemptNumber) % workerNodeCount;
WorkerNode *workerNode = (WorkerNode *) list_nth(workerNodeList, workerNodeIndex);
char *nodeName = workerNode->workerName;
uint32 nodePort = workerNode->workerPort;
bool created = WorkerCreateShard(nodeName, nodePort, shardId, ddlEventList);
if (created)
{
const RelayFileState shardState = FILE_FINALIZED;
const uint64 shardSize = 0;
InsertShardPlacementRow(shardId, shardState, shardSize, nodeName, nodePort);
placementsCreated++;
}
else
{
ereport(WARNING, (errmsg("could not create shard on \"%s:%u\"",
nodeName, nodePort)));
}
if (placementsCreated >= replicationFactor)
{
break;
}
}
/* check if we created enough shard replicas */
if (placementsCreated < replicationFactor)
{
ereport(ERROR, (errmsg("could only create %u of %u of required shard replicas",
placementsCreated, replicationFactor)));
}
}
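/*
 * Worked example (hypothetical values): with a replication factor of 2 and
 * three worker nodes, up to 3 placement attempts are made; even if the first
 * attempt fails, successful placements on the next two nodes still satisfy
 * the replication factor.
 */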
/*
* WorkerCreateShard applies DDL commands for the given shardId to create the
* shard on the worker node. Note that this function opens a new connection for
 * each DDL command, and could leave the shard in a half-initialized state.
*/
static bool
WorkerCreateShard(char *nodeName, uint32 nodePort,
uint64 shardId, List *ddlCommandList)
{
bool shardCreated = true;
ListCell *ddlCommandCell = NULL;
foreach(ddlCommandCell, ddlCommandList)
{
char *ddlCommand = (char *) lfirst(ddlCommandCell);
char *escapedDDLCommand = quote_literal_cstr(ddlCommand);
List *queryResultList = NIL;
StringInfo applyDDLCommand = makeStringInfo();
appendStringInfo(applyDDLCommand, WORKER_APPLY_SHARD_DDL_COMMAND,
shardId, escapedDDLCommand);
queryResultList = ExecuteRemoteQuery(nodeName, nodePort, applyDDLCommand);
if (queryResultList == NIL)
{
shardCreated = false;
break;
}
}
return shardCreated;
}
/*
* WorkerShardStats queries the worker node, and retrieves shard statistics that
* we assume have changed after new table data have been appended to the shard.
*/
static bool
WorkerShardStats(char *nodeName, uint32 nodePort, Oid relationId, char *shardName,
uint64 *shardLength, text **shardMinValue, text **shardMaxValue)
{
bool shardStatsOK = true;
PG_TRY();
{
uint64 tableSize = WorkerTableSize(nodeName, nodePort, shardName);
StringInfo minValue = WorkerPartitionValue(nodeName, nodePort, relationId,
shardName, SHARD_MIN_VALUE_QUERY);
StringInfo maxValue = WorkerPartitionValue(nodeName, nodePort, relationId,
shardName, SHARD_MAX_VALUE_QUERY);
(*shardLength) = tableSize;
(*shardMinValue) = cstring_to_text_with_len(minValue->data, minValue->len);
(*shardMaxValue) = cstring_to_text_with_len(maxValue->data, maxValue->len);
}
PG_CATCH();
{
shardStatsOK = false;
}
PG_END_TRY();
return shardStatsOK;
}
/*
* WorkerTableSize queries the worker node to extract the disk space used by the
* given relation. The function assumes the relation represents a regular table.
*/
static uint64
WorkerTableSize(char *nodeName, uint32 nodePort, char *tableName)
{
uint64 tableSize = 0;
List *queryResultList = NIL;
StringInfo tableSizeString = NULL;
char *tableSizeStringEnd = NULL;
StringInfo tableSizeQuery = makeStringInfo();
appendStringInfo(tableSizeQuery, SHARD_TABLE_SIZE_QUERY, tableName);
queryResultList = ExecuteRemoteQuery(nodeName, nodePort, tableSizeQuery);
if (queryResultList == NIL)
{
ereport(ERROR, (errmsg("could not receive table size from node "
"\"%s:%u\"", nodeName, nodePort)));
}
tableSizeString = (StringInfo) linitial(queryResultList);
errno = 0;
tableSize = strtoull(tableSizeString->data, &tableSizeStringEnd, 0);
if (errno != 0 || (*tableSizeStringEnd) != '\0')
{
ereport(ERROR, (errmsg("could not extract table size for table \"%s\"",
tableName)));
}
return tableSize;
}
/*
 * WorkerPartitionValue helps in extracting the partition column's min or max value
* from the given shard. For this, the function resolves the given distributed
* relation's partition column, connects to the worker node, and runs a select
* query on the given shard.
*/
static StringInfo
WorkerPartitionValue(char *nodeName, uint32 nodePort, Oid relationId,
char *shardName, char *selectQuery)
{
StringInfo partitionValue = NULL;
List *queryResultList = NIL;
uint32 unusedTableId = 1;
Var *partitionColumn = PartitionColumn(relationId, unusedTableId);
char *partitionColumnName = get_attname(relationId, partitionColumn->varattno);
StringInfo partitionValueQuery = makeStringInfo();
appendStringInfo(partitionValueQuery, selectQuery, partitionColumnName, shardName);
/*
* Note that the following call omits the partition column value's size, and
* simply casts the results to a (char *). If the user partitioned the table
* on a binary byte array, this approach fails and should be fixed.
*/
queryResultList = ExecuteRemoteQuery(nodeName, nodePort, partitionValueQuery);
if (queryResultList == NIL)
{
ereport(ERROR, (errmsg("could not receive shard min/max values from node "
"\"%s:%u\"", nodeName, nodePort)));
}
partitionValue = (StringInfo) linitial(queryResultList);
return partitionValue;
}

View File

@ -0,0 +1,27 @@
# ------------------------------------------
# Citus Database Worker Node Membership List
# ------------------------------------------
#
# This file contains the list of worker node names; these names are used both for
# initializing the worker nodes, and later for communicating with them. Records
# in this file are in the following format:
#
# HOSTNAME [PORT] [RACK]
#
# (The uppercase items must be replaced by actual values.)
#
# HOSTNAME specifies the DNS resolvable host name for the worker node. In test
# environments, localhost may be used to loopback to the current node.
#
# PORT specifies the port number to connect to at the specified host. This value
# is optional; in its absence, the port configuration value is used as the
# default.
#
# RACK specifies the host's network location for the purposes of performing rack
# aware data distribution. This value is optional; in its absence, a generic
# value is used as the default.
# Put your actual configuration here
# ----------------------------------
#
# HOSTNAME [PORT] [RACK]
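#
# For example (the hostnames below are hypothetical), two workers listening on
# the default port in two separate racks would be listed on lines of the form:
#
#   worker-101.example.com 5432 rack-a
#   worker-102.example.com 5432 rack-b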

View File

@ -0,0 +1,807 @@
/*-------------------------------------------------------------------------
*
* worker_node_manager.c
 *   Routines for reading worker nodes from the membership file, and allocating
* candidate nodes for shard placement.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "commands/dbcommands.h"
#include "distributed/worker_manager.h"
#include "distributed/multi_client_executor.h"
#include "libpq/hba.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "utils/guc.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
/* Config variables managed via guc.c */
char *WorkerListFileName; /* location of pg_worker_list.conf */
int MaxWorkerNodesTracked = 2048; /* determines worker node hash table size */
static HTAB *WorkerNodesHash = NULL; /* worker node hash in shared memory */
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
/* Local functions forward declarations */
static bool OddNumber(uint32 number);
static WorkerNode * FindRandomNodeNotInList(HTAB *WorkerNodesHash,
List *currentNodeList);
static bool ListMember(List *currentList, WorkerNode *workerNode);
static Size WorkerNodeShmemSize(void);
static void WorkerNodeShmemAndWorkerListInit(void);
static uint32 WorkerNodeHashCode(const void *key, Size keySize);
static int WorkerNodeCompare(const void *lhsKey, const void *rhsKey, Size keySize);
static List * ParseWorkerNodeFile(const char *workerNodeFilename);
static void ResetWorkerNodesHash(HTAB *WorkerNodesHash);
static bool WorkerNodeResponsive(const char *workerName, uint32 workerPort);
/* ------------------------------------------------------------
* Worker node selection functions follow
* ------------------------------------------------------------
*/
/*
* WorkerGetCandidateNode takes in a list of worker nodes, and then allocates a
* new worker node. The allocation is performed according to the following
* policy: if the list is empty, a random node is allocated; if the list has one
* node (or an odd number of nodes), the new node is allocated on a different
* rack than the first node; and if the list has two nodes (or an even number of
* nodes), the new node is allocated on the same rack as the first node, but is
* different from all the nodes in the list. This node allocation policy ensures
* that shard locality is maintained within a rack, but no single rack failure
* can result in data loss.
*
* Note that the function returns null if the worker membership list does not
* contain enough nodes to allocate a new worker node.
*/
WorkerNode *
WorkerGetCandidateNode(List *currentNodeList)
{
WorkerNode *workerNode = NULL;
bool wantSameRack = false;
uint32 tryCount = WORKER_RACK_TRIES;
uint32 tryIndex = 0;
/*
* We check if the shard has already been placed on all nodes known to us.
* This check is rather defensive, and has the drawback of performing a full
* scan over the worker node hash for determining the number of live nodes.
*/
uint32 currentNodeCount = list_length(currentNodeList);
uint32 liveNodeCount = WorkerGetLiveNodeCount();
if (currentNodeCount >= liveNodeCount)
{
return NULL;
}
/* if current node list is empty, randomly pick one node and return */
if (currentNodeCount == 0)
{
workerNode = FindRandomNodeNotInList(WorkerNodesHash, NIL);
return workerNode;
}
/*
* If the current list has an odd number of nodes (1, 3, 5, etc), we want to
* place the shard on a different rack than the first node's rack.
* Otherwise, we want to place the shard on the same rack as the first node.
*/
if (OddNumber(currentNodeCount))
{
wantSameRack = false;
}
else
{
wantSameRack = true;
}
/*
* We try to find a worker node that fits our rack-aware placement strategy.
* If after a predefined number of tries, we still cannot find such a node,
* we simply give up and return the last worker node we found.
*/
for (tryIndex = 0; tryIndex < tryCount; tryIndex++)
{
WorkerNode *firstNode = (WorkerNode *) linitial(currentNodeList);
char *firstRack = firstNode->workerRack;
char *workerRack = NULL;
bool sameRack = false;
workerNode = FindRandomNodeNotInList(WorkerNodesHash, currentNodeList);
workerRack = workerNode->workerRack;
sameRack = (strncmp(workerRack, firstRack, WORKER_LENGTH) == 0);
if ((sameRack && wantSameRack) || (!sameRack && !wantSameRack))
{
break;
}
}
return workerNode;
}
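/*
 * For illustration (hypothetical racks): with a replication factor of three
 * and a first placement on a node in rack-a, the policy above prefers a
 * different rack (say rack-b) for the second placement and rack-a again for
 * the third, so no single rack holds every replica.
 */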
/*
* WorkerGetRoundRobinCandidateNode takes in a list of worker nodes and returns
* a candidate worker node from that list. To select this node, this function
* uses the round-robin policy. An ideal round-robin implementation requires
* keeping shared state for shard placements; and we instead approximate our
* implementation by relying on the ever-increasing shardId. So, the first
* worker node selected will be the node at the (shardId MOD worker node count)
* index and the remaining candidate nodes will be the next nodes in the list.
*
* Note that the function returns null if the worker membership list does not
* contain enough nodes to place all replicas.
*/
WorkerNode *
WorkerGetRoundRobinCandidateNode(List *workerNodeList, uint64 shardId,
uint32 placementIndex)
{
uint32 workerNodeCount = list_length(workerNodeList);
WorkerNode *candidateNode = NULL;
if (placementIndex < workerNodeCount)
{
uint32 candidateNodeIndex = (shardId + placementIndex) % workerNodeCount;
candidateNode = (WorkerNode *) list_nth(workerNodeList, candidateNodeIndex);
}
return candidateNode;
}
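/*
 * Worked example (hypothetical values): with four worker nodes and shardId 7,
 * placement indexes 0, 1, and 2 map to list positions (7 + 0) % 4 = 3,
 * (7 + 1) % 4 = 0, and (7 + 2) % 4 = 1, respectively.
 */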
/*
* WorkerGetNodeWithName finds and returns a node from the membership list that
* has the given hostname. The function returns null if no such node exists.
*/
WorkerNode *
WorkerGetNodeWithName(const char *hostname)
{
WorkerNode *workerNode = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
while (workerNode != NULL)
{
if (workerNode->inWorkerFile)
{
int nameCompare = strncmp(workerNode->workerName, hostname, WORKER_LENGTH);
if (nameCompare == 0)
{
hash_seq_term(&status);
break;
}
}
workerNode = (WorkerNode *) hash_seq_search(&status);
}
return workerNode;
}
/* Returns the number of live nodes in the cluster. */
uint32
WorkerGetLiveNodeCount(void)
{
WorkerNode *workerNode = NULL;
uint32 liveWorkerCount = 0;
HASH_SEQ_STATUS status;
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
while (workerNode != NULL)
{
if (workerNode->inWorkerFile)
{
liveWorkerCount++;
}
workerNode = (WorkerNode *) hash_seq_search(&status);
}
return liveWorkerCount;
}
/* Inserts the live worker nodes to a list, and returns the list. */
List *
WorkerNodeList(void)
{
List *workerNodeList = NIL;
WorkerNode *workerNode = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
while (workerNode != NULL)
{
if (workerNode->inWorkerFile)
{
workerNodeList = lappend(workerNodeList, workerNode);
}
workerNode = (WorkerNode *) hash_seq_search(&status);
}
return workerNodeList;
}
/*
* WorkerNodeActive looks up a worker node with the given name and port number
* in the current membership list. If such a worker node exists, the function
* returns true.
*/
bool
WorkerNodeActive(const char *nodeName, uint32 nodePort)
{
bool workerNodeActive = false;
bool handleFound = false;
WorkerNode *workerNode = NULL;
void *hashKey = NULL;
WorkerNode *searchedNode = (WorkerNode *) palloc0(sizeof(WorkerNode));
strlcpy(searchedNode->workerName, nodeName, WORKER_LENGTH);
searchedNode->workerPort = nodePort;
hashKey = (void *) searchedNode;
workerNode = (WorkerNode *) hash_search(WorkerNodesHash, hashKey,
HASH_FIND, &handleFound);
if (workerNode != NULL)
{
if (workerNode->inWorkerFile)
{
workerNodeActive = true;
}
}
return workerNodeActive;
}
/* Returns true if given number is odd; returns false otherwise. */
static bool
OddNumber(uint32 number)
{
bool oddNumber = ((number % 2) == 1);
return oddNumber;
}
/*
* FindRandomNodeNotInList finds a random node from the shared hash that is not
* a member of the current node list. The caller is responsible for making the
* necessary node count checks to ensure that such a node exists.
*
* Note that this function has a selection bias towards nodes whose positions in
* the shared hash are sequentially adjacent to the positions of nodes that are
* in the current node list. This bias follows from our decision to first pick a
* random node in the hash, and if that node is a member of the current list, to
 * simply iterate to the next node in the hash. Overall, this approach trades
 * some selection bias for simplicity of design and bounded execution time.
*/
static WorkerNode *
FindRandomNodeNotInList(HTAB *WorkerNodesHash, List *currentNodeList)
{
WorkerNode *workerNode = NULL;
HASH_SEQ_STATUS status;
uint32 workerNodeCount = 0;
uint32 currentNodeCount = 0;
bool lookForWorkerNode = true;
uint32 workerPosition = 0;
uint32 workerIndex = 0;
workerNodeCount = hash_get_num_entries(WorkerNodesHash);
currentNodeCount = list_length(currentNodeList);
Assert(workerNodeCount > currentNodeCount);
/*
* We determine a random position within the worker hash between [1, N],
* assuming that the number of elements in the hash is N. We then get to
* this random position by iterating over the worker hash. Please note that
* the random seed has already been set by the postmaster when starting up.
*/
workerPosition = (random() % workerNodeCount) + 1;
hash_seq_init(&status, WorkerNodesHash);
for (workerIndex = 0; workerIndex < workerPosition; workerIndex++)
{
workerNode = (WorkerNode *) hash_seq_search(&status);
}
while (lookForWorkerNode)
{
bool listMember = ListMember(currentNodeList, workerNode);
if (workerNode->inWorkerFile && !listMember)
{
lookForWorkerNode = false;
}
else
{
/* iterate to the next worker node in the hash */
workerNode = (WorkerNode *) hash_seq_search(&status);
/* reached end of hash; start from the beginning */
if (workerNode == NULL)
{
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
}
}
}
/* we stopped scanning before completion; therefore clean up scan */
hash_seq_term(&status);
return workerNode;
}
/* Checks if given worker node is a member of the current list. */
static bool
ListMember(List *currentList, WorkerNode *workerNode)
{
bool listMember = false;
Size keySize = WORKER_LENGTH + sizeof(uint32);
ListCell *currentCell = NULL;
foreach(currentCell, currentList)
{
WorkerNode *currentNode = (WorkerNode *) lfirst(currentCell);
if (WorkerNodeCompare(workerNode, currentNode, keySize) == 0)
{
listMember = true;
}
}
return listMember;
}
/* ------------------------------------------------------------
* Worker node shared hash functions follow
* ------------------------------------------------------------
*/
/* Requests, at startup, the shared resources needed for worker node management. */
void
WorkerNodeRegister(void)
{
RequestAddinShmemSpace(WorkerNodeShmemSize());
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = WorkerNodeShmemAndWorkerListInit;
}
/* Estimates the shared memory size used for managing worker nodes. */
static Size
WorkerNodeShmemSize(void)
{
Size size = 0;
Size hashSize = 0;
hashSize = hash_estimate_size(MaxWorkerNodesTracked, sizeof(WorkerNode));
size = add_size(size, hashSize);
return size;
}
/* Initializes the shared memory used for managing worker nodes. */
static void
WorkerNodeShmemAndWorkerListInit(void)
{
HASHCTL info;
int hashFlags = 0;
long maxTableSize = 0;
long initTableSize = 0;
maxTableSize = (long) MaxWorkerNodesTracked;
initTableSize = maxTableSize / 8;
/*
* Allocate the control structure for the hash table that maps worker node
* name and port numbers (char[]:uint32) to general node membership and
* health information.
*/
memset(&info, 0, sizeof(info));
info.keysize = WORKER_LENGTH + sizeof(uint32);
info.entrysize = sizeof(WorkerNode);
info.hash = WorkerNodeHashCode;
info.match = WorkerNodeCompare;
hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
WorkerNodesHash = ShmemInitHash("Worker Node Hash",
initTableSize, maxTableSize,
&info, hashFlags);
/*
 * Load the initial contents of the worker node hash table from the
* configuration file.
*/
LoadWorkerNodeList(WorkerListFileName);
if (prev_shmem_startup_hook != NULL)
{
prev_shmem_startup_hook();
}
}
/*
* WorkerNodeHashCode computes the hash code for a worker node from the node's
* host name and port number. Nodes that only differ by their rack locations
* hash to the same value.
*/
static uint32
WorkerNodeHashCode(const void *key, Size keySize)
{
const WorkerNode *worker = (const WorkerNode *) key;
const char *workerName = worker->workerName;
const uint32 *workerPort = &(worker->workerPort);
/* standard hash function outlined in Effective Java, Item 8 */
uint32 result = 17;
result = 37 * result + string_hash(workerName, WORKER_LENGTH);
result = 37 * result + tag_hash(workerPort, sizeof(uint32));
return result;
}
/*
* CompareWorkerNodes compares two pointers to worker nodes using the exact
* same logic employed by WorkerNodeCompare.
*/
int
CompareWorkerNodes(const void *leftElement, const void *rightElement)
{
const void *leftWorker = *((const void **) leftElement);
const void *rightWorker = *((const void **) rightElement);
int compare = 0;
Size ignoredKeySize = 0;
compare = WorkerNodeCompare(leftWorker, rightWorker, ignoredKeySize);
return compare;
}
/*
* WorkerNodeCompare compares two worker nodes by their host name and port
* number. Two nodes that only differ by their rack locations are considered to
* be equal to each other.
*/
static int
WorkerNodeCompare(const void *lhsKey, const void *rhsKey, Size keySize)
{
const WorkerNode *workerLhs = (const WorkerNode *) lhsKey;
const WorkerNode *workerRhs = (const WorkerNode *) rhsKey;
int nameCompare = 0;
int portCompare = 0;
nameCompare = strncmp(workerLhs->workerName, workerRhs->workerName, WORKER_LENGTH);
if (nameCompare != 0)
{
return nameCompare;
}
portCompare = workerLhs->workerPort - workerRhs->workerPort;
return portCompare;
}
/*
 * LoadWorkerNodeList reads and parses the given membership file, and loads worker
 * nodes from this membership file into the shared hash. The function relies on
 * hba.c's tokenization method for parsing, and therefore the membership file
 * has the same syntax as other configuration files such as pg_hba.conf.
*
* Note that this function allows for reloading membership configuration files
* at runtime. When that happens, old worker nodes that do not appear in the
* file are marked as stale, but are still kept in the shared hash.
*/
void
LoadWorkerNodeList(const char *workerFilename)
{
List *workerList = NIL;
ListCell *workerCell = NULL;
uint32 workerCount = 0;
workerList = ParseWorkerNodeFile(workerFilename);
workerCount = list_length(workerList);
if (workerCount > MaxWorkerNodesTracked)
{
ereport(FATAL, (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("worker node count: %u exceeds max allowed value: %d",
workerCount, MaxWorkerNodesTracked)));
}
else
{
ereport(INFO, (errmsg("reading nodes from worker file: %s", workerFilename)));
}
/* before reading file's lines, reset worker node hash */
ResetWorkerNodesHash(WorkerNodesHash);
/* parse file lines */
foreach(workerCell, workerList)
{
WorkerNode *workerNode = NULL;
WorkerNode *parsedNode = lfirst(workerCell);
void *hashKey = NULL;
bool handleFound = false;
/*
* Search for the parsed worker node in the hash, and then insert parsed
* values. When searching, we make the hashKey point to the beginning of
* the parsed node; we previously set the key length and key comparison
* function to include both the node name and the port number.
*/
hashKey = (void *) parsedNode;
workerNode = (WorkerNode *) hash_search(WorkerNodesHash, hashKey,
HASH_ENTER, &handleFound);
if (handleFound)
{
/* display notification if worker node's rack changed */
char *oldWorkerRack = workerNode->workerRack;
char *newWorkerRack = parsedNode->workerRack;
if (strncmp(oldWorkerRack, newWorkerRack, WORKER_LENGTH) != 0)
{
ereport(INFO, (errmsg("worker node: \"%s:%u\" changed rack location",
workerNode->workerName, workerNode->workerPort)));
}
/* display warning if worker node already appeared in this file */
if (workerNode->inWorkerFile)
{
ereport(WARNING, (errmsg("multiple lines for worker node: \"%s:%u\"",
workerNode->workerName,
workerNode->workerPort)));
}
}
strlcpy(workerNode->workerName, parsedNode->workerName, WORKER_LENGTH);
strlcpy(workerNode->workerRack, parsedNode->workerRack, WORKER_LENGTH);
workerNode->workerPort = parsedNode->workerPort;
workerNode->inWorkerFile = parsedNode->inWorkerFile;
pfree(parsedNode);
}
}
/*
* ParseWorkerNodeFile opens the given membership file, and parses the node
* name, node port, and rack fields from each of its lines.
*/
static List *
ParseWorkerNodeFile(const char *workerNodeFilename)
{
FILE *workerFileStream = NULL;
List *workerNodeList = NIL;
char workerNodeLine[MAXPGPATH];
char *workerFilePath = make_absolute_path(workerNodeFilename);
char *workerPatternTemplate = "%%%u[^# \t]%%*[ \t]%%%u[^# \t]%%*[ \t]%%%u[^# \t]";
char workerLinePattern[1024];
const int workerNameIndex = 0;
const int workerPortIndex = 1;
memset(workerLinePattern, '\0', sizeof(workerLinePattern));
workerFileStream = AllocateFile(workerFilePath, PG_BINARY_R);
if (workerFileStream == NULL)
{
if (errno == ENOENT)
{
ereport(DEBUG1, (errmsg("worker list file located at \"%s\" is not present",
workerFilePath)));
}
else
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not open worker list file \"%s\": %m",
workerFilePath)));
}
return NIL;
}
/* build pattern to contain node name length limit */
snprintf(workerLinePattern, sizeof(workerLinePattern), workerPatternTemplate,
WORKER_LENGTH, MAX_PORT_LENGTH, WORKER_LENGTH);
while (fgets(workerNodeLine, sizeof(workerNodeLine), workerFileStream) != NULL)
{
const int workerLineLength = strnlen(workerNodeLine, MAXPGPATH);
WorkerNode *workerNode = NULL;
char *linePointer = NULL;
int32 nodePort = PostPortNumber; /* default port number */
int fieldCount = 0;
bool lineIsInvalid = false;
char nodeName[WORKER_LENGTH + 1];
char nodeRack[WORKER_LENGTH + 1];
char nodePortString[MAX_PORT_LENGTH + 1];
memset(nodeName, '\0', sizeof(nodeName));
strlcpy(nodeRack, WORKER_DEFAULT_RACK, sizeof(nodeRack));
memset(nodePortString, '\0', sizeof(nodePortString));
if (workerLineLength == MAXPGPATH - 1)
{
ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("worker node list file line exceeds the maximum "
"length of %d", MAXPGPATH)));
}
/* trim trailing newlines preserved by fgets, if any */
linePointer = workerNodeLine + workerLineLength - 1;
while (linePointer >= workerNodeLine &&
(*linePointer == '\n' || *linePointer == '\r'))
{
*linePointer-- = '\0';
}
/* skip leading whitespace */
for (linePointer = workerNodeLine; *linePointer; linePointer++)
{
if (!isspace((unsigned char) *linePointer))
{
break;
}
}
/* if the entire line is whitespace or a comment, skip it */
if (*linePointer == '\0' || *linePointer == '#')
{
continue;
}
/* parse line; node name is required, but port and rack are optional */
fieldCount = sscanf(linePointer, workerLinePattern,
nodeName, nodePortString, nodeRack);
/* adjust field count for zero based indexes */
fieldCount--;
/* raise error if no fields were assigned */
if (fieldCount < workerNameIndex)
{
lineIsInvalid = true;
}
/* no special treatment for nodeName: already parsed by sscanf */
/* if a second token was specified, convert to integer port */
if (fieldCount >= workerPortIndex)
{
char *nodePortEnd = NULL;
errno = 0;
nodePort = strtol(nodePortString, &nodePortEnd, 10);
if (errno != 0 || (*nodePortEnd) != '\0' || nodePort <= 0)
{
lineIsInvalid = true;
}
}
if (lineIsInvalid)
{
ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("could not parse worker node line: %s",
workerNodeLine),
errhint("Lines in the worker node file must contain a valid "
"node name and, optionally, a positive port number. "
"Comments begin with a '#' character and extend to "
"the end of their line.")));
}
/* allocate worker node structure and set fields */
workerNode = (WorkerNode *) palloc0(sizeof(WorkerNode));
strlcpy(workerNode->workerName, nodeName, WORKER_LENGTH + 1);
strlcpy(workerNode->workerRack, nodeRack, WORKER_LENGTH + 1);
workerNode->workerPort = nodePort;
workerNode->inWorkerFile = true;
workerNodeList = lappend(workerNodeList, workerNode);
}
FreeFile(workerFileStream);
free(workerFilePath);
return workerNodeList;
}
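For reference, a hypothetical membership file this parser accepts: the node name is mandatory, the port defaults to PostPortNumber when omitted, the rack defaults to WORKER_DEFAULT_RACK, and '#' starts a comment that runs to the end of the line.

# hostname [port] [rack]
worker-101                 # default port and rack
worker-102  5433           # explicit port, default rack
worker-103  5433  rack-2   # explicit port and rack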
/* Marks all worker nodes in the shared hash as stale. */
static void
ResetWorkerNodesHash(HTAB *WorkerNodesHash)
{
WorkerNode *workerNode = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
while (workerNode != NULL)
{
workerNode->inWorkerFile = false;
workerNode = (WorkerNode *) hash_seq_search(&status);
}
}
/* ResponsiveWorkerNodeList returns a list of all responsive worker nodes */
List *
ResponsiveWorkerNodeList(void)
{
List *responsiveWorkerNodeList = NULL;
ListCell *workerNodeCell = NULL;
List *workerNodeList = WorkerNodeList();
foreach(workerNodeCell, workerNodeList)
{
bool workerNodeResponsive = false;
WorkerNode *workerNode = lfirst(workerNodeCell);
workerNodeResponsive = WorkerNodeResponsive(workerNode->workerName,
workerNode->workerPort);
if (workerNodeResponsive)
{
responsiveWorkerNodeList = lappend(responsiveWorkerNodeList, workerNode);
}
}
return responsiveWorkerNodeList;
}
/*
* WorkerNodeResponsive returns true if the given worker node is responsive.
* Otherwise, it returns false.
*
* This function is based on worker_node_responsive function present in the
* shard rebalancer.
*/
static bool
WorkerNodeResponsive(const char *workerName, uint32 workerPort)
{
bool workerNodeResponsive = false;
const char *databaseName = get_database_name(MyDatabaseId);
int connectionId = MultiClientConnect(workerName, workerPort, databaseName);
if (connectionId != INVALID_CONNECTION_ID)
{
MultiClientDisconnect(connectionId);
workerNodeResponsive = true;
}
return workerNodeResponsive;
}

View File

@ -0,0 +1,649 @@
/*-------------------------------------------------------------------------
*
* modify_planner.c
*
* This file contains functions to plan distributed table modifications.
*
* Copyright (c) 2014-2016, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include <stddef.h>
#if (PG_VERSION_NUM >= 90500 && PG_VERSION_NUM < 90600)
#include "access/stratnum.h"
#else
#include "access/skey.h"
#endif
#include "distributed/citus_nodes.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/modify_planner.h" /* IWYU pragma: keep */
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/listutils.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/relay_utility.h"
#include "distributed/resource_lock.h"
#include "executor/execdesc.h"
#include "lib/stringinfo.h"
#if (PG_VERSION_NUM >= 90500)
#include "nodes/makefuncs.h"
#endif
#include "nodes/nodeFuncs.h"
#include "nodes/nodes.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "optimizer/clauses.h"
#include "parser/parsetree.h"
#include "storage/lock.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/relcache.h"
/* planner functions forward declarations */
static void ErrorIfQueryNotSupported(Query *queryTree);
static Task * DistributedModifyTask(Query *query);
#if (PG_VERSION_NUM >= 90500)
static OnConflictExpr * RebuildOnConflict(Oid relationId,
OnConflictExpr *originalOnConflict);
#endif
static Job * DistributedModifyJob(Query *query, Task *modifyTask);
static List * QueryRestrictList(Query *query);
static ShardInterval * DistributedModifyShardInterval(Query *query);
static Oid ExtractFirstDistributedTableId(Query *query);
static Const * ExtractPartitionValue(Query *query, Var *partitionColumn);
/*
* MultiModifyPlanCreate creates the distributed plan for execution of a
* distributed table modification. It expects that the provided MultiTreeRoot
* is actually a Query object, which it uses directly to produce a MultiPlan.
*/
MultiPlan *
MultiModifyPlanCreate(Query *query)
{
Task *modifyTask = NULL;
Job *modifyJob = NULL;
MultiPlan *multiPlan = NULL;
ErrorIfQueryNotSupported(query);
modifyTask = DistributedModifyTask(query);
modifyJob = DistributedModifyJob(query, modifyTask);
multiPlan = CitusMakeNode(MultiPlan);
multiPlan->workerJob = modifyJob;
multiPlan->masterQuery = NULL;
multiPlan->masterTableName = NULL;
return multiPlan;
}
/*
* ErrorIfQueryNotSupported checks if the query contains unsupported features,
* and errors out if it does.
*/
static void
ErrorIfQueryNotSupported(Query *queryTree)
{
Oid distributedTableId = ExtractFirstDistributedTableId(queryTree);
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
char partitionMethod = PartitionMethod(distributedTableId);
List *rangeTableList = NIL;
ListCell *rangeTableCell = NULL;
bool hasValuesScan = false;
uint32 queryTableCount = 0;
bool hasNonConstTargetEntryExprs = false;
bool hasNonConstQualExprs = false;
bool specifiesPartitionValue = false;
#if (PG_VERSION_NUM >= 90500)
ListCell *setTargetCell = NULL;
List *onConflictSet = NIL;
Node *arbiterWhere = NULL;
Node *onConflictWhere = NULL;
#endif
CmdType commandType = queryTree->commandType;
Assert(commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE);
if (!(partitionMethod == DISTRIBUTE_BY_HASH ||
partitionMethod == DISTRIBUTE_BY_RANGE))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Only hash- or range-partitioned tables may be the "
"target of distributed modifications")));
}
/*
* Reject subqueries which are in SELECT or WHERE clause.
* Queries which include subqueries in FROM clauses are rejected below.
*/
if (queryTree->hasSubLinks == true)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Subqueries are not supported in distributed"
" modifications.")));
}
/* reject queries which include CommonTableExpr */
if (queryTree->cteList != NIL)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Common table expressions are not supported in"
" distributed modifications.")));
}
/* extract range table entries */
ExtractRangeTableEntryWalker((Node *) queryTree, &rangeTableList);
foreach(rangeTableCell, rangeTableList)
{
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);
if (rangeTableEntry->rtekind == RTE_RELATION)
{
queryTableCount++;
}
else if (rangeTableEntry->rtekind == RTE_VALUES)
{
hasValuesScan = true;
}
else
{
/*
* Error out for rangeTableEntries that we do not support.
* We do not explicitly specify "in FROM clause" in the error detail
* for the features that we do not support at all (SUBQUERY, JOIN).
* We do not need to check for RTE_CTE because all common table expressions
* are rejected above with queryTree->cteList check.
*/
char *rangeTableEntryErrorDetail = NULL;
if (rangeTableEntry->rtekind == RTE_SUBQUERY)
{
rangeTableEntryErrorDetail = "Subqueries are not supported in"
" distributed modifications.";
}
else if (rangeTableEntry->rtekind == RTE_JOIN)
{
rangeTableEntryErrorDetail = "Joins are not supported in distributed"
" modifications.";
}
else if (rangeTableEntry->rtekind == RTE_FUNCTION)
{
rangeTableEntryErrorDetail = "Functions must not appear in the FROM"
" clause of a distributed modifications.";
}
else
{
rangeTableEntryErrorDetail = "Unrecognized range table entry.";
}
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modifications"),
errdetail("%s", rangeTableEntryErrorDetail)));
}
}
/*
* Reject queries which involve joins. Note that UPSERTs are an exception:
* a query like "INSERT INTO table_name ... ON CONFLICT (col) DO UPDATE SET
* other_col = ''" contains two range table entries, and we have to allow it.
*/
if (commandType != CMD_INSERT && queryTableCount != 1)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Joins are not supported in distributed "
"modifications.")));
}
/* reject queries which involve multi-row inserts */
if (hasValuesScan)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Multi-row INSERTs to distributed tables are not "
"supported.")));
}
/* reject queries with a returning list */
if (list_length(queryTree->returningList) > 0)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("RETURNING clauses are not supported in distributed "
"modifications.")));
}
if (commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE)
{
FromExpr *joinTree = NULL;
ListCell *targetEntryCell = NULL;
foreach(targetEntryCell, queryTree->targetList)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell);
/* skip resjunk entries: UPDATE adds some for ctid, etc. */
if (targetEntry->resjunk)
{
continue;
}
if (!IsA(targetEntry->expr, Const))
{
hasNonConstTargetEntryExprs = true;
}
if (commandType == CMD_UPDATE &&
targetEntry->resno == partitionColumn->varattno)
{
specifiesPartitionValue = true;
}
}
joinTree = queryTree->jointree;
if (joinTree != NULL && contain_mutable_functions(joinTree->quals))
{
hasNonConstQualExprs = true;
}
}
#if (PG_VERSION_NUM >= 90500)
if (commandType == CMD_INSERT && queryTree->onConflict != NULL)
{
onConflictSet = queryTree->onConflict->onConflictSet;
arbiterWhere = queryTree->onConflict->arbiterWhere;
onConflictWhere = queryTree->onConflict->onConflictWhere;
}
/*
* onConflictSet is expanded via expand_targetlist() by the standard planner.
* This ends up adding all the columns to the onConflictSet even if the user
* does not explicitly state the columns in the query.
*
* The following loop simply allows "DO UPDATE SET part_col = table.part_col"
* types of elements in the target list, which are added by expand_targetlist().
* Any other attempt to update partition column value is forbidden.
*/
foreach(setTargetCell, onConflictSet)
{
TargetEntry *setTargetEntry = (TargetEntry *) lfirst(setTargetCell);
if (setTargetEntry->resno == partitionColumn->varattno)
{
Expr *setExpr = setTargetEntry->expr;
if (IsA(setExpr, Var) &&
((Var *) setExpr)->varattno == partitionColumn->varattno)
{
specifiesPartitionValue = false;
}
else
{
specifiesPartitionValue = true;
}
}
else
{
/*
* Similarly, allow "DO UPDATE SET col_1 = table.col_1" types of
* target list elements. Note that, the following check allows
* "DO UPDATE SET col_1 = table.col_2", which is not harmful.
*/
if (IsA(setTargetEntry->expr, Var))
{
continue;
}
else if (contain_mutable_functions((Node *) setTargetEntry->expr))
{
hasNonConstTargetEntryExprs = true;
}
}
}
/* error if either arbiter or on conflict WHERE contains a mutable function */
if (contain_mutable_functions((Node *) arbiterWhere) ||
contain_mutable_functions((Node *) onConflictWhere))
{
hasNonConstQualExprs = true;
}
#endif
if (hasNonConstTargetEntryExprs || hasNonConstQualExprs)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot plan sharded modification containing values "
"which are not constants or constant expressions")));
}
if (specifiesPartitionValue)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("modifying the partition value of rows is not allowed")));
}
}
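To make the checks above concrete, consider a few hypothetical statements against a table t distributed by column key (all names invented for illustration):

/*
 * UPDATE t SET value = 5 WHERE key = 10;         -- passes: constant target
 *                                                   entry, constant qual
 * UPDATE t SET key = 5 WHERE key = 10;           -- rejected: modifies the
 *                                                   partition column value
 * UPDATE t SET value = random() WHERE key = 10;  -- rejected: non-constant
 *                                                   target entry expression
 * INSERT INTO t VALUES (1, 'a'), (2, 'b');       -- rejected: multi-row
 *                                                   INSERT (VALUES scan)
 * DELETE FROM t USING u WHERE t.key = u.key;     -- rejected: join
 */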
/*
* DistributedModifyTask builds a Task to represent a modification performed by
* the provided query against the provided shard interval. This task contains
* shard-extended deparsed SQL to be run during execution.
*/
static Task *
DistributedModifyTask(Query *query)
{
ShardInterval *shardInterval = DistributedModifyShardInterval(query);
uint64 shardId = shardInterval->shardId;
FromExpr *joinTree = NULL;
StringInfo queryString = makeStringInfo();
Task *modifyTask = NULL;
bool upsertQuery = false;
/* grab shared metadata lock to stop concurrent placement additions */
LockShardDistributionMetadata(shardId, ShareLock);
/*
* Convert the qualifiers to an explicitly and'd clause, which is needed
* before we deparse the query. This applies to SELECT, UPDATE and
* DELETE statements.
*/
joinTree = query->jointree;
if ((joinTree != NULL) && (joinTree->quals != NULL))
{
Node *whereClause = joinTree->quals;
if (IsA(whereClause, List))
{
joinTree->quals = (Node *) make_ands_explicit((List *) whereClause);
}
}
#if (PG_VERSION_NUM >= 90500)
if (query->onConflict != NULL)
{
RangeTblEntry *rangeTableEntry = NULL;
Oid relationId = shardInterval->relationId;
/* set the flag */
upsertQuery = true;
/* setting an alias simplifies deparsing of UPSERTs */
rangeTableEntry = linitial(query->rtable);
if (rangeTableEntry->alias == NULL)
{
Alias *alias = makeAlias(UPSERT_ALIAS, NIL);
rangeTableEntry->alias = alias;
}
/* some fields in the onConflict expression need to be updated for deparsing */
query->onConflict = RebuildOnConflict(relationId, query->onConflict);
}
#else
/* always set to false for PG_VERSION_NUM < 90500 */
upsertQuery = false;
#endif
deparse_shard_query(query, shardInterval->relationId, shardId, queryString);
ereport(DEBUG4, (errmsg("distributed statement: %s", queryString->data)));
modifyTask = CitusMakeNode(Task);
modifyTask->jobId = INVALID_JOB_ID;
modifyTask->taskId = INVALID_TASK_ID;
modifyTask->taskType = MODIFY_TASK;
modifyTask->queryString = queryString->data;
modifyTask->anchorShardId = shardId;
modifyTask->dependedTaskList = NIL;
modifyTask->upsertQuery = upsertQuery;
return modifyTask;
}
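To illustrate the shard extension performed here, a hypothetical example (table name and shard identifier invented; the deparsed form follows the name_shardId convention used throughout this commit):

/*
 * input query:   INSERT INTO github_events (event_id) VALUES (1);
 * anchor shard:  102008
 * task query:    INSERT INTO github_events_102008 (event_id) VALUES (1);
 */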
#if (PG_VERSION_NUM >= 90500)
/*
* RebuildOnConflict rebuilds OnConflictExpr for correct deparsing. The function
* makes WHERE clause elements explicit and filters dropped columns
* from the target list.
*/
static OnConflictExpr *
RebuildOnConflict(Oid relationId, OnConflictExpr *originalOnConflict)
{
OnConflictExpr *updatedOnConflict = copyObject(originalOnConflict);
Node *onConflictWhere = updatedOnConflict->onConflictWhere;
List *onConflictSet = updatedOnConflict->onConflictSet;
TupleDesc distributedRelationDesc = NULL;
ListCell *targetEntryCell = NULL;
List *filteredOnConflictSet = NIL;
Form_pg_attribute *tableAttributes = NULL;
Relation distributedRelation = RelationIdGetRelation(relationId);
/* Convert onConflictWhere qualifiers to an explicitly and'd clause */
updatedOnConflict->onConflictWhere =
(Node *) make_ands_explicit((List *) onConflictWhere);
/*
* Here we handle dropped columns on the distributed table. onConflictSet
* includes the table attributes even if they are dropped, since it is
* expanded via expand_targetlist() by the standard planner.
*/
/* get the relation tuple descriptor and table attributes */
distributedRelationDesc = RelationGetDescr(distributedRelation);
tableAttributes = distributedRelationDesc->attrs;
foreach(targetEntryCell, onConflictSet)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell);
FormData_pg_attribute *tableAttribute = tableAttributes[targetEntry->resno - 1];
/* skip dropped columns */
if (tableAttribute->attisdropped)
{
continue;
}
/* we only want to deparse non-dropped columns */
filteredOnConflictSet = lappend(filteredOnConflictSet, targetEntry);
}
/* close distributedRelation to prevent leaks */
RelationClose(distributedRelation);
/* set onConflictSet again with the filtered list */
updatedOnConflict->onConflictSet = filteredOnConflictSet;
return updatedOnConflict;
}
#endif
/*
* DistributedModifyJob creates a Job for the specified query to execute the
* provided modification task. Modification task placements are produced using
* the "first-replica" algorithm, except modifications run against all matching
* placements rather than just the first successful one.
*/
Job *
DistributedModifyJob(Query *query, Task *modifyTask)
{
Job *modifyJob = NULL;
List *taskList = FirstReplicaAssignTaskList(list_make1(modifyTask));
modifyJob = CitusMakeNode(Job);
modifyJob->dependedJobList = NIL;
modifyJob->jobId = INVALID_JOB_ID;
modifyJob->subqueryPushdown = false;
modifyJob->jobQuery = query;
modifyJob->taskList = taskList;
return modifyJob;
}
/*
* DistributedModifyShardInterval determines the single shard targeted by a
* provided distributed modification command. If no matching shards exist, or
* if the modification targets more than one one shard, this function raises
* an error.
*/
static ShardInterval *
DistributedModifyShardInterval(Query *query)
{
List *restrictClauseList = NIL;
List *prunedShardList = NIL;
Index tableId = 1;
Oid distributedTableId = ExtractFirstDistributedTableId(query);
List *shardIntervalList = NIL;
/* error out if no shards exist for the table */
shardIntervalList = LoadShardIntervalList(distributedTableId);
if (shardIntervalList == NIL)
{
char *relationName = get_rel_name(distributedTableId);
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("could not find any shards for modification"),
errdetail("No shards exist for distributed table \"%s\".",
relationName),
errhint("Run master_create_worker_shards to create shards "
"and try again.")));
}
restrictClauseList = QueryRestrictList(query);
prunedShardList = PruneShardList(distributedTableId, tableId, restrictClauseList,
shardIntervalList);
if (list_length(prunedShardList) != 1)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("distributed modifications must target exactly one "
"shard")));
}
return (ShardInterval *) linitial(prunedShardList);
}
/*
* QueryRestrictList returns the restriction clauses for the query. For UPDATE
* and DELETE statements these are the where-clause expressions. For INSERT
* statements we build an equality clause based on the partition-column and its
* supplied insert value.
*/
static List *
QueryRestrictList(Query *query)
{
List *queryRestrictList = NIL;
CmdType commandType = query->commandType;
if (commandType == CMD_INSERT)
{
/* build equality expression based on partition column value for row */
Oid distributedTableId = ExtractFirstDistributedTableId(query);
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
Const *partitionValue = ExtractPartitionValue(query, partitionColumn);
OpExpr *equalityExpr = MakeOpExpression(partitionColumn, BTEqualStrategyNumber);
Node *rightOp = get_rightop((Expr *) equalityExpr);
Const *rightConst = (Const *) rightOp;
Assert(IsA(rightOp, Const));
rightConst->constvalue = partitionValue->constvalue;
rightConst->constisnull = partitionValue->constisnull;
rightConst->constbyval = partitionValue->constbyval;
queryRestrictList = list_make1(equalityExpr);
}
else if (commandType == CMD_UPDATE || commandType == CMD_DELETE)
{
queryRestrictList = WhereClauseList(query->jointree);
}
return queryRestrictList;
}
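A worked illustration of the two branches above, with a hypothetical table t distributed by column key:

/*
 * INSERT INTO t (key, value) VALUES (5, 'x');
 *     restriction list: { key = 5 }          -- equality synthesized from
 *                                               the partition column value
 *
 * DELETE FROM t WHERE key = 5 AND value = 'x';
 *     restriction list: { key = 5, value = 'x' }  -- the WHERE clause as-is
 */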
/*
* ExtractFirstDistributedTableId takes a given query, and finds the relationId
* for the first distributed table in that query. If the function cannot find a
* distributed table, it returns InvalidOid.
*/
static Oid
ExtractFirstDistributedTableId(Query *query)
{
List *rangeTableList = NIL;
ListCell *rangeTableCell = NULL;
Oid distributedTableId = InvalidOid;
/* extract range table entries */
ExtractRangeTableEntryWalker((Node *) query, &rangeTableList);
foreach(rangeTableCell, rangeTableList)
{
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);
if (IsDistributedTable(rangeTableEntry->relid))
{
distributedTableId = rangeTableEntry->relid;
break;
}
}
return distributedTableId;
}
/*
* ExtractPartitionValue extracts the partition column value from the target
* of a modification command. If a partition value is missing altogether or is
* NULL, this function throws an error.
*/
static Const *
ExtractPartitionValue(Query *query, Var *partitionColumn)
{
Const *partitionValue = NULL;
TargetEntry *targetEntry = get_tle_by_resno(query->targetList,
partitionColumn->varattno);
if (targetEntry != NULL)
{
Assert(IsA(targetEntry->expr, Const));
partitionValue = (Const *) targetEntry->expr;
}
if (partitionValue == NULL || partitionValue->constisnull)
{
ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("cannot plan INSERT using row with NULL value "
"in partition column")));
}
return partitionValue;
}

View File

@ -0,0 +1,108 @@
/*-------------------------------------------------------------------------
*
* multi_explain.c
* CitusDB explain support.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "commands/prepare.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/multi_explain.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_physical_planner.h"
#include "nodes/print.h"
#include "optimizer/planner.h"
#include "tcop/tcopprot.h"
/* Config variables that enable printing distributed query plans */
bool ExplainMultiLogicalPlan = false;
bool ExplainMultiPhysicalPlan = false;
/*
* MultiExplainOneQuery takes the given query, and checks if the query is local
* or distributed. If the query is local, the function runs the standard explain
* logic. If the query is distributed, the function looks up configuration and
* prints out the distributed logical and physical plans as appropriate.
*/
void
MultiExplainOneQuery(Query *query, IntoClause *into, ExplainState *es,
const char *queryString, ParamListInfo params)
{
MultiTreeRoot *multiTree = NULL;
MultiPlan *multiPlan = NULL;
Query *queryCopy = NULL;
CmdType commandType = query->commandType;
/* if local query, run the standard explain and return */
bool localQuery = !NeedsDistributedPlanning(query);
if (localQuery)
{
PlannedStmt *plan = NULL;
instr_time planstart;
instr_time planduration;
INSTR_TIME_SET_CURRENT(planstart);
/* plan the query */
plan = pg_plan_query(query, 0, params);
INSTR_TIME_SET_CURRENT(planduration);
INSTR_TIME_SUBTRACT(planduration, planstart);
/* run it (if needed) and produce output */
ExplainOnePlan(plan, into, es, queryString, params, &planduration);
return;
}
/* error out early if the query is a modification */
if (commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot show execution plan for distributed modification"),
errdetail("EXPLAIN commands are unsupported for distributed "
"modifications.")));
}
/* call standard planner to modify the query structure before multi planning */
standard_planner(query, 0, params);
queryCopy = copyObject(query);
/* create the logical and physical plan */
multiTree = MultiLogicalPlanCreate(queryCopy);
MultiLogicalPlanOptimize(multiTree);
multiPlan = MultiPhysicalPlanCreate(multiTree);
if (ExplainMultiLogicalPlan)
{
char *logicalPlanString = CitusNodeToString(multiTree);
char *formattedPlanString = pretty_format_node_dump(logicalPlanString);
appendStringInfo(es->str, "logical plan:\n");
appendStringInfo(es->str, "%s\n", formattedPlanString);
}
if (ExplainMultiPhysicalPlan)
{
char *physicalPlanString = CitusNodeToString(multiPlan);
char *formattedPlanString = pretty_format_node_dump(physicalPlanString);
appendStringInfo(es->str, "physical plan:\n");
appendStringInfo(es->str, "%s\n", formattedPlanString);
}
/* if explain printing isn't enabled, print error only after planning */
if (!ExplainMultiLogicalPlan && !ExplainMultiPhysicalPlan)
{
appendStringInfo(es->str, "explain statements for distributed queries ");
appendStringInfo(es->str, "are currently unsupported\n");
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,378 @@
/*-------------------------------------------------------------------------
*
* multi_master_planner.c
* Routines for building create table and select into table statements on the
* master node.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "distributed/multi_master_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/worker_protocol.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/planmain.h"
#include "optimizer/tlist.h"
#include "optimizer/var.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/syscache.h"
/*
* MasterTargetList uses the given worker target list's expressions, and creates
* a target list for the master node. This master target list keeps the
* temporary table's columns on the master node.
*/
static List *
MasterTargetList(List *workerTargetList)
{
List *masterTargetList = NIL;
const Index tableId = 1;
AttrNumber columnId = 1;
ListCell *workerTargetCell = NULL;
foreach(workerTargetCell, workerTargetList)
{
TargetEntry *workerTargetEntry = (TargetEntry *) lfirst(workerTargetCell);
TargetEntry *masterTargetEntry = copyObject(workerTargetEntry);
Var *masterColumn = makeVarFromTargetEntry(tableId, workerTargetEntry);
masterColumn->varattno = columnId;
masterColumn->varoattno = columnId;
columnId++;
/*
* The master target entry has two pieces to it. The first piece is the
* target entry's expression, which we set to the newly created column.
* The second piece is sort and group clauses that we implicitly copy
* from the worker target entry. Note that any changes to worker target
* entry's sort and group clauses will *break* us here.
*/
masterTargetEntry->expr = (Expr *) masterColumn;
masterTargetList = lappend(masterTargetList, masterTargetEntry);
}
return masterTargetList;
}
/*
* BuildCreateStatement builds and returns the executable create statement for
* creating a temporary table on the master node. This
* function obtains the needed column type information from the target list.
*/
static CreateStmt *
BuildCreateStatement(char *masterTableName, List *masterTargetList,
List *masterColumnNameList)
{
CreateStmt *createStatement = NULL;
RangeVar *relation = NULL;
char *relationName = NULL;
List *columnTypeList = NIL;
List *columnDefinitionList = NIL;
ListCell *masterTargetCell = NULL;
/* build rangevar object for temporary table */
relationName = masterTableName;
relation = makeRangeVar(NULL, relationName, -1);
relation->relpersistence = RELPERSISTENCE_TEMP;
/* build the list of column types as cstrings */
foreach(masterTargetCell, masterTargetList)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(masterTargetCell);
Var *column = (Var *) targetEntry->expr;
Oid columnTypeId = exprType((Node *) column);
int32 columnTypeMod = exprTypmod((Node *) column);
char *columnTypeName = format_type_with_typemod(columnTypeId, columnTypeMod);
columnTypeList = lappend(columnTypeList, columnTypeName);
}
/* build the column definition list */
columnDefinitionList = ColumnDefinitionList(masterColumnNameList, columnTypeList);
/* build the create statement */
createStatement = CreateStatement(relation, columnDefinitionList);
return createStatement;
}
/*
* BuildAggregatePlan creates and returns an aggregate plan. This aggregate plan
* builds aggregation and grouping operators (if any) that are to be executed on
* the master node.
*/
static Agg *
BuildAggregatePlan(Query *masterQuery, Plan *subPlan)
{
Agg *aggregatePlan = NULL;
AggStrategy aggregateStrategy = AGG_PLAIN;
AggClauseCosts aggregateCosts;
AttrNumber *groupColumnIdArray = NULL;
List *aggregateTargetList = NIL;
List *groupColumnList = NIL;
List *columnList = NIL;
ListCell *columnCell = NULL;
Oid *groupColumnOpArray = NULL;
uint32 groupColumnCount = 0;
const long rowEstimate = 10;
/* assert that we need to build an aggregate plan */
Assert(masterQuery->hasAggs || masterQuery->groupClause);
aggregateTargetList = masterQuery->targetList;
count_agg_clauses(NULL, (Node *) aggregateTargetList, &aggregateCosts);
/*
* For upper level plans above the sequential scan, the planner expects the
* table id (varno) to be set to OUTER_VAR.
*/
columnList = pull_var_clause_default((Node *) aggregateTargetList);
foreach(columnCell, columnList)
{
Var *column = (Var *) lfirst(columnCell);
column->varno = OUTER_VAR;
}
groupColumnList = masterQuery->groupClause;
groupColumnCount = list_length(groupColumnList);
/* if we have grouping, then initialize appropriate information */
if (groupColumnCount > 0)
{
if (!grouping_is_hashable(groupColumnList))
{
ereport(ERROR, (errmsg("grouped column list cannot be hashed")));
}
/* switch to hashed aggregate strategy to allow grouping */
aggregateStrategy = AGG_HASHED;
/* get column indexes that are being grouped */
groupColumnIdArray = extract_grouping_cols(groupColumnList, subPlan->targetlist);
groupColumnOpArray = extract_grouping_ops(groupColumnList);
}
/* finally create the plan */
#if (PG_VERSION_NUM >= 90500)
aggregatePlan = make_agg(NULL, aggregateTargetList, NIL, aggregateStrategy,
&aggregateCosts, groupColumnCount, groupColumnIdArray,
groupColumnOpArray, NIL, rowEstimate, subPlan);
#else
aggregatePlan = make_agg(NULL, aggregateTargetList, NIL, aggregateStrategy,
&aggregateCosts, groupColumnCount, groupColumnIdArray,
groupColumnOpArray, rowEstimate, subPlan);
#endif
return aggregatePlan;
}
/*
* BuildSelectStatement builds the final select statement to run on the master
* node, before returning results to the user. The function first builds a scan
* statement for all results fetched to the master, and layers aggregation, sort
* and limit plans on top of the scan statement if necessary.
*/
static PlannedStmt *
BuildSelectStatement(Query *masterQuery, char *masterTableName,
List *masterTargetList)
{
PlannedStmt *selectStatement = NULL;
RangeTblEntry *rangeTableEntry = NULL;
RangeTblEntry *queryRangeTableEntry = NULL;
SeqScan *sequentialScan = NULL;
Agg *aggregationPlan = NULL;
Plan *topLevelPlan = NULL;
/* (1) make PlannedStmt and set basic information */
selectStatement = makeNode(PlannedStmt);
selectStatement->canSetTag = true;
selectStatement->relationOids = NIL; /* to be filled in exec_Start */
selectStatement->commandType = CMD_SELECT;
/* prepare the range table entry for our temporary table */
Assert(list_length(masterQuery->rtable) == 1);
queryRangeTableEntry = (RangeTblEntry *) linitial(masterQuery->rtable);
rangeTableEntry = copyObject(queryRangeTableEntry);
rangeTableEntry->rtekind = RTE_RELATION;
rangeTableEntry->eref = makeAlias(masterTableName, NIL);
rangeTableEntry->relid = 0; /* to be filled in exec_Start */
rangeTableEntry->inh = false;
rangeTableEntry->inFromCl = true;
/* set the single element range table list */
selectStatement->rtable = list_make1(rangeTableEntry);
/* (2) build and initialize sequential scan node */
sequentialScan = makeNode(SeqScan);
sequentialScan->scanrelid = 1; /* always one */
/* (3) add an aggregation plan if needed */
if (masterQuery->hasAggs || masterQuery->groupClause)
{
sequentialScan->plan.targetlist = masterTargetList;
aggregationPlan = BuildAggregatePlan(masterQuery, (Plan *) sequentialScan);
topLevelPlan = (Plan *) aggregationPlan;
}
else
{
/* otherwise set the final projections on the scan plan directly */
sequentialScan->plan.targetlist = masterQuery->targetList;
topLevelPlan = (Plan *) sequentialScan;
}
/* (4) add a sorting plan if needed */
if (masterQuery->sortClause)
{
List *sortClauseList = masterQuery->sortClause;
Sort *sortPlan = make_sort_from_sortclauses(NULL, sortClauseList, topLevelPlan);
topLevelPlan = (Plan *) sortPlan;
}
/* (5) add a limit plan if needed */
if (masterQuery->limitCount)
{
Node *limitCount = masterQuery->limitCount;
Node *limitOffset = masterQuery->limitOffset;
int64 offsetEstimate = 0;
int64 countEstimate = 0;
Limit *limitPlan = make_limit(topLevelPlan, limitOffset, limitCount,
offsetEstimate, countEstimate);
topLevelPlan = (Plan *) limitPlan;
}
/* (6) finally set our top level plan in the plan tree */
selectStatement->planTree = topLevelPlan;
return selectStatement;
}
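The nodes stack bottom-up, so for a hypothetical master query with grouping, sorting, and a limit, the resulting plan would look roughly like this:

/*
 * SELECT key, count(*) FROM <masterTableName> GROUP BY key ORDER BY 2 LIMIT 5;
 *
 * Limit
 *   -> Sort
 *        -> HashAggregate
 *             -> Seq Scan on <masterTableName>
 */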
/*
* ValueToStringList walks over the given list of string value types, converts
* value types to cstrings, and adds these cstrings into a new list.
*/
static List *
ValueToStringList(List *valueList)
{
List *stringList = NIL;
ListCell *valueCell = NULL;
foreach(valueCell, valueList)
{
Value *value = (Value *) lfirst(valueCell);
char *stringValue = strVal(value);
stringList = lappend(stringList, stringValue);
}
return stringList;
}
/*
* MasterNodeCreateStatement takes in a multi plan, and constructs a statement
* to create a temporary table on the master node for final result
* aggregation.
*/
CreateStmt *
MasterNodeCreateStatement(MultiPlan *multiPlan)
{
Query *masterQuery = multiPlan->masterQuery;
Job *workerJob = multiPlan->workerJob;
List *workerTargetList = workerJob->jobQuery->targetList;
List *rangeTableList = masterQuery->rtable;
char *tableName = multiPlan->masterTableName;
CreateStmt *createStatement = NULL;
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) linitial(rangeTableList);
List *columnNameValueList = rangeTableEntry->eref->colnames;
List *columnNameList = ValueToStringList(columnNameValueList);
List *targetList = MasterTargetList(workerTargetList);
createStatement = BuildCreateStatement(tableName, targetList, columnNameList);
return createStatement;
}
/*
* MasterNodeSelectPlan takes in a distributed plan, finds the master node query
* structure in that plan, and builds the final select plan to execute on the
* master node. Note that this select plan is executed after result files are
* retrieved from worker nodes and are merged into a temporary table.
*/
PlannedStmt *
MasterNodeSelectPlan(MultiPlan *multiPlan)
{
Query *masterQuery = multiPlan->masterQuery;
char *tableName = multiPlan->masterTableName;
PlannedStmt *masterSelectPlan = NULL;
Job *workerJob = multiPlan->workerJob;
List *workerTargetList = workerJob->jobQuery->targetList;
List *masterTargetList = MasterTargetList(workerTargetList);
masterSelectPlan = BuildSelectStatement(masterQuery, tableName, masterTargetList);
return masterSelectPlan;
}
/*
* MasterNodeCopyStatementList takes in a multi plan, and constructs
* statements that copy over worker task results to a temporary table on the
* master node.
*/
List *
MasterNodeCopyStatementList(MultiPlan *multiPlan)
{
Job *workerJob = multiPlan->workerJob;
List *workerTaskList = workerJob->taskList;
char *tableName = multiPlan->masterTableName;
List *copyStatementList = NIL;
ListCell *workerTaskCell = NULL;
foreach(workerTaskCell, workerTaskList)
{
Task *workerTask = (Task *) lfirst(workerTaskCell);
StringInfo jobDirectoryName = JobDirectoryName(workerTask->jobId);
StringInfo taskFilename = TaskFilename(jobDirectoryName, workerTask->taskId);
RangeVar *relation = makeRangeVar(NULL, tableName, -1);
CopyStmt *copyStatement = makeNode(CopyStmt);
copyStatement->relation = relation;
copyStatement->is_from = true;
copyStatement->filename = taskFilename->data;
if (BinaryMasterCopyFormat)
{
DefElem *copyOption = makeDefElem("format", (Node *) makeString("binary"));
copyStatement->options = list_make1(copyOption);
}
else
{
copyStatement->options = NIL;
}
copyStatementList = lappend(copyStatementList, copyStatement);
}
return copyStatementList;
}
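Each generated statement is equivalent to the form below (paths and names depend on JobDirectoryName and TaskFilename; shown schematically):

/*
 * COPY <masterTableName> FROM '<jobDirectory>/<taskFilename>' WITH (format 'binary')
 *
 * The format option is attached only when BinaryMasterCopyFormat is set;
 * otherwise the statement uses COPY's default text format.
 */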

File diff suppressed because it is too large

View File

@ -0,0 +1,294 @@
/*-------------------------------------------------------------------------
*
* multi_planner.c
* General CitusDB planner code.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <limits.h>
#include "catalog/pg_type.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/citus_nodes.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/modify_planner.h"
#include "executor/executor.h"
#include "optimizer/planner.h"
#include "utils/memutils.h"
/* local function forward declarations */
static void CheckNodeIsDumpable(Node *node);
static MultiPlan * CreatePhysicalPlan(Query *parse);
static char * GetMultiPlanString(PlannedStmt *result);
static PlannedStmt * MultiQueryContainerNode(PlannedStmt *result, MultiPlan *multiPlan);
/* Distributed planner hook */
PlannedStmt *
multi_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
{
PlannedStmt *result = NULL;
/*
* First call into standard planner. This is required because the CitusDB
* planner relies on parse tree transformations made by postgres' planner.
*/
result = standard_planner(parse, cursorOptions, boundParams);
if (NeedsDistributedPlanning(parse))
{
MemoryContext oldcontext = NULL;
MultiPlan *physicalPlan = NULL;
/* Switch to top level message context */
oldcontext = MemoryContextSwitchTo(MessageContext);
physicalPlan = CreatePhysicalPlan(parse);
/* store required data into the planned statement */
result = MultiQueryContainerNode(result, physicalPlan);
/* Now switch back to original context */
MemoryContextSwitchTo(oldcontext);
}
return result;
}
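This hook only takes effect once it is installed; a minimal sketch of the conventional registration, assuming the actual _PG_init of this extension lives elsewhere and may differ:

void
_PG_init(void)
{
	/*
	 * Install the distributed planner; multi_planner itself falls back to
	 * standard_planner for queries needing no distributed planning.
	 */
	planner_hook = multi_planner;
}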
/*
* CreatePhysicalPlan encapsulates the logic needed to transform a particular
* query into a physical plan. For modifications, queries immediately enter
* the physical planning stage, since they are essentially "routed" to remote
* target shards. SELECT queries go through the full logical plan/optimize/
* physical plan process needed to produce distributed query plans.
*/
static MultiPlan *
CreatePhysicalPlan(Query *parse)
{
Query *parseCopy = copyObject(parse);
MultiPlan *physicalPlan = NULL;
CmdType commandType = parse->commandType;
if (commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE)
{
/* modifications go directly from a query to a physical plan */
physicalPlan = MultiModifyPlanCreate(parse);
}
else
{
/* Create and optimize logical plan */
MultiTreeRoot *logicalPlan = MultiLogicalPlanCreate(parseCopy);
MultiLogicalPlanOptimize(logicalPlan);
/*
* This check is here to make it likely that all node types used in
* CitusDB are dumpable. Explain can dump logical and physical plans
* using the extended outfuncs infrastructure, but it's infeasible to
* test most plans. MultiQueryContainerNode always serializes the
* physical plan, so there's no need to check that separately.
*/
CheckNodeIsDumpable((Node *) logicalPlan);
/* Create the physical plan */
physicalPlan = MultiPhysicalPlanCreate(logicalPlan);
}
return physicalPlan;
}
/*
* GetMultiPlan returns the associated MultiPlan for a PlannedStmt if the
* statement requires distributed execution, NULL otherwise.
*/
MultiPlan *
GetMultiPlan(PlannedStmt *result)
{
char *serializedMultiPlan = NULL;
MultiPlan *multiPlan = NULL;
serializedMultiPlan = GetMultiPlanString(result);
multiPlan = (MultiPlan *) CitusStringToNode(serializedMultiPlan);
Assert(CitusIsA(multiPlan, MultiPlan));
return multiPlan;
}
/* Does the passed in statement require distributed execution? */
bool
HasCitusToplevelNode(PlannedStmt *result)
{
/*
* Can't be a distributed query if the extension hasn't been loaded
* yet. Directly return false, part of the required infrastructure for
* further checks might not be present.
*/
if (!CitusDBHasBeenLoaded())
{
return false;
}
if (GetMultiPlanString(result) == NULL)
{
return false;
}
else
{
return true;
}
}
/*
* MultiQueryContainerNode creates the top-level planTree node for a
* distributed statement. That top-level node a) is recognizable by the
* executor hooks, allowing them to redirect execution, and b) contains the
* parameters required for distributed execution.
*
* The exact representation of the top-level node is an implementation detail
* which should not be referred to outside this file, as it's likely to become
* version dependent. Use GetMultiPlan() and HasCitusToplevelNode() to access.
*
* Internally the data is stored as arguments to a 'citusdb_extradata_container'
* function, which has to be removed from the really executed plan tree before
* query execution.
*/
static PlannedStmt *
MultiQueryContainerNode(PlannedStmt *result, MultiPlan *multiPlan)
{
FunctionScan *fauxFunctionScan = NULL;
RangeTblFunction *fauxFunction = NULL;
FuncExpr *fauxFuncExpr = NULL;
Const *multiPlanData = NULL;
char *serializedPlan = NULL;
/* pass multiPlan serialized as a constant function argument */
serializedPlan = CitusNodeToString(multiPlan);
multiPlanData = makeNode(Const);
multiPlanData->consttype = CSTRINGOID;
multiPlanData->constlen = strlen(serializedPlan);
multiPlanData->constvalue = CStringGetDatum(serializedPlan);
multiPlanData->constbyval = false;
multiPlanData->location = -1;
fauxFuncExpr = makeNode(FuncExpr);
fauxFuncExpr->funcid = CitusExtraDataContainerFuncId();
fauxFuncExpr->funcretset = true;
fauxFuncExpr->location = -1;
fauxFuncExpr->args = list_make1(multiPlanData);
fauxFunction = makeNode(RangeTblFunction);
fauxFunction->funcexpr = (Node *) fauxFuncExpr;
fauxFunctionScan = makeNode(FunctionScan);
fauxFunctionScan->functions = lappend(fauxFunctionScan->functions, fauxFunction);
/*
* Add set returning function to target list if the original (postgres
* created) plan doesn't support backward scans; doing so prevents
* backward scans from being supported by the new plantree as well. This is
* ugly as hell, but until we can rely on custom scans (which can signal
* this via CUSTOMPATH_SUPPORT_BACKWARD_SCAN), there's not really a pretty
* method to achieve this.
*
* FIXME: This should really be done on the master select plan.
*/
if (!ExecSupportsBackwardScan(result->planTree))
{
FuncExpr *funcExpr = makeNode(FuncExpr);
funcExpr->funcretset = true;
fauxFunctionScan->scan.plan.targetlist =
lappend(fauxFunctionScan->scan.plan.targetlist,
funcExpr);
}
result->planTree = (Plan *) fauxFunctionScan;
return result;
}
/*
* GetMultiPlanString returns either NULL, if the plan is not a distributed
* one, or the string representing the distributed plan.
*/
static char *
GetMultiPlanString(PlannedStmt *result)
{
FunctionScan *fauxFunctionScan = NULL;
RangeTblFunction *fauxFunction = NULL;
FuncExpr *fauxFuncExpr = NULL;
Const *multiPlanData = NULL;
if (!IsA(result->planTree, FunctionScan))
{
return NULL;
}
fauxFunctionScan = (FunctionScan *) result->planTree;
if (list_length(fauxFunctionScan->functions) != 1)
{
return NULL;
}
fauxFunction = linitial(fauxFunctionScan->functions);
if (!IsA(fauxFunction->funcexpr, FuncExpr))
{
return NULL;
}
fauxFuncExpr = (FuncExpr *) fauxFunction->funcexpr;
if (fauxFuncExpr->funcid != CitusExtraDataContainerFuncId())
{
return NULL;
}
if (list_length(fauxFuncExpr->args) != 1)
{
ereport(ERROR, (errmsg("unexpected number of function arguments to "
"citusdb_extradata_container")));
}
multiPlanData = (Const *) linitial(fauxFuncExpr->args);
Assert(IsA(multiPlanData, Const));
Assert(multiPlanData->consttype == CSTRINGOID);
return DatumGetCString(multiPlanData->constvalue);
}
/*
* CheckNodeIsDumpable checks that the passed node can be dumped using
* CitusNodeToString(). As this check is expensive, it's only active when
* assertions are enabled.
*/
static void
CheckNodeIsDumpable(Node *node)
{
#ifdef USE_ASSERT_CHECKING
char *out = CitusNodeToString(node);
pfree(out);
#endif
}

View File

@ -0,0 +1,511 @@
/*-------------------------------------------------------------------------
*
* relay_event_utility.c
*
* Routines for handling DDL statements that relate to relay files. These
* routines extend relation, index and constraint names in utility commands.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/skey.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_constraint.h"
#include "commands/defrem.h"
#include "distributed/relay_utility.h"
#include "nodes/parsenodes.h"
#include "parser/parse_utilcmd.h"
#include "storage/lock.h"
#include "tcop/utility.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/tqual.h"
/* Local functions forward declarations */
static bool TypeAddIndexConstraint(const AlterTableCmd *command);
static bool TypeDropIndexConstraint(const AlterTableCmd *command,
const RangeVar *relation, uint64 shardId);
static void AppendShardIdToConstraintName(AlterTableCmd *command, uint64 shardId);
/*
* RelayEventExtendNames extends relation names in the given parse tree for
* certain utility commands. The function more specifically extends table,
* sequence, and index names in the parse tree by appending the given shardId;
* thereby avoiding name collisions in the database among sharded tables. This
* function has the side effect of extending relation names in the parse tree.
*/
void
RelayEventExtendNames(Node *parseTree, uint64 shardId)
{
/* we don't extend names in extension or schema commands */
NodeTag nodeType = nodeTag(parseTree);
if (nodeType == T_CreateExtensionStmt || nodeType == T_CreateSchemaStmt)
{
return;
}
switch (nodeType)
{
case T_AlterSeqStmt:
{
AlterSeqStmt *alterSeqStmt = (AlterSeqStmt *) parseTree;
char **sequenceName = &(alterSeqStmt->sequence->relname);
AppendShardIdToName(sequenceName, shardId);
break;
}
case T_AlterTableStmt:
{
/*
* We append shardId to the very end of table, sequence and index
* names to avoid name collisions. We usually do not touch
* constraint names, except for cases where they refer to index
* names. In those cases, we also append to constraint names.
*/
AlterTableStmt *alterTableStmt = (AlterTableStmt *) parseTree;
char **relationName = &(alterTableStmt->relation->relname);
RangeVar *relation = alterTableStmt->relation; /* for constraints */
List *commandList = alterTableStmt->cmds;
ListCell *commandCell = NULL;
/* first append shardId to base relation name */
AppendShardIdToName(relationName, shardId);
foreach(commandCell, commandList)
{
AlterTableCmd *command = (AlterTableCmd *) lfirst(commandCell);
if (TypeAddIndexConstraint(command) ||
TypeDropIndexConstraint(command, relation, shardId))
{
AppendShardIdToConstraintName(command, shardId);
}
else if (command->subtype == AT_ClusterOn)
{
char **indexName = &(command->name);
AppendShardIdToName(indexName, shardId);
}
}
break;
}
case T_ClusterStmt:
{
ClusterStmt *clusterStmt = (ClusterStmt *) parseTree;
char **relationName = NULL;
/* we do not support clustering the entire database */
if (clusterStmt->relation == NULL)
{
ereport(ERROR, (errmsg("cannot extend name for multi-relation cluster")));
}
relationName = &(clusterStmt->relation->relname);
AppendShardIdToName(relationName, shardId);
if (clusterStmt->indexname != NULL)
{
char **indexName = &(clusterStmt->indexname);
AppendShardIdToName(indexName, shardId);
}
break;
}
case T_CreateSeqStmt:
{
CreateSeqStmt *createSeqStmt = (CreateSeqStmt *) parseTree;
char **sequenceName = &(createSeqStmt->sequence->relname);
AppendShardIdToName(sequenceName, shardId);
break;
}
case T_CreateForeignServerStmt:
{
CreateForeignServerStmt *serverStmt = (CreateForeignServerStmt *) parseTree;
char **serverName = &(serverStmt->servername);
AppendShardIdToName(serverName, shardId);
break;
}
case T_CreateForeignTableStmt:
{
CreateForeignTableStmt *createStmt = (CreateForeignTableStmt *) parseTree;
char **serverName = &(createStmt->servername);
AppendShardIdToName(serverName, shardId);
/*
* Since CreateForeignTableStmt inherits from CreateStmt and any change
* performed on CreateStmt should be done here too, we simply *fall
* through* to avoid code repetition.
*/
}
case T_CreateStmt:
{
CreateStmt *createStmt = (CreateStmt *) parseTree;
char **relationName = &(createStmt->relation->relname);
AppendShardIdToName(relationName, shardId);
break;
}
case T_DropStmt:
{
DropStmt *dropStmt = (DropStmt *) parseTree;
ObjectType objectType = dropStmt->removeType;
if (objectType == OBJECT_TABLE || objectType == OBJECT_SEQUENCE ||
objectType == OBJECT_INDEX || objectType == OBJECT_FOREIGN_TABLE ||
objectType == OBJECT_FOREIGN_SERVER)
{
List *relationNameList = NULL;
int relationNameListLength = 0;
Value *relationNameValue = NULL;
char **relationName = NULL;
uint32 dropCount = list_length(dropStmt->objects);
if (dropCount > 1)
{
ereport(ERROR,
(errmsg("cannot extend name for multiple drop objects")));
}
/*
* We now need to extend a single relation, sequence or index
* name. To be able to do this extension, we need to extract the
* names' addresses from the value objects they are stored in.
* Otherwise, the repalloc called in AppendShardIdToName() will
* not have the correct memory address for the name.
*/
relationNameList = (List *) linitial(dropStmt->objects);
relationNameListLength = list_length(relationNameList);
switch (relationNameListLength)
{
case 1:
relationNameValue = linitial(relationNameList);
break;
case 2:
relationNameValue = lsecond(relationNameList);
break;
case 3:
relationNameValue = lthird(relationNameList);
break;
default:
ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("improper relation name: \"%s\"",
NameListToString(relationNameList))));
break;
}
relationName = &(relationNameValue->val.str);
AppendShardIdToName(relationName, shardId);
}
else
{
ereport(WARNING, (errmsg("unsafe object type in drop statement"),
errdetail("Object type: %u", (uint32) objectType)));
}
break;
}
case T_IndexStmt:
{
IndexStmt *indexStmt = (IndexStmt *) parseTree;
char **relationName = &(indexStmt->relation->relname);
char **indexName = &(indexStmt->idxname);
/*
* Concurrent index statements cannot run within a transaction block.
* Therefore, we do not support them.
*/
if (indexStmt->concurrent)
{
ereport(ERROR, (errmsg("cannot extend name for concurrent index")));
}
/*
* In the regular DDL execution code path (for non-sharded tables),
* if the index statement results from a table creation command, the
* indexName may be null. For sharded tables however, we intercept
* that code path and explicitly set the index name. Therefore, the
* index name in here cannot be null.
*/
if ((*indexName) == NULL)
{
ereport(ERROR, (errmsg("cannot extend name for null index name")));
}
AppendShardIdToName(relationName, shardId);
AppendShardIdToName(indexName, shardId);
break;
}
case T_ReindexStmt:
{
ReindexStmt *reindexStmt = (ReindexStmt *) parseTree;
#if (PG_VERSION_NUM >= 90500)
ReindexObjectType objectType = reindexStmt->kind;
if (objectType == REINDEX_OBJECT_TABLE || objectType == REINDEX_OBJECT_INDEX)
{
char **objectName = &(reindexStmt->relation->relname);
AppendShardIdToName(objectName, shardId);
}
else if (objectType == REINDEX_OBJECT_DATABASE)
{
ereport(ERROR, (errmsg("cannot extend name for multi-relation reindex")));
}
#else
ObjectType objectType = reindexStmt->kind;
if (objectType == OBJECT_TABLE || objectType == OBJECT_INDEX)
{
char **objectName = &(reindexStmt->relation->relname);
AppendShardIdToName(objectName, shardId);
}
else if (objectType == OBJECT_DATABASE)
{
ereport(ERROR, (errmsg("cannot extend name for multi-relation reindex")));
}
#endif
else
{
ereport(ERROR, (errmsg("invalid object type in reindex statement"),
errdetail("Object type: %u", (uint32) objectType)));
}
break;
}
case T_RenameStmt:
{
RenameStmt *renameStmt = (RenameStmt *) parseTree;
ObjectType objectType = renameStmt->renameType;
if (objectType == OBJECT_TABLE || objectType == OBJECT_SEQUENCE ||
objectType == OBJECT_INDEX)
{
char **oldRelationName = &(renameStmt->relation->relname);
char **newRelationName = &(renameStmt->newname);
AppendShardIdToName(oldRelationName, shardId);
AppendShardIdToName(newRelationName, shardId);
}
else if (objectType == OBJECT_COLUMN || objectType == OBJECT_TRIGGER)
{
char **relationName = &(renameStmt->relation->relname);
AppendShardIdToName(relationName, shardId);
}
else
{
ereport(WARNING, (errmsg("unsafe object type in rename statement"),
errdetail("Object type: %u", (uint32) objectType)));
}
break;
}
case T_TruncateStmt:
{
/*
* We currently do not support truncate statements. This is
* primarily because truncates allow implicit modifications to
* sequences through table column dependencies. As we have not
* determined our dependency model for sequences, we error here.
*/
ereport(ERROR, (errmsg("cannot extend name for truncate statement")));
break;
}
default:
{
ereport(WARNING, (errmsg("unsafe statement type in name extension"),
errdetail("Statement type: %u", (uint32) nodeType)));
break;
}
}
}
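/*
 * Name-extension sketch (table name and shard id are illustrative, assuming
 * SHARD_NAME_SEPARATOR is '_'): a statement such as
 *
 *     CREATE INDEX orders_idx ON orders (o_custkey);
 *
 * leaves the switch above with both names extended, i.e.
 *
 *     CREATE INDEX orders_idx_102008 ON orders_102008 (o_custkey);
 */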
/*
* TypeAddIndexConstraint checks if the alter table command adds a constraint
* and if that constraint also results in an index creation.
*/
static bool
TypeAddIndexConstraint(const AlterTableCmd *command)
{
if (command->subtype == AT_AddConstraint)
{
if (IsA(command->def, Constraint))
{
Constraint *constraint = (Constraint *) command->def;
if (constraint->contype == CONSTR_PRIMARY ||
constraint->contype == CONSTR_UNIQUE)
{
return true;
}
}
}
return false;
}
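/*
 * Sketch of the commands TypeAddIndexConstraint matches (names illustrative):
 *
 *     ALTER TABLE orders ADD CONSTRAINT orders_pkey PRIMARY KEY (o_orderkey);
 *     ALTER TABLE orders ADD CONSTRAINT orders_uniq UNIQUE (o_custkey);
 *
 * Both implicitly create an index, so both return true; a CHECK or FOREIGN
 * KEY constraint would return false.
 */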
/*
* TypeDropIndexConstraint checks if the alter table command drops a constraint
* and if that constraint also results in an index drop. Note that drop
* constraints do not have access to constraint type information; this is in
* contrast with add constraint commands. This function therefore performs
* additional system catalog lookups to determine if the drop constraint is
* associated with an index.
*/
static bool
TypeDropIndexConstraint(const AlterTableCmd *command,
const RangeVar *relation, uint64 shardId)
{
Relation pgConstraint = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
char *searchedConstraintName = NULL;
bool indexConstraint = false;
Oid relationId = InvalidOid;
bool failOK = true;
if (command->subtype != AT_DropConstraint)
{
return false;
}
/*
* At this stage, our only option is performing a relationId lookup. We
* first find the relationId, and then scan the pg_constraints system
* catalog using this relationId. Finally, we check if the passed in
* constraint is for a primary key or unique index.
*/
relationId = RangeVarGetRelid(relation, NoLock, failOK);
if (!OidIsValid(relationId))
{
/* overlook this error, it should be signaled later in the pipeline */
return false;
}
searchedConstraintName = pnstrdup(command->name, NAMEDATALEN);
AppendShardIdToName(&searchedConstraintName, shardId);
pgConstraint = heap_open(ConstraintRelationId, AccessShareLock);
ScanKeyInit(&scanKey[0], Anum_pg_constraint_conrelid,
BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relationId));
scanDescriptor = systable_beginscan(pgConstraint,
ConstraintRelidIndexId, true, /* indexOK */
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_constraint constraintForm = (Form_pg_constraint) GETSTRUCT(heapTuple);
char *constraintName = NameStr(constraintForm->conname);
if (strncmp(constraintName, searchedConstraintName, NAMEDATALEN) == 0)
{
/* we found the constraint, now check if it is for an index */
if (constraintForm->contype == CONSTRAINT_PRIMARY ||
constraintForm->contype == CONSTRAINT_UNIQUE)
{
indexConstraint = true;
}
break;
}
heapTuple = systable_getnext(scanDescriptor);
}
systable_endscan(scanDescriptor);
heap_close(pgConstraint, AccessShareLock);
pfree(searchedConstraintName);
return indexConstraint;
}
/*
* AppendShardIdToConstraintName extends given constraint name with given
* shardId. Note that we only extend constraint names if they correspond to
* indexes, and the caller should verify that index correspondence before
* calling this function.
*/
static void
AppendShardIdToConstraintName(AlterTableCmd *command, uint64 shardId)
{
if (command->subtype == AT_AddConstraint)
{
Constraint *constraint = (Constraint *) command->def;
char **constraintName = &(constraint->conname);
AppendShardIdToName(constraintName, shardId);
}
else if (command->subtype == AT_DropConstraint)
{
char **constraintName = &(command->name);
AppendShardIdToName(constraintName, shardId);
}
}
/*
* AppendShardIdToName appends shardId to the given name. The function takes in
* the name's address in order to reallocate memory for the name in the same
* memory context the name was originally created in.
*/
void
AppendShardIdToName(char **name, uint64 shardId)
{
char extendedName[NAMEDATALEN];
uint32 extendedNameLength = 0;
snprintf(extendedName, NAMEDATALEN, "%s%c" UINT64_FORMAT,
(*name), SHARD_NAME_SEPARATOR, shardId);
/*
* Parser should have already checked that the table name has enough space
* reserved for appending shardIds. Nonetheless, we perform an additional
* check here to verify that the appended name does not overflow.
*/
extendedNameLength = strlen(extendedName) + 1;
if (extendedNameLength >= NAMEDATALEN)
{
ereport(ERROR, (errmsg("shard name too long to extend: \"%s\"", (*name))));
}
(*name) = (char *) repalloc((*name), extendedNameLength);
snprintf((*name), extendedNameLength, "%s", extendedName);
}
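/*
 * A minimal usage sketch (assumes SHARD_NAME_SEPARATOR is '_' and that the
 * name was allocated with palloc, as required by the repalloc above):
 *
 *     char *name = pstrdup("orders");
 *     AppendShardIdToName(&name, 102008);
 *     // name now reads "orders_102008"
 */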

View File

@ -0,0 +1,535 @@
/*-------------------------------------------------------------------------
*
* shared_library_init.c
* Initialize CitusDB extension
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <limits.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "fmgr.h"
#include "miscadmin.h"
#include "commands/explain.h"
#include "executor/executor.h"
#include "distributed/master_protocol.h"
#include "distributed/modify_planner.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_explain.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/multi_server_executor.h"
#include "distributed/multi_utility.h"
#include "distributed/task_tracker.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_protocol.h"
#include "postmaster/postmaster.h"
#include "optimizer/planner.h"
#include "utils/guc.h"
#include "utils/guc_tables.h"
/* marks shared object as one loadable by the postgres version compiled against */
PG_MODULE_MAGIC;
void _PG_init(void);
static void CreateRequiredDirectories(void);
static void RegisterCitusConfigVariables(void);
static void NormalizeWorkerListPath(void);
/* GUC enum definitions */
static const struct config_enum_entry task_assignment_policy_options[] = {
{"greedy", TASK_ASSIGNMENT_GREEDY, false},
{"first-replica", TASK_ASSIGNMENT_FIRST_REPLICA, false},
{"round-robin", TASK_ASSIGNMENT_ROUND_ROBIN, false},
{NULL, 0, false}
};
static const struct config_enum_entry task_executor_type_options[] = {
{"real-time", MULTI_EXECUTOR_REAL_TIME, false},
{"task-tracker", MULTI_EXECUTOR_TASK_TRACKER, false},
{"router", MULTI_EXECUTOR_ROUTER, false},
{NULL, 0, false}
};
static const struct config_enum_entry shard_placement_policy_options[] = {
{"local-node-first", SHARD_PLACEMENT_LOCAL_NODE_FIRST, false},
{"round-robin", SHARD_PLACEMENT_ROUND_ROBIN, false},
{NULL, 0, false}
};
/* shared library initialization function */
void
_PG_init(void)
{
if (!process_shared_preload_libraries_in_progress)
{
ereport(ERROR, (errmsg("CitusDB can only be loaded via shared_preload_libraries"),
errhint("Add citusdb to shared_preload_libraries.")));
}
/*
* Perform checks before registering any hooks, to avoid erroring out in a
* partial state.
*
* In many cases (e.g. planner and utility hook, to run inside
* pg_stat_statements et al.) we have to be loaded before other hooks
* (thus as the innermost/last running hook) to be able to do our
* duties. For simplicity, insist that all hooks are previously unused.
*/
if (planner_hook != NULL ||
ExplainOneQuery_hook != NULL ||
ExecutorStart_hook != NULL ||
ExecutorRun_hook != NULL ||
ExecutorFinish_hook != NULL ||
ExecutorEnd_hook != NULL ||
ProcessUtility_hook != NULL)
{
ereport(ERROR, (errmsg("CitusDB has to be loaded first"),
errhint("Place citusdb at the beginning of "
"shared_preload_libraries.")));
}
/*
* Extend the database directory structure before continuing with
* initialization - one of the later steps might require them to exist.
*/
CreateRequiredDirectories();
/*
* Register CitusDB configuration variables. Do so before intercepting
* hooks or calling initialization functions, in case we want to do the
* latter in a configuration dependent manner.
*/
RegisterCitusConfigVariables();
/* intercept planner */
planner_hook = multi_planner;
/* intercept explain */
ExplainOneQuery_hook = MultiExplainOneQuery;
/* intercept executor */
ExecutorStart_hook = multi_ExecutorStart;
ExecutorRun_hook = multi_ExecutorRun;
ExecutorFinish_hook = multi_ExecutorFinish;
ExecutorEnd_hook = multi_ExecutorEnd;
/* register utility hook */
ProcessUtility_hook = multi_ProcessUtility;
/* arrange for the task tracker to be started once the server is up */
TaskTrackerRegister();
/* initialize worker node manager */
WorkerNodeRegister();
}
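/*
 * Loading sketch: the checks above require citusdb to come first in the
 * preload list, e.g. in postgresql.conf (the second entry is illustrative):
 *
 *     shared_preload_libraries = 'citusdb,pg_stat_statements'
 */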
/*
* CreateRequiredDirectories creates the directories required for CitusDB to
* function.
*
* These used to be created by initdb, but that's not possible anymore.
*/
static void
CreateRequiredDirectories(void)
{
int dirNo = 0;
const char *subdirs[] = {
"pg_foreign_file",
"pg_foreign_file/cached",
"base/pgsql_job_cache"
};
for (dirNo = 0; dirNo < lengthof(subdirs); dirNo++)
{
int ret = mkdir(subdirs[dirNo], S_IRWXU);
if (ret != 0 && errno != EEXIST)
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not create directory \"%s\": %m",
subdirs[dirNo])));
}
}
}
/* Register CitusDB configuration variables. */
static void
RegisterCitusConfigVariables(void)
{
DefineCustomStringVariable(
"citusdb.worker_list_file",
gettext_noop("Sets the server's \"worker_list\" configuration file."),
NULL,
&WorkerListFileName,
NULL,
PGC_POSTMASTER,
GUC_SUPERUSER_ONLY,
NULL, NULL, NULL);
NormalizeWorkerListPath();
DefineCustomBoolVariable(
"citusdb.binary_master_copy_format",
gettext_noop("Use the binary master copy format."),
gettext_noop("When enabled, data is copied from workers to the master "
"in PostgreSQL's binary serialization format."),
&BinaryMasterCopyFormat,
false,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.binary_worker_copy_format",
gettext_noop("Use the binary worker copy format."),
gettext_noop("When enabled, data is copied from workers to workers "
"in PostgreSQL's binary serialization format when "
"joining large tables."),
&BinaryWorkerCopyFormat,
false,
PGC_SIGHUP,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.expire_cached_shards",
gettext_noop("Enables shard cache expiration if a shard's size on disk has changed. "),
gettext_noop("When appending to an existing shard, old data may still be cached on "
"other workers. This configuration entry activates automatic "
"expiration, but should not be used with manual updates to shards."),
&ExpireCachedShards,
false,
PGC_SIGHUP,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.subquery_pushdown",
gettext_noop("Enables supported subquery pushdown to workers."),
NULL,
&SubqueryPushdown,
false,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.log_multi_join_order",
gettext_noop("Logs the distributed join order to the server log."),
gettext_noop("We use this private configuration entry as a debugging aid. "
"If enabled, we print the distributed join order."),
&LogMultiJoinOrder,
false,
PGC_USERSET,
GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.explain_multi_logical_plan",
gettext_noop("Enables Explain to print out distributed logical plans."),
gettext_noop("We use this private configuration entry as a debugging aid. "
"If enabled, the Explain command prints out the optimized "
"logical plan for distributed queries."),
&ExplainMultiLogicalPlan,
false,
PGC_USERSET,
GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.explain_multi_physical_plan",
gettext_noop("Enables Explain to print out distributed physical plans."),
gettext_noop("We use this private configuration entry as a debugging aid. "
"If enabled, the Explain command prints out the physical "
"plan for distributed queries."),
&ExplainMultiPhysicalPlan,
false,
PGC_USERSET,
GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.all_modifications_commutative",
gettext_noop("Bypasses commutativity checks when enabled"),
NULL,
&AllModificationsCommutative,
false,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.shard_replication_factor",
gettext_noop("Sets the replication factor for shards."),
gettext_noop("Shards are replicated across nodes according to this "
"replication factor. Note that shards read this "
"configuration value at sharded table creation time, "
"and later reuse the initially read value."),
&ShardReplicationFactor,
2, 1, 100,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.shard_max_size",
gettext_noop("Sets the maximum size a shard will grow before it gets split."),
gettext_noop("Shards store table and file data. When the source "
"file's size for one shard exceeds this configuration "
"value, the database ensures that either a new shard "
"gets created, or the current one gets split. Note that "
"shards read this configuration value at sharded table "
"creation time, and later reuse the initially read value."),
&ShardMaxSize,
1048576, 256, INT_MAX, /* max allowed size not set to MAX_KILOBYTES on purpose */
PGC_USERSET,
GUC_UNIT_KB,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.max_worker_nodes_tracked",
gettext_noop("Sets the maximum number of worker nodes that are tracked."),
gettext_noop("Worker nodes' network locations, their membership and "
"health status are tracked in a shared hash table on "
"the master node. This configuration value limits the "
"size of the hash table, and consequently the maximum "
"number of worker nodes that can be tracked."),
&MaxWorkerNodesTracked,
2048, 8, INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.remote_task_check_interval",
gettext_noop("Sets the frequency at which we check job statuses."),
gettext_noop("The master node assigns tasks to workers nodes, and "
"then regularly checks with them about each task's "
"progress. This configuration value sets the time "
"interval between two consequent checks."),
&RemoteTaskCheckInterval,
10, 1, REMOTE_NODE_CONNECT_TIMEOUT,
PGC_USERSET,
GUC_UNIT_MS,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.task_tracker_delay",
gettext_noop("Task tracker sleep time between task management rounds."),
gettext_noop("The task tracker process wakes up regularly, walks over "
"all tasks assigned to it, and schedules and executes these "
"tasks. Then, the task tracker sleeps for a time period "
"before walking over these tasks again. This configuration "
"value determines the length of that sleeping period."),
&TaskTrackerDelay,
200, 10, 100000,
PGC_SIGHUP,
GUC_UNIT_MS,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.max_assign_task_batch_size",
gettext_noop("Sets the maximum number of tasks to assign per round."),
gettext_noop("The master node synchronously assigns tasks to workers in "
"batches. Bigger batches allow for faster task assignment, "
"but it may take longer for all workers to get tasks "
"if the number of workers is large. This configuration "
"value controls the maximum batch size."),
&MaxAssignTaskBatchSize,
64, 1, INT_MAX,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.max_tracked_tasks_per_node",
gettext_noop("Sets the maximum number of tracked tasks per node."),
gettext_noop("The task tracker processes keeps all assigned tasks in "
"a shared hash table, and schedules and executes these "
"tasks as appropriate. This configuration value limits "
"the size of the hash table, and therefore the maximum "
"number of tasks that can be tracked at any given time."),
&MaxTrackedTasksPerNode,
1024, 8, INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.max_running_tasks_per_node",
gettext_noop("Sets the maximum number of tasks to run concurrently per node."),
gettext_noop("The task tracker process schedules and executes the tasks "
"assigned to it as appropriate. This configuration value "
"sets the maximum number of tasks to execute concurrently "
"on one node at any given time."),
&MaxRunningTasksPerNode,
8, 1, INT_MAX,
PGC_SIGHUP,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.partition_buffer_size",
gettext_noop("Sets the buffer size to use for partition operations."),
gettext_noop("Worker nodes allow for table data to be repartitioned "
"into multiple text files, much like Hadoop's Map "
"command. This configuration value sets the buffer size "
"to use per partition operation. After the buffer fills "
"up, we flush the repartitioned data into text files."),
&PartitionBufferSize,
8192, 0, (INT_MAX / 1024), /* result stored in int variable */
PGC_USERSET,
GUC_UNIT_KB,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.large_table_shard_count",
gettext_noop("The shard count threshold over which a table is considered large."),
gettext_noop("A distributed table is considered to be large if it has "
"more shards than the value specified here. This largeness "
"criteria is then used in picking a table join order during "
"distributed query planning."),
&LargeTableShardCount,
4, 1, 10000,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.limit_clause_row_fetch_count",
gettext_noop("Number of rows to fetch per task for limit clause optimization."),
gettext_noop("Select queries get partitioned and executed as smaller "
"tasks. In some cases, select queries with limit clauses "
"may need to fetch all rows from each task to generate "
"results. In those cases, and where an approximation would "
"produce meaningful results, this configuration value sets "
"the number of rows to fetch from each task."),
&LimitClauseRowFetchCount,
-1, -1, INT_MAX,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomRealVariable(
"citusdb.count_distinct_error_rate",
gettext_noop("Desired error rate when calculating count(distinct) "
"approximates using the postgresql-hll extension. "
"0.0 disables approximations for count(distinct); 1.0 "
"provides no guarantees about the accuracy of results."),
NULL,
&CountDistinctErrorRate,
0.0, 0.0, 1.0,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomEnumVariable(
"citusdb.task_assignment_policy",
gettext_noop("Sets the policy to use when assigning tasks to worker nodes."),
gettext_noop("The master node assigns tasks to worker nodes based on shard "
"locations. This configuration value specifies the policy to "
"use when making these assignments. The greedy policy aims to "
"evenly distribute tasks across worker nodes, first-replica just "
"assigns tasks in the order shard placements were created, "
"and the round-robin policy assigns tasks to worker nodes in "
"a round-robin fashion."),
&TaskAssignmentPolicy,
TASK_ASSIGNMENT_GREEDY,
task_assignment_policy_options,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomEnumVariable(
"citusdb.task_executor_type",
gettext_noop("Sets the executor type to be used for distributed queries."),
gettext_noop("The master node chooses between three different executor types "
"when executing a distributed query. The router executor is "
"optimal for simple key-value lookups on a single shard. The "
"real-time executor is optimal for queries that involve "
"aggregations and/or co-located joins on multiple shards. The "
"task-tracker executor is optimal for long-running, complex "
"queries that touch thousands of shards and/or that involve "
"table repartitioning."),
&TaskExecutorType,
MULTI_EXECUTOR_REAL_TIME,
task_executor_type_options,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomEnumVariable(
"citusdb.shard_placement_policy",
gettext_noop("Sets the policy to use when choosing nodes for shard placement."),
gettext_noop("The master node chooses which worker nodes to place new shards "
"on. This configuration value specifies the policy to use when "
"selecting these nodes. The local-node-first policy places the "
"first replica on the client node and chooses others randomly. "
"The round-robin policy aims to distribute shards evenly across "
"the cluster by selecting nodes in a round-robin fashion."),
&ShardPlacementPolicy,
SHARD_PLACEMENT_ROUND_ROBIN, shard_placement_policy_options,
PGC_USERSET,
0,
NULL, NULL, NULL);
/* warn about config items in the citusdb namespace that are not registered above */
EmitWarningsOnPlaceholders("citusdb");
/* Also warn about citus namespace, as that's a very likely misspelling */
EmitWarningsOnPlaceholders("citus");
}
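/*
 * Usage sketch: the USERSET variables registered above can be changed per
 * session, for example:
 *
 *     SET citusdb.task_executor_type TO 'task-tracker';
 *     SET citusdb.count_distinct_error_rate TO 0.02;
 */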
/*
* NormalizeWorkerListPath converts the path configured via
* citusdb.worker_list_file into an absolute path, falling back to the default
* value if necessary. The previous value of the config variable is
* overwritten with the normalized value.
*
* NB: This has to be called before ChangeToDataDir() is called as otherwise
* the relative paths won't make much sense to the user anymore.
*/
static void
NormalizeWorkerListPath(void)
{
char *absoluteFileName = NULL;
if (WorkerListFileName != NULL)
{
absoluteFileName = make_absolute_path(WorkerListFileName);
}
else if (DataDir != NULL)
{
absoluteFileName = malloc(strlen(DataDir) + strlen(WORKER_LIST_FILENAME) + 2);
if (absoluteFileName == NULL)
{
ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
sprintf(absoluteFileName, "%s/%s", DataDir, WORKER_LIST_FILENAME);
}
else
{
ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("%s does not know where to find the \"worker_list_file\" "
"configuration file.\n"
"This can be specified as \"citusdb.worker_list_file\" in "
"\"%s\", or by the -D invocation option, or by the PGDATA "
"environment variable.\n", progname, ConfigFileName)));
}
SetConfigOption("citusdb.worker_list_file", absoluteFileName, PGC_POSTMASTER, PGC_S_OVERRIDE);
free(absoluteFileName);
}

View File

@ -0,0 +1,170 @@
/*-------------------------------------------------------------------------
*
* test/src/connection_cache.c
*
* This file contains functions to exercise CitusDB's connection hash
* functionality for purposes of unit testing.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include "libpq-int.h"
#include <stddef.h>
#include <string.h>
#include "catalog/pg_type.h"
#include "distributed/connection_cache.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "utils/lsyscache.h"
/* local function forward declarations */
static Datum ExtractIntegerDatum(char *input);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(initialize_remote_temp_table);
PG_FUNCTION_INFO_V1(count_remote_temp_table_rows);
PG_FUNCTION_INFO_V1(get_and_purge_connection);
PG_FUNCTION_INFO_V1(set_connection_status_bad);
/*
* initialize_remote_temp_table connects to a specified host on a specified
* port and creates a temporary table with 100 rows. Because the table is
* temporary, it will be visible if a connection is reused but not if a new
* connection is opened to the node.
*/
Datum
initialize_remote_temp_table(PG_FUNCTION_ARGS)
{
char *nodeName = PG_GETARG_CSTRING(0);
int32 nodePort = PG_GETARG_INT32(1);
PGresult *result = NULL;
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
PG_RETURN_BOOL(false);
}
result = PQexec(connection, POPULATE_TEMP_TABLE);
if (PQresultStatus(result) != PGRES_COMMAND_OK)
{
ReportRemoteError(connection, result);
}
PQclear(result);
PG_RETURN_BOOL(true);
}
/*
* count_remote_temp_table_rows just returns the integer count of rows in the
* table created by initialize_remote_temp_table. If no such table exists, this
* function emits a warning and returns -1.
*/
Datum
count_remote_temp_table_rows(PG_FUNCTION_ARGS)
{
char *nodeName = PG_GETARG_CSTRING(0);
int32 nodePort = PG_GETARG_INT32(1);
Datum count = Int32GetDatum(-1);
PGresult *result = NULL;
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
PG_RETURN_DATUM(count);
}
result = PQexec(connection, COUNT_TEMP_TABLE);
if (PQresultStatus(result) != PGRES_TUPLES_OK)
{
ReportRemoteError(connection, result);
}
else
{
char *countText = PQgetvalue(result, 0, 0);
count = ExtractIntegerDatum(countText);
}
PQclear(result);
PG_RETURN_DATUM(count);
}
/*
* get_and_purge_connection first gets a connection using the provided hostname
* and port before immediately passing that connection to PurgeConnection.
* Simply a wrapper around PurgeConnection that uses hostname/port rather than
* PGconn.
*/
Datum
get_and_purge_connection(PG_FUNCTION_ARGS)
{
char *nodeName = PG_GETARG_CSTRING(0);
int32 nodePort = PG_GETARG_INT32(1);
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
PG_RETURN_BOOL(false);
}
PurgeConnection(connection);
PG_RETURN_BOOL(true);
}
/*
* set_connection_status_bad does not remove the given connection from the connection hash.
* It only sets its status to CONNECTION_BAD. On success, it returns true.
*/
Datum
set_connection_status_bad(PG_FUNCTION_ARGS)
{
char *nodeName = PG_GETARG_CSTRING(0);
int32 nodePort = PG_GETARG_INT32(1);
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
PG_RETURN_BOOL(false);
}
/* set the connection status */
connection->status = CONNECTION_BAD;
PG_RETURN_BOOL(true);
}
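/*
 * A typical test sequence for these UDFs (host and port illustrative):
 *
 *     SELECT initialize_remote_temp_table('localhost', 5432);   -- true
 *     SELECT count_remote_temp_table_rows('localhost', 5432);   -- 100
 *     SELECT get_and_purge_connection('localhost', 5432);       -- true
 *     SELECT count_remote_temp_table_rows('localhost', 5432);   -- -1, the new
 *                                                               -- connection no
 *                                                               -- longer sees the
 *                                                               -- temp table
 */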
/*
* ExtractIntegerDatum transforms an integer in textual form into a Datum.
*/
static Datum
ExtractIntegerDatum(char *input)
{
Oid typIoFunc = InvalidOid;
Oid typIoParam = InvalidOid;
Datum intDatum = 0;
FmgrInfo fmgrInfo;
memset(&fmgrInfo, 0, sizeof(fmgrInfo));
getTypeInputInfo(INT4OID, &typIoFunc, &typIoParam);
fmgr_info(typIoFunc, &fmgrInfo);
intDatum = InputFunctionCall(&fmgrInfo, input, typIoParam, -1);
return intDatum;
}
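/*
 * Usage sketch:
 *
 *     Datum countDatum = ExtractIntegerDatum("42");
 *     int32 count = DatumGetInt32(countDatum);   // count == 42
 */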

View File

@ -0,0 +1,69 @@
/*-------------------------------------------------------------------------
*
* test/src/create_shards.c
*
* This file contains functions to exercise shard creation functionality
* within CitusDB.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <string.h>
#include "distributed/listutils.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
/* local function forward declarations */
static int CompareStrings(const void *leftElement, const void *rightElement);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(sort_names);
/*
* sort_names accepts three strings, places them in a list, then calls SortList
* to test its sort functionality. Returns a string containing the sorted lines.
*/
Datum
sort_names(PG_FUNCTION_ARGS)
{
char *first = PG_GETARG_CSTRING(0);
char *second = PG_GETARG_CSTRING(1);
char *third = PG_GETARG_CSTRING(2);
List *nameList = SortList(list_make3(first, second, third),
(int (*)(const void *, const void *))(&CompareStrings));
StringInfo sortedNames = makeStringInfo();
ListCell *nameCell = NULL;
foreach(nameCell, nameList)
{
char *name = lfirst(nameCell);
appendStringInfo(sortedNames, "%s\n", name);
}
PG_RETURN_CSTRING(sortedNames->data);
}
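/*
 * Usage sketch:
 *
 *     SELECT sort_names('b', 'a', 'c');   -- returns "a\nb\nc\n"
 */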
/*
* A simple wrapper around strcmp suitable for use with SortList or qsort.
*/
static int
CompareStrings(const void *leftElement, const void *rightElement)
{
const char *leftString = *((const char **) leftElement);
const char *rightString = *((const char **) rightElement);
return strcmp(leftString, rightString);
}

View File

@ -0,0 +1,365 @@
/*-------------------------------------------------------------------------
*
* test/src/distribution_metadata.c
*
* This file contains functions to exercise distributed table metadata
* functionality within CitusDB.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <stddef.h>
#include <stdint.h>
#include "access/heapam.h"
#include "catalog/pg_type.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_join_order.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/resource_lock.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "storage/lock.h"
#include "utils/array.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/builtins.h"
#include "utils/palloc.h"
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(load_shard_id_array);
PG_FUNCTION_INFO_V1(load_shard_interval_array);
PG_FUNCTION_INFO_V1(load_shard_placement_array);
PG_FUNCTION_INFO_V1(partition_column_id);
PG_FUNCTION_INFO_V1(partition_type);
PG_FUNCTION_INFO_V1(is_distributed_table);
PG_FUNCTION_INFO_V1(column_name_to_column);
PG_FUNCTION_INFO_V1(column_name_to_column_id);
PG_FUNCTION_INFO_V1(create_monolithic_shard_row);
PG_FUNCTION_INFO_V1(create_healthy_local_shard_placement_row);
PG_FUNCTION_INFO_V1(delete_shard_placement_row);
PG_FUNCTION_INFO_V1(update_shard_placement_row_state);
PG_FUNCTION_INFO_V1(acquire_shared_shard_lock);
/*
* load_shard_id_array returns the shard identifiers for a particular
* distributed table as a bigint array. If the table is not distributed
* yet, the function errors out.
*/
Datum
load_shard_id_array(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
ArrayType *shardIdArrayType = NULL;
ListCell *shardCell = NULL;
int shardIdIndex = 0;
Oid shardIdTypeId = INT8OID;
int shardIdCount = -1;
Datum *shardIdDatumArray = NULL;
List *shardList = LoadShardIntervalList(distributedTableId);
shardIdCount = list_length(shardList);
shardIdDatumArray = palloc0(shardIdCount * sizeof(Datum));
foreach(shardCell, shardList)
{
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardCell);
Datum shardIdDatum = Int64GetDatum(shardInterval->shardId);
shardIdDatumArray[shardIdIndex] = shardIdDatum;
shardIdIndex++;
}
shardIdArrayType = DatumArrayToArrayType(shardIdDatumArray, shardIdCount,
shardIdTypeId);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
/*
* load_shard_interval_array loads a shard interval using a provided identifier
* and returns a two-element array consisting of min/max values contained in
* that shard interval. If no such interval can be found, this function raises
* an error instead.
*/
Datum
load_shard_interval_array(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
Oid expectedType PG_USED_FOR_ASSERTS_ONLY = get_fn_expr_argtype(fcinfo->flinfo, 1);
ShardInterval *shardInterval = LoadShardInterval(shardId);
Datum shardIntervalArray[] = { shardInterval->minValue, shardInterval->maxValue };
ArrayType *shardIntervalArrayType = NULL;
Assert(expectedType == shardInterval->valueTypeId);
shardIntervalArrayType = DatumArrayToArrayType(shardIntervalArray, 2,
shardInterval->valueTypeId);
PG_RETURN_ARRAYTYPE_P(shardIntervalArrayType);
}
/*
* load_shard_placement_array loads a shard interval using the provided ID
* and returns an array of strings containing the node name and port for each
* placement of the specified shard interval. If the second argument is true,
* only finalized placements are returned; otherwise, all are. If no such shard
* interval can be found, this function raises an error instead.
*/
Datum
load_shard_placement_array(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
bool onlyFinalized = PG_GETARG_BOOL(1);
ArrayType *placementArrayType = NULL;
List *placementList = NIL;
ListCell *placementCell = NULL;
int placementCount = -1;
int placementIndex = 0;
Datum *placementDatumArray = NULL;
Oid placementTypeId = TEXTOID;
StringInfo placementInfo = makeStringInfo();
if (onlyFinalized)
{
placementList = FinalizedShardPlacementList(shardId);
}
else
{
placementList = ShardPlacementList(shardId);
}
placementCount = list_length(placementList);
placementDatumArray = palloc0(placementCount * sizeof(Datum));
foreach(placementCell, placementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell);
appendStringInfo(placementInfo, "%s:%d", placement->nodeName,
placement->nodePort);
placementDatumArray[placementIndex] = CStringGetTextDatum(placementInfo->data);
placementIndex++;
resetStringInfo(placementInfo);
}
placementArrayType = DatumArrayToArrayType(placementDatumArray, placementCount,
placementTypeId);
PG_RETURN_ARRAYTYPE_P(placementArrayType);
}
/*
* partition_column_id simply finds a distributed table using the provided Oid
* and returns the column_id of its partition column. If the specified table is
* not distributed, this function raises an error instead.
*/
Datum
partition_column_id(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
PG_RETURN_INT16((int16) partitionColumn->varattno);
}
/*
* partition_type simply finds a distributed table using the provided Oid and
* returns the type of partitioning in use by that table. If the specified
* table is not distributed, this function raises an error instead.
*/
Datum
partition_type(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
char partitionType = PartitionMethod(distributedTableId);
PG_RETURN_CHAR(partitionType);
}
/*
* is_distributed_table simply returns whether a given table is distributed. No
* errors, just a boolean.
*/
Datum
is_distributed_table(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
bool isDistributedTable = IsDistributedTable(distributedTableId);
PG_RETURN_BOOL(isDistributedTable);
}
/*
* column_name_to_column is an internal UDF to obtain a textual representation
* of a particular column node (Var), given a relation identifier and column
* name. There is no requirement that the table be distributed; this function
* simply returns the textual representation of a Var representing a column.
* This function will raise an ERROR if no such column can be found or if the
* provided name refers to a system column.
*/
Datum
column_name_to_column(PG_FUNCTION_ARGS)
{
Oid relationId = PG_GETARG_OID(0);
text *columnText = PG_GETARG_TEXT_P(1);
Relation relation = NULL;
char *columnName = text_to_cstring(columnText);
Var *column = NULL;
char *columnNodeString = NULL;
text *columnNodeText = NULL;
relation = relation_open(relationId, AccessExclusiveLock);
column = (Var *) BuildDistributionKeyFromColumnName(relation, columnName);
columnNodeString = nodeToString(column);
columnNodeText = cstring_to_text(columnNodeString);
relation_close(relation, NoLock);
PG_RETURN_TEXT_P(columnNodeText);
}
/*
* column_name_to_column_id takes a relation identifier and a name of a column
* in that relation and returns the index of that column in the relation. If
* the provided name is a system column or no column at all, this function will
* throw an error instead.
*/
Datum
column_name_to_column_id(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
char *columnName = PG_GETARG_CSTRING(1);
Relation relation = NULL;
Var *column = NULL;
relation = relation_open(distributedTableId, AccessExclusiveLock);
column = (Var *) BuildDistributionKeyFromColumnName(relation, columnName);
relation_close(relation, NoLock);
PG_RETURN_INT16((int16) column->varattno);
}
/*
* create_monolithic_shard_row creates a single shard covering all possible
* hash values for a given table and inserts a row representing that shard
* into the backing store. It returns the primary key of the new row.
*/
Datum
create_monolithic_shard_row(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
StringInfo minInfo = makeStringInfo();
StringInfo maxInfo = makeStringInfo();
Datum newShardIdDatum = master_get_new_shardid(NULL);
int64 newShardId = DatumGetInt64(newShardIdDatum);
text *maxInfoText = NULL;
text *minInfoText = NULL;
appendStringInfo(minInfo, "%d", INT32_MIN);
appendStringInfo(maxInfo, "%d", INT32_MAX);
minInfoText = cstring_to_text(minInfo->data);
maxInfoText = cstring_to_text(maxInfo->data);
InsertShardRow(distributedTableId, newShardId, SHARD_STORAGE_TABLE, minInfoText,
maxInfoText);
PG_RETURN_INT64(newShardId);
}
/*
* create_healthy_local_shard_placement_row inserts a row representing a
* finalized placement for localhost (on the default port) into the backing
* store.
*/
Datum
create_healthy_local_shard_placement_row(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
int64 shardLength = 0;
InsertShardPlacementRow(shardId, FILE_FINALIZED, shardLength, "localhost", 5432);
PG_RETURN_VOID();
}
/*
* delete_shard_placement_row removes the placement of the specified shard on
* the given host and port.
*/
Datum
delete_shard_placement_row(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
text *hostName = PG_GETARG_TEXT_P(1);
int64 hostPort = PG_GETARG_INT64(2);
bool successful = true;
char *hostNameString = text_to_cstring(hostName);
DeleteShardPlacementRow(shardId, hostNameString, hostPort);
PG_RETURN_BOOL(successful);
}
/*
* update_shard_placement_row_state sets the state of the placement of the
* specified shard on the given host and port.
*/
Datum
update_shard_placement_row_state(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
text *hostName = PG_GETARG_TEXT_P(1);
int64 hostPort = PG_GETARG_INT64(2);
RelayFileState shardState = (RelayFileState) PG_GETARG_INT32(3);
bool successful = true;
char *hostNameString = text_to_cstring(hostName);
uint64 shardLength = 0;
DeleteShardPlacementRow(shardId, hostNameString, hostPort);
InsertShardPlacementRow(shardId, shardState, shardLength, hostNameString, hostPort);
PG_RETURN_BOOL(successful);
}
/*
* acquire_shared_shard_lock grabs a shared lock for the specified shard.
*/
Datum
acquire_shared_shard_lock(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
LockShardResource(shardId, ShareLock);
PG_RETURN_VOID();
}
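/*
 * A typical metadata test sequence (table name and resulting shard id are
 * illustrative):
 *
 *     SELECT create_monolithic_shard_row('events'::regclass);        -- e.g. 102010
 *     SELECT create_healthy_local_shard_placement_row(102010);
 *     SELECT load_shard_placement_array(102010, true);               -- {localhost:5432}
 *     SELECT delete_shard_placement_row(102010, 'localhost', 5432);
 */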

View File

@ -0,0 +1,168 @@
/*-------------------------------------------------------------------------
*
* test/src/fake_fdw.c
*
* This file contains a barebones FDW implementation, suitable for use in
* test code. Inspired by Andrew Dunstan's blackhole_fdw.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <stddef.h>
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "executor/tuptable.h"
#include "foreign/fdwapi.h"
#include "nodes/execnodes.h"
#include "nodes/nodes.h"
#include "nodes/pg_list.h"
#include "nodes/plannodes.h"
#include "nodes/relation.h"
#include "optimizer/pathnode.h"
#include "optimizer/planmain.h"
#include "optimizer/restrictinfo.h"
#include "utils/palloc.h"
/* local function forward declarations */
static void FakeGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel,
Oid foreigntableid);
static void FakeGetForeignPaths(PlannerInfo *root, RelOptInfo *baserel,
Oid foreigntableid);
#if (PG_VERSION_NUM >= 90300 && PG_VERSION_NUM < 90500)
static ForeignScan * FakeGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel,
Oid foreigntableid, ForeignPath *best_path,
List *tlist, List *scan_clauses);
#else
static ForeignScan * FakeGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel,
Oid foreigntableid, ForeignPath *best_path,
List *tlist, List *scan_clauses,
Plan *outer_plan);
#endif
static void FakeBeginForeignScan(ForeignScanState *node, int eflags);
static TupleTableSlot * FakeIterateForeignScan(ForeignScanState *node);
static void FakeReScanForeignScan(ForeignScanState *node);
static void FakeEndForeignScan(ForeignScanState *node);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(fake_fdw_handler);
/*
* fake_fdw_handler populates an FdwRoutine with pointers to the functions
* implemented within this file.
*/
Datum
fake_fdw_handler(PG_FUNCTION_ARGS)
{
FdwRoutine *fdwroutine = makeNode(FdwRoutine);
fdwroutine->GetForeignRelSize = FakeGetForeignRelSize;
fdwroutine->GetForeignPaths = FakeGetForeignPaths;
fdwroutine->GetForeignPlan = FakeGetForeignPlan;
fdwroutine->BeginForeignScan = FakeBeginForeignScan;
fdwroutine->IterateForeignScan = FakeIterateForeignScan;
fdwroutine->ReScanForeignScan = FakeReScanForeignScan;
fdwroutine->EndForeignScan = FakeEndForeignScan;
PG_RETURN_POINTER(fdwroutine);
}
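/*
 * Wiring sketch (the 'citusdb' module path is an assumption; object names
 * are illustrative):
 *
 *     CREATE FUNCTION fake_fdw_handler() RETURNS fdw_handler
 *         AS 'citusdb' LANGUAGE C STRICT;
 *     CREATE FOREIGN DATA WRAPPER fake_fdw HANDLER fake_fdw_handler;
 *     CREATE SERVER fake_server FOREIGN DATA WRAPPER fake_fdw;
 */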
/*
* FakeGetForeignRelSize populates baserel with a fake relation size.
*/
static void
FakeGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid)
{
baserel->rows = 0;
baserel->fdw_private = (void *) palloc0(1);
}
/*
* FakeGetForeignPaths adds a single fake foreign path to baserel.
*/
static void
FakeGetForeignPaths(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid)
{
Cost startup_cost = 0;
Cost total_cost = startup_cost + baserel->rows;
#if (PG_VERSION_NUM >= 90300 && PG_VERSION_NUM < 90500)
add_path(baserel, (Path *) create_foreignscan_path(root, baserel, baserel->rows,
startup_cost, total_cost, NIL,
NULL, NIL));
#else
add_path(baserel, (Path *) create_foreignscan_path(root, baserel, baserel->rows,
startup_cost, total_cost, NIL,
NULL, NULL, NIL));
#endif
}
/*
* FakeGetForeignPlan builds a fake foreign plan.
*/
#if (PG_VERSION_NUM >= 90300 && PG_VERSION_NUM < 90500)
static ForeignScan *
FakeGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid,
ForeignPath *best_path, List *tlist, List *scan_clauses)
#else
static ForeignScan *
FakeGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid,
ForeignPath *best_path, List *tlist, List *scan_clauses,
Plan *outer_plan)
#endif
{
Index scan_relid = baserel->relid;
scan_clauses = extract_actual_clauses(scan_clauses, false);
/* make_foreignscan has a different signature in 9.3 and 9.4 than in 9.5 */
#if (PG_VERSION_NUM >= 90300 && PG_VERSION_NUM < 90500)
return make_foreignscan(tlist, scan_clauses, scan_relid, NIL, NIL);
#else
return make_foreignscan(tlist, scan_clauses, scan_relid, NIL, NIL, NIL, NIL,
outer_plan);
#endif
}
/*
* FakeBeginForeignScan begins the fake plan (i.e. does nothing).
*/
static void
FakeBeginForeignScan(ForeignScanState *node, int eflags) { }
/*
* FakeIterateForeignScan continues the fake plan (i.e. does nothing).
*/
static TupleTableSlot *
FakeIterateForeignScan(ForeignScanState *node)
{
TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
ExecClearTuple(slot);
return slot;
}
/*
* FakeReScanForeignScan restarts the fake plan (i.e. does nothing).
*/
static void
FakeReScanForeignScan(ForeignScanState *node) { }
/*
* FakeEndForeignScan ends the fake plan (i.e. does nothing).
*/
static void
FakeEndForeignScan(ForeignScanState *node) { }

View File

@ -0,0 +1,67 @@
/*-------------------------------------------------------------------------
*
* test/src/generate_ddl_commands.c
*
* This file contains functions to exercise DDL generation functionality
* within CitusDB.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <stddef.h>
#include "catalog/pg_type.h"
#include "distributed/master_protocol.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "lib/stringinfo.h"
#include "nodes/makefuncs.h"
#include "nodes/nodes.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#include "nodes/value.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/palloc.h"
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(table_ddl_command_array);
/*
* table_ddl_command_array returns an array of strings, each of which is a DDL
* command required to recreate a table (specified by OID).
*/
Datum
table_ddl_command_array(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
ArrayType *ddlCommandArrayType = NULL;
List *ddlCommandList = GetTableDDLEvents(distributedTableId);
int ddlCommandCount = list_length(ddlCommandList);
Datum *ddlCommandDatumArray = palloc0(ddlCommandCount * sizeof(Datum));
ListCell *ddlCommandCell = NULL;
int ddlCommandIndex = 0;
Oid ddlCommandTypeId = TEXTOID;
foreach(ddlCommandCell, ddlCommandList)
{
char *ddlCommand = (char *) lfirst(ddlCommandCell);
Datum ddlCommandDatum = CStringGetTextDatum(ddlCommand);
ddlCommandDatumArray[ddlCommandIndex] = ddlCommandDatum;
ddlCommandIndex++;
}
ddlCommandArrayType = DatumArrayToArrayType(ddlCommandDatumArray, ddlCommandCount,
ddlCommandTypeId);
PG_RETURN_ARRAYTYPE_P(ddlCommandArrayType);
}

View File

@ -0,0 +1,214 @@
/*-------------------------------------------------------------------------
*
* test/src/prune_shard_list.c
*
* This file contains functions to exercise shard pruning functionality
* within CitusDB.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <string.h>
#if (PG_VERSION_NUM >= 90500 && PG_VERSION_NUM < 90600)
#include "access/stratnum.h"
#else
#include "access/skey.h"
#endif
#include "catalog/pg_type.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/resource_lock.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "nodes/nodes.h"
#include "optimizer/clauses.h"
#include "utils/array.h"
#include "utils/palloc.h"
/* local function forward declarations */
static Expr * MakeTextPartitionExpression(Oid distributedTableId, text *value);
static ArrayType * PrunedShardIdsForTable(Oid distributedTableId, List *whereClauseList);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(prune_using_no_values);
PG_FUNCTION_INFO_V1(prune_using_single_value);
PG_FUNCTION_INFO_V1(prune_using_either_value);
PG_FUNCTION_INFO_V1(prune_using_both_values);
PG_FUNCTION_INFO_V1(debug_equality_expression);
/*
* prune_using_no_values returns the shards for the specified distributed table
* after pruning using an empty clause list.
*/
Datum
prune_using_no_values(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
List *whereClauseList = NIL;
ArrayType *shardIdArrayType = PrunedShardIdsForTable(distributedTableId,
whereClauseList);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
/*
* prune_using_single_value returns the shards for the specified distributed
* table after pruning using a single value provided by the caller.
*/
Datum
prune_using_single_value(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
text *value = (PG_ARGISNULL(1)) ? NULL : PG_GETARG_TEXT_P(1);
Expr *equalityExpr = MakeTextPartitionExpression(distributedTableId, value);
List *whereClauseList = list_make1(equalityExpr);
ArrayType *shardIdArrayType = PrunedShardIdsForTable(distributedTableId,
whereClauseList);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
/*
* prune_using_either_value returns the shards for the specified distributed
* table after pruning using either of two values provided by the caller (OR).
*/
Datum
prune_using_either_value(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
text *firstValue = PG_GETARG_TEXT_P(1);
text *secondValue = PG_GETARG_TEXT_P(2);
Expr *firstQual = MakeTextPartitionExpression(distributedTableId, firstValue);
Expr *secondQual = MakeTextPartitionExpression(distributedTableId, secondValue);
Expr *orClause = make_orclause(list_make2(firstQual, secondQual));
List *whereClauseList = list_make1(orClause);
ArrayType *shardIdArrayType = PrunedShardIdsForTable(distributedTableId,
whereClauseList);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
/*
* prune_using_both_values returns the shards for the specified distributed
* table after pruning using both of the values provided by the caller (AND).
*/
Datum
prune_using_both_values(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
text *firstValue = PG_GETARG_TEXT_P(1);
text *secondValue = PG_GETARG_TEXT_P(2);
Expr *firstQual = MakeTextPartitionExpression(distributedTableId, firstValue);
Expr *secondQual = MakeTextPartitionExpression(distributedTableId, secondValue);
List *whereClauseList = list_make2(firstQual, secondQual);
ArrayType *shardIdArrayType = PrunedShardIdsForTable(distributedTableId,
whereClauseList);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
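/*
 * Usage sketch (table name and values illustrative): with a table
 * distributed on a text column,
 *
 *     SELECT prune_using_no_values('pruning'::regclass);          -- all shards
 *     SELECT prune_using_single_value('pruning'::regclass, 'a');  -- matching shards
 */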
/*
* debug_equality_expression returns the textual representation of an equality
* expression generated by a call to MakeOpExpression.
*/
Datum
debug_equality_expression(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
OpExpr *equalityExpression = MakeOpExpression(partitionColumn, BTEqualStrategyNumber);
PG_RETURN_CSTRING(nodeToString(equalityExpression));
}
/*
* MakeTextPartitionExpression returns an equality expression between the
* specified table's partition column and the provided value, or an IS NULL
* test when the value is NULL.
*/
static Expr *
MakeTextPartitionExpression(Oid distributedTableId, text *value)
{
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
Expr *partitionExpression = NULL;
if (value != NULL)
{
OpExpr *equalityExpr = MakeOpExpression(partitionColumn, BTEqualStrategyNumber);
Const *rightConst = (Const *) get_rightop((Expr *) equalityExpr);
rightConst->constvalue = PointerGetDatum(value);
rightConst->constisnull = false;
rightConst->constbyval = false;
partitionExpression = (Expr *) equalityExpr;
}
else
{
NullTest *nullTest = makeNode(NullTest);
nullTest->arg = (Expr *) partitionColumn;
nullTest->nulltesttype = IS_NULL;
partitionExpression = (Expr *) nullTest;
}
return partitionExpression;
}
/*
* PrunedShardIdsForTable loads the shard intervals for the specified table
* and prunes them using the provided clauses. It returns an ArrayType
* containing the shard identifiers, suitable for return from an SQL-facing
* function.
*/
static ArrayType *
PrunedShardIdsForTable(Oid distributedTableId, List *whereClauseList)
{
ArrayType *shardIdArrayType = NULL;
ListCell *shardCell = NULL;
int shardIdIndex = 0;
Oid shardIdTypeId = INT8OID;
Index tableId = 1;
List *shardList = LoadShardIntervalList(distributedTableId);
int shardIdCount = -1;
Datum *shardIdDatumArray = NULL;
shardList = PruneShardList(distributedTableId, tableId, whereClauseList, shardList);
shardIdCount = list_length(shardList);
shardIdDatumArray = palloc0(shardIdCount * sizeof(Datum));
foreach(shardCell, shardList)
{
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardCell);
Datum shardIdDatum = Int64GetDatum(shardInterval->shardId);
shardIdDatumArray[shardIdIndex] = shardIdDatum;
shardIdIndex++;
}
shardIdArrayType = DatumArrayToArrayType(shardIdDatumArray, shardIdCount,
shardIdTypeId);
return shardIdArrayType;
}

View File

@ -0,0 +1,39 @@
/*-------------------------------------------------------------------------
*
* test/src/test_helper_functions.c
*
* This file contains helper functions used in many CitusDB tests.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include <string.h>
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "utils/array.h"
#include "utils/lsyscache.h"
/*
* DatumArrayToArrayType converts the provided Datum array (of the specified
* length and type) into an ArrayType suitable for returning from a UDF.
*/
ArrayType *
DatumArrayToArrayType(Datum *datumArray, int datumCount, Oid datumTypeId)
{
ArrayType *arrayObject = NULL;
int16 typeLength = 0;
bool typeByValue = false;
char typeAlignment = 0;
get_typlenbyvalalign(datumTypeId, &typeLength, &typeByValue, &typeAlignment);
arrayObject = construct_array(datumArray, datumCount, datumTypeId,
typeLength, typeByValue, typeAlignment);
return arrayObject;
}
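/*
 * Usage sketch, mirroring how the test UDFs above build their return values:
 *
 *     Datum shardIds[] = { Int64GetDatum(1), Int64GetDatum(2) };
 *     ArrayType *array = DatumArrayToArrayType(shardIds, 2, INT8OID);
 */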

View File

@ -0,0 +1,304 @@
/*-------------------------------------------------------------------------
*
* citus_nodefuncs.c
* Helper functions for dealing with nodes
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/pg_type.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/metadata_cache.h"
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(citusdb_extradata_container);
/*
* SetRangeTblExtraData adds additional data to a RTE, overwriting previous
* values, if present.
*
* The data is stored as an RTE_FUNCTION type RTE of a special
* citusdb_extradata_container function, with the extra data serialized into the
* function arguments. That works, because these RTEs aren't used by Postgres
* to any significant degree, and Citus' variant of ruleutils.c knows how to
* deal with these extended RTEs. Note that rte->eref needs to be set prior
* to calling SetRangeTblExtraData to ensure the funccolcount can be set
* correctly.
*
* NB: If used for postgres defined RTEKinds, fields specific to that RTEKind
* will not be handled by out/readfuncs.c. For the current uses that's ok.
*/
void
SetRangeTblExtraData(RangeTblEntry *rte, CitusRTEKind rteKind,
char *fragmentSchemaName, char *fragmentTableName,
List *tableIdList)
{
RangeTblFunction *fauxFunction = NULL;
FuncExpr *fauxFuncExpr = NULL;
Const *rteKindData = NULL;
Const *fragmentSchemaData = NULL;
Const *fragmentTableData = NULL;
Const *tableIdListData = NULL;
Assert(rte->eref && rte->eref->colnames != NIL);
/* store RTE kind as a plain int4 */
rteKindData = makeNode(Const);
rteKindData->consttype = INT4OID;
rteKindData->constlen = 4;
rteKindData->constvalue = Int32GetDatum(rteKind);
rteKindData->constbyval = true;
rteKindData->constisnull = false;
rteKindData->location = -1;
/* store the fragment schema as a cstring */
fragmentSchemaData = makeNode(Const);
fragmentSchemaData->consttype = CSTRINGOID;
fragmentSchemaData->constlen = -2;
fragmentSchemaData->constvalue = CStringGetDatum(fragmentSchemaName);
fragmentSchemaData->constbyval = false;
fragmentSchemaData->constisnull = fragmentSchemaName == NULL;
fragmentSchemaData->location = -1;
/* store the fragment name as a cstring */
fragmentTableData = makeNode(Const);
fragmentTableData->consttype = CSTRINGOID;
fragmentTableData->constlen = -2;
fragmentTableData->constvalue = CStringGetDatum(fragmentTableName);
fragmentTableData->constbyval = false;
fragmentTableData->constisnull = fragmentTableName == NULL;
fragmentTableData->location = -1;
/* store the table id list as an array of integers: FIXME */
tableIdListData = makeNode(Const);
tableIdListData->consttype = CSTRINGOID;
tableIdListData->constbyval = false;
tableIdListData->constlen = -2;
tableIdListData->location = -1;
/* serialize tableIdList to a string, seems simplest that way */
if (tableIdList != NIL)
{
char *serializedList = nodeToString(tableIdList);
tableIdListData->constisnull = false;
tableIdListData->constvalue = CStringGetDatum(serializedList);
}
else
{
tableIdListData->constisnull = true;
}
/* create function expression to store our faux arguments in */
fauxFuncExpr = makeNode(FuncExpr);
fauxFuncExpr->funcid = CitusExtraDataContainerFuncId();
fauxFuncExpr->funcretset = true;
fauxFuncExpr->location = -1;
fauxFuncExpr->args = list_make4(rteKindData, fragmentSchemaData,
fragmentTableData, tableIdListData);
fauxFunction = makeNode(RangeTblFunction);
fauxFunction->funcexpr = (Node *) fauxFuncExpr;
/* set the column count to pass ruleutils checks, not used elsewhere */
fauxFunction->funccolcount = list_length(rte->eref->colnames);
rte->rtekind = RTE_FUNCTION;
rte->functions = list_make1(fauxFunction);
}
/*
* ExtractRangeTblExtraData extracts extra data stored for a range table entry
* that previously has been stored with
* Set/ModifyRangeTblExtraData. Parameters can be NULL if uninteresting. It is
* valid to use the function on an RTE without extra data.
*/
void
ExtractRangeTblExtraData(RangeTblEntry *rte, CitusRTEKind *rteKind,
char **fragmentSchemaName, char **fragmentTableName,
List **tableIdList)
{
RangeTblFunction *fauxFunction = NULL;
FuncExpr *fauxFuncExpr = NULL;
Const *tmpConst = NULL;
/* set base rte kind first, so this can be used for 'non-extended' RTEs as well */
if (rteKind != NULL)
{
*rteKind = (CitusRTEKind) rte->rtekind;
}
/* reset values of optionally-present fields, will later be overwritten, if present */
if (fragmentSchemaName != NULL)
{
*fragmentSchemaName = NULL;
}
if (fragmentTableName != NULL)
{
*fragmentTableName = NULL;
}
if (tableIdList != NULL)
{
*tableIdList = NIL;
}
/* only function RTEs have our special extra data */
if (rte->rtekind != RTE_FUNCTION)
{
return;
}
/* we only ever generate one argument */
if (list_length(rte->functions) != 1)
{
return;
}
/* should pretty much always be a FuncExpr, but be liberal in what we expect... */
fauxFunction = linitial(rte->functions);
if (!IsA(fauxFunction->funcexpr, FuncExpr))
{
return;
}
fauxFuncExpr = (FuncExpr *) fauxFunction->funcexpr;
/*
	 * A range table entry with this function id can only have been created
	 * by the routines in this file; plain queries never contain one.
*/
if (fauxFuncExpr->funcid != CitusExtraDataContainerFuncId())
{
return;
}
/*
* Extra data for rtes is stored in the function arguments. The first
* argument stores the rtekind, second fragmentSchemaName, third
* fragmentTableName, fourth tableIdList.
*/
if (list_length(fauxFuncExpr->args) != 4)
{
ereport(ERROR, (errmsg("unexpected number of function arguments to "
"citusdb_extradata_container")));
return;
}
/* extract rteKind */
tmpConst = (Const *) linitial(fauxFuncExpr->args);
Assert(IsA(tmpConst, Const));
Assert(tmpConst->consttype == INT4OID);
if (rteKind != NULL)
{
*rteKind = DatumGetInt32(tmpConst->constvalue);
}
/* extract fragmentSchemaName */
tmpConst = (Const *) lsecond(fauxFuncExpr->args);
Assert(IsA(tmpConst, Const));
Assert(tmpConst->consttype == CSTRINGOID);
if (fragmentSchemaName != NULL && !tmpConst->constisnull)
{
*fragmentSchemaName = DatumGetCString(tmpConst->constvalue);
}
/* extract fragmentTableName */
tmpConst = (Const *) lthird(fauxFuncExpr->args);
Assert(IsA(tmpConst, Const));
Assert(tmpConst->consttype == CSTRINGOID);
if (fragmentTableName != NULL && !tmpConst->constisnull)
{
*fragmentTableName = DatumGetCString(tmpConst->constvalue);
}
/* extract tableIdList, stored as a serialized integer list */
tmpConst = (Const *) lfourth(fauxFuncExpr->args);
Assert(IsA(tmpConst, Const));
Assert(tmpConst->consttype == CSTRINGOID);
if (tableIdList != NULL && !tmpConst->constisnull)
{
Node *deserializedList = stringToNode(DatumGetCString(tmpConst->constvalue));
Assert(IsA(deserializedList, IntList));
*tableIdList = (List *) deserializedList;
}
}
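/*
 * Example round trip (a sketch under the same hypothetical guard as above):
 * for an RTE previously marked by ExampleMarkShardFragment, the stored values
 * come back unchanged; unneeded output parameters may simply be NULL.
 */
#ifdef CITUS_RTE_EXAMPLES
static void
ExampleExtractShardFragment(RangeTblEntry *rte)
{
	CitusRTEKind rteKind = CITUS_RTE_RELATION;
	char *schemaName = NULL;
	char *fragmentName = NULL;

	ExtractRangeTblExtraData(rte, &rteKind, &schemaName, &fragmentName, NULL);
	Assert(rteKind == CITUS_RTE_SHARD);
	elog(DEBUG2, "fragment %s.%s", schemaName, fragmentName);
}
#endif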
/*
* ModifyRangeTblExtraData sets the RTE extra data fields for the passed
* fields, leaving the current values in place for the ones not specified.
*
 * rteKind has to be specified; fragmentSchemaName, fragmentTableName, and
 * tableIdList can be set to NULL/NIL respectively to leave the current
 * values in place.
*/
void
ModifyRangeTblExtraData(RangeTblEntry *rte, CitusRTEKind rteKind,
char *fragmentSchemaName, char *fragmentTableName,
List *tableIdList)
{
/* load existing values for the arguments not specifying a new value */
ExtractRangeTblExtraData(rte, NULL,
fragmentSchemaName == NULL ? &fragmentSchemaName : NULL,
fragmentTableName == NULL ? &fragmentTableName : NULL,
tableIdList == NIL ? &tableIdList : NULL);
SetRangeTblExtraData(rte, rteKind,
fragmentSchemaName, fragmentTableName,
tableIdList);
}
/* GetRangeTblKind returns the rtekind of an RTE, be it an extended one or not. */
CitusRTEKind
GetRangeTblKind(RangeTblEntry *rte)
{
	CitusRTEKind rteKind = CITUS_RTE_RELATION; /* dummy value, always overwritten */
	switch (rte->rtekind)
	{
		/* use the rtekind directly if it cannot be an extended RTE */
case RTE_RELATION:
case RTE_SUBQUERY:
case RTE_JOIN:
case RTE_VALUES:
case RTE_CTE:
rteKind = (CitusRTEKind) rte->rtekind;
break;
case RTE_FUNCTION:
/*
* Extract extra data - correct even if a plain RTE_FUNCTION, not
* an extended one, ExtractRangeTblExtraData handles that case
* transparently.
*/
ExtractRangeTblExtraData(rte, &rteKind, NULL, NULL, NULL);
break;
}
return rteKind;
}
/*
* citusdb_extradata_container is a placeholder function to store information
* needed by CitusDB in plain postgres node trees. Executor and other hooks
* should always intercept statements containing calls to this function. It's
* not actually SQL callable by the user because of an INTERNAL argument.
*/
Datum
citusdb_extradata_container(PG_FUNCTION_ARGS)
{
ereport(ERROR, (errmsg("not supposed to get here, did you cheat?")));
PG_RETURN_NULL();
}


@ -0,0 +1,595 @@
/*-------------------------------------------------------------------------
*
* citus_outfuncs.c
* Output functions for CitusDB tree nodes.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2012-2015, Citus Data, Inc.
*
* NOTES
* This is a wrapper around postgres' nodeToString() that additionally
* supports CitusDB node types.
*
* Keep as closely aligned with the upstream version as possible.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include "distributed/citus_nodefuncs.h"
#include "distributed/citus_nodes.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/master_metadata_utility.h"
#include "lib/stringinfo.h"
#include "nodes/plannodes.h"
#include "nodes/relation.h"
#include "utils/datum.h"
/*
* Macros to simplify output of different kinds of fields. Use these
* wherever possible to reduce the chance for silly typos. Note that these
* hard-wire conventions about the names of the local variables in an Out
* routine.
*/
/* Write the label for the node type */
#define WRITE_NODE_TYPE(nodelabel) \
appendStringInfoString(str, nodelabel)
/* Write an integer field (anything written as ":fldname %d") */
#define WRITE_INT_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)
/* Write an unsigned integer field (anything written as ":fldname %u") */
#define WRITE_UINT_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)
/* XXX: Citus: Write an unsigned 64-bit integer field */
#define WRITE_UINT64_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " " UINT64_FORMAT, node->fldname)
/* Write an OID field (don't hard-wire assumption that OID is same as uint) */
#define WRITE_OID_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)
/* Write a long-integer field */
#define WRITE_LONG_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %ld", node->fldname)
/* Write a char field (ie, one ascii character) */
#define WRITE_CHAR_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %c", node->fldname)
/* Write an enumerated-type field as an integer code */
#define WRITE_ENUM_FIELD(fldname, enumtype) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", \
(int) node->fldname)
/* Write a float field --- caller must give format to define precision */
#define WRITE_FLOAT_FIELD(fldname,format) \
appendStringInfo(str, " :" CppAsString(fldname) " " format, node->fldname)
/* Write a boolean field */
#define WRITE_BOOL_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %s", \
booltostr(node->fldname))
/* Write a character-string (possibly NULL) field */
#define WRITE_STRING_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
_outToken(str, node->fldname))
/* Write a parse location field (actually same as INT case) */
#define WRITE_LOCATION_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)
/* Write a Node field */
#define WRITE_NODE_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
_outNode(str, node->fldname))
/* Write a bitmapset field */
#define WRITE_BITMAPSET_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
_outBitmapset(str, node->fldname))
#define booltostr(x) ((x) ? "true" : "false")
static void _outNode(StringInfo str, const void *obj);
/*
* _outToken
* Convert an ordinary string (eg, an identifier) into a form that
* will be decoded back to a plain token by read.c's functions.
*
* If a null or empty string is given, it is encoded as "<>".
*/
static void
_outToken(StringInfo str, const char *s)
{
if (s == NULL || *s == '\0')
{
appendStringInfoString(str, "<>");
return;
}
/*
* Look for characters or patterns that are treated specially by read.c
* (either in pg_strtok() or in nodeRead()), and therefore need a
* protective backslash.
*/
/* These characters only need to be quoted at the start of the string */
if (*s == '<' ||
*s == '\"' ||
isdigit((unsigned char) *s) ||
((*s == '+' || *s == '-') &&
(isdigit((unsigned char) s[1]) || s[1] == '.')))
appendStringInfoChar(str, '\\');
while (*s)
{
/* These chars must be backslashed anywhere in the string */
if (*s == ' ' || *s == '\n' || *s == '\t' ||
*s == '(' || *s == ')' || *s == '{' || *s == '}' ||
*s == '\\')
appendStringInfoChar(str, '\\');
appendStringInfoChar(str, *s++);
}
}
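/*
 * For illustration, some encodings produced by the rules above: "relname"
 * is emitted unchanged, "two words" becomes "two\ words", and "<tag"
 * becomes "\<tag" because '<' is special at the start of a token.
 */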
static void
_outList(StringInfo str, const List *node)
{
const ListCell *lc;
appendStringInfoChar(str, '(');
if (IsA(node, IntList))
appendStringInfoChar(str, 'i');
else if (IsA(node, OidList))
appendStringInfoChar(str, 'o');
foreach(lc, node)
{
/*
* For the sake of backward compatibility, we emit a slightly
* different whitespace format for lists of nodes vs. other types of
* lists. XXX: is this necessary?
*/
if (IsA(node, List))
{
_outNode(str, lfirst(lc));
if (lnext(lc))
appendStringInfoChar(str, ' ');
}
else if (IsA(node, IntList))
appendStringInfo(str, " %d", lfirst_int(lc));
else if (IsA(node, OidList))
appendStringInfo(str, " %u", lfirst_oid(lc));
else
elog(ERROR, "unrecognized list node type: %d",
(int) node->type);
}
appendStringInfoChar(str, ')');
}
/*
* Print the value of a Datum given its type.
*/
static void
_outDatum(StringInfo str, Datum value, int typlen, bool typbyval)
{
Size length,
i;
char *s;
length = datumGetSize(value, typbyval, typlen);
if (typbyval)
{
s = (char *) (&value);
appendStringInfo(str, "%u [ ", (unsigned int) length);
for (i = 0; i < (Size) sizeof(Datum); i++)
appendStringInfo(str, "%d ", (int) (s[i]));
appendStringInfoChar(str, ']');
}
else
{
s = (char *) DatumGetPointer(value);
if (!PointerIsValid(s))
appendStringInfoString(str, "0 [ ]");
else
{
appendStringInfo(str, "%u [ ", (unsigned int) length);
for (i = 0; i < length; i++)
appendStringInfo(str, "%d ", (int) (s[i]));
appendStringInfoChar(str, ']');
}
}
}
/*****************************************************************************
* Output routines for CitusDB node types
*****************************************************************************/
static void
_outMultiUnaryNode(StringInfo str, const MultiUnaryNode *node)
{
WRITE_NODE_FIELD(childNode);
}
static void
_outMultiBinaryNode(StringInfo str, const MultiBinaryNode *node)
{
WRITE_NODE_FIELD(leftChildNode);
WRITE_NODE_FIELD(rightChildNode);
}
static void
_outMultiTreeRoot(StringInfo str, const MultiTreeRoot *node)
{
WRITE_NODE_TYPE("MULTITREEROOT");
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiPlan(StringInfo str, const MultiPlan *node)
{
WRITE_NODE_TYPE("MULTIPLAN");
WRITE_NODE_FIELD(workerJob);
WRITE_NODE_FIELD(masterQuery);
WRITE_STRING_FIELD(masterTableName);
}
static void
_outMultiProject(StringInfo str, const MultiProject *node)
{
WRITE_NODE_TYPE("MULTIPROJECT");
WRITE_NODE_FIELD(columnList);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiCollect(StringInfo str, const MultiCollect *node)
{
WRITE_NODE_TYPE("MULTICOLLECT");
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiSelect(StringInfo str, const MultiSelect *node)
{
WRITE_NODE_TYPE("MULTISELECT");
WRITE_NODE_FIELD(selectClauseList);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiTable(StringInfo str, const MultiTable *node)
{
WRITE_NODE_TYPE("MULTITABLE");
WRITE_OID_FIELD(relationId);
WRITE_INT_FIELD(rangeTableId);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiJoin(StringInfo str, const MultiJoin *node)
{
WRITE_NODE_TYPE("MULTIJOIN");
WRITE_NODE_FIELD(joinClauseList);
WRITE_ENUM_FIELD(joinRuleType, JoinRuleType);
WRITE_ENUM_FIELD(joinType, JoinType);
_outMultiBinaryNode(str, (const MultiBinaryNode *) node);
}
static void
_outMultiPartition(StringInfo str, const MultiPartition *node)
{
WRITE_NODE_TYPE("MULTIPARTITION");
WRITE_NODE_FIELD(partitionColumn);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiCartesianProduct(StringInfo str, const MultiCartesianProduct *node)
{
WRITE_NODE_TYPE("MULTICARTESIANPRODUCT");
_outMultiBinaryNode(str, (const MultiBinaryNode *) node);
}
static void
_outMultiExtendedOp(StringInfo str, const MultiExtendedOp *node)
{
WRITE_NODE_TYPE("MULTIEXTENDEDOP");
WRITE_NODE_FIELD(targetList);
WRITE_NODE_FIELD(groupClauseList);
WRITE_NODE_FIELD(sortClauseList);
WRITE_NODE_FIELD(limitCount);
WRITE_NODE_FIELD(limitOffset);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outJobInfo(StringInfo str, const Job *node)
{
WRITE_UINT64_FIELD(jobId);
WRITE_NODE_FIELD(jobQuery);
WRITE_NODE_FIELD(taskList);
WRITE_NODE_FIELD(dependedJobList);
WRITE_BOOL_FIELD(subqueryPushdown);
}
static void
_outJob(StringInfo str, const Job *node)
{
WRITE_NODE_TYPE("JOB");
_outJobInfo(str, node);
}
static void
_outShardInterval(StringInfo str, const ShardInterval *node)
{
WRITE_NODE_TYPE("SHARDINTERVAL");
WRITE_OID_FIELD(relationId);
WRITE_CHAR_FIELD(storageType);
WRITE_OID_FIELD(valueTypeId);
WRITE_INT_FIELD(valueTypeLen);
WRITE_BOOL_FIELD(valueByVal);
WRITE_BOOL_FIELD(minValueExists);
WRITE_BOOL_FIELD(maxValueExists);
appendStringInfoString(str, " :minValue ");
if (!node->minValueExists)
appendStringInfoString(str, "<>");
else
_outDatum(str, node->minValue, node->valueTypeLen, node->valueByVal);
appendStringInfoString(str, " :maxValue ");
if (!node->maxValueExists)
appendStringInfoString(str, "<>");
else
_outDatum(str, node->maxValue, node->valueTypeLen, node->valueByVal);
WRITE_UINT64_FIELD(shardId);
}
static void
_outMapMergeJob(StringInfo str, const MapMergeJob *node)
{
int arrayLength = node->sortedShardIntervalArrayLength;
int i;
WRITE_NODE_TYPE("MAPMERGEJOB");
_outJobInfo(str, (Job *) node);
WRITE_NODE_FIELD(reduceQuery);
WRITE_ENUM_FIELD(partitionType, PartitionType);
WRITE_NODE_FIELD(partitionColumn);
WRITE_UINT_FIELD(partitionCount);
WRITE_INT_FIELD(sortedShardIntervalArrayLength);
for (i = 0; i < arrayLength; ++i)
{
ShardInterval *writeElement = node->sortedShardIntervalArray[i];
_outShardInterval(str, writeElement);
}
WRITE_NODE_FIELD(mapTaskList);
WRITE_NODE_FIELD(mergeTaskList);
}
static void
_outShardPlacement(StringInfo str, const ShardPlacement *node)
{
WRITE_NODE_TYPE("SHARDPLACEMENT");
WRITE_OID_FIELD(tupleOid);
WRITE_UINT64_FIELD(shardId);
WRITE_UINT64_FIELD(shardLength);
WRITE_ENUM_FIELD(shardState, RelayFileState);
WRITE_STRING_FIELD(nodeName);
WRITE_UINT_FIELD(nodePort);
}
static void
_outTask(StringInfo str, const Task *node)
{
WRITE_NODE_TYPE("TASK");
WRITE_ENUM_FIELD(taskType, TaskType);
WRITE_UINT64_FIELD(jobId);
WRITE_UINT_FIELD(taskId);
WRITE_STRING_FIELD(queryString);
WRITE_UINT64_FIELD(anchorShardId);
WRITE_NODE_FIELD(taskPlacementList);
WRITE_NODE_FIELD(dependedTaskList);
WRITE_UINT_FIELD(partitionId);
WRITE_UINT_FIELD(upstreamTaskId);
WRITE_NODE_FIELD(shardInterval);
WRITE_BOOL_FIELD(assignmentConstrained);
WRITE_NODE_FIELD(taskExecution);
WRITE_BOOL_FIELD(upsertQuery);
}
/*
* _outNode -
 *	  converts a Node into an ascii string and appends it to 'str'
*/
static void
_outNode(StringInfo str, const void *obj)
{
if (obj == NULL)
{
appendStringInfoString(str, "<>");
return;
}
switch (CitusNodeTag(obj))
{
case T_List:
case T_IntList:
case T_OidList:
_outList(str, obj);
break;
case T_MultiTreeRoot:
appendStringInfoChar(str, '{');
_outMultiTreeRoot(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiProject:
appendStringInfoChar(str, '{');
_outMultiProject(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiCollect:
appendStringInfoChar(str, '{');
_outMultiCollect(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiSelect:
appendStringInfoChar(str, '{');
_outMultiSelect(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiTable:
appendStringInfoChar(str, '{');
_outMultiTable(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiJoin:
appendStringInfoChar(str, '{');
_outMultiJoin(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiPartition:
appendStringInfoChar(str, '{');
_outMultiPartition(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiCartesianProduct:
appendStringInfoChar(str, '{');
_outMultiCartesianProduct(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiExtendedOp:
appendStringInfoChar(str, '{');
_outMultiExtendedOp(str, obj);
appendStringInfoChar(str, '}');
break;
case T_Job:
appendStringInfoChar(str, '{');
_outJob(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MapMergeJob:
appendStringInfoChar(str, '{');
_outMapMergeJob(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiPlan:
appendStringInfoChar(str, '{');
_outMultiPlan(str, obj);
appendStringInfoChar(str, '}');
break;
case T_Task:
appendStringInfoChar(str, '{');
_outTask(str, obj);
appendStringInfoChar(str, '}');
break;
case T_ShardInterval:
appendStringInfoChar(str, '{');
_outShardInterval(str, obj);
appendStringInfoChar(str, '}');
break;
case T_ShardPlacement:
appendStringInfoChar(str, '{');
_outShardPlacement(str, obj);
appendStringInfoChar(str, '}');
break;
default:
/* fall back into postgres' normal nodeToString machinery */
appendStringInfoString(str, nodeToString(obj));
}
}
/*
* CitusNodeToString -
* returns the ascii representation of the Node as a palloc'd string
*/
char *
CitusNodeToString(const void *obj)
{
StringInfoData str;
initStringInfo(&str);
_outNode(&str, obj);
return str.data;
}
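/*
 * Usage sketch (hypothetical caller): serializing a planner node for
 * debugging output.
 *
 *     char *serialized = CitusNodeToString(multiPlan);
 *     elog(DEBUG2, "distributed plan: %s", serialized);
 *
 * The string can be turned back into a node tree with CitusStringToNode()
 * from citus_read.c.
 */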


@ -0,0 +1,348 @@
/*-------------------------------------------------------------------------
*
* citus_read.c
* Citus version of postgres' read.c, using a different state variable for
* citus_pg_strtok.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2012-2015, Citus Data, Inc.
*
* NOTES
 *    Unfortunately we have to copy this file as the state variable for
 *    pg_strtok is not externally accessible. That prevents creating a
 *    version of stringToNode() that calls CitusNodeRead() instead of
 *    nodeRead(). Luckily these functions seldom change.
*
* Keep as closely aligned with the upstream version as possible.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include "nodes/pg_list.h"
#include "nodes/readfuncs.h"
#include "distributed/citus_nodefuncs.h"
#include "nodes/value.h"
/* Static state for citus_pg_strtok */
static char *citus_pg_strtok_ptr = NULL;
/*
* CitusStringToNode -
* returns a Node with a given legal ASCII representation
*/
void *
CitusStringToNode(char *str)
{
char *save_strtok;
void *retval;
/*
* We save and restore the pre-existing state of citus_pg_strtok. This makes the
* world safe for re-entrant invocation of stringToNode, without incurring
* a lot of notational overhead by having to pass the next-character
* pointer around through all the readfuncs.c code.
*/
save_strtok = citus_pg_strtok_ptr;
citus_pg_strtok_ptr = str; /* point citus_pg_strtok at the string to read */
retval = CitusNodeRead(NULL, 0); /* do the reading */
citus_pg_strtok_ptr = save_strtok;
return retval;
}
/*
* citus_pg_strtok is a copy of postgres' pg_strtok routine, referencing
* citus_pg_strtok_ptr instead of pg_strtok_ptr as state.
*/
char *
citus_pg_strtok(int *length)
{
char *local_str; /* working pointer to string */
char *ret_str; /* start of token to return */
local_str = citus_pg_strtok_ptr;
while (*local_str == ' ' || *local_str == '\n' || *local_str == '\t')
local_str++;
if (*local_str == '\0')
{
*length = 0;
citus_pg_strtok_ptr = local_str;
return NULL; /* no more tokens */
}
/*
* Now pointing at start of next token.
*/
ret_str = local_str;
if (*local_str == '(' || *local_str == ')' ||
*local_str == '{' || *local_str == '}')
{
/* special 1-character token */
local_str++;
}
else
{
/* Normal token, possibly containing backslashes */
while (*local_str != '\0' &&
*local_str != ' ' && *local_str != '\n' &&
*local_str != '\t' &&
*local_str != '(' && *local_str != ')' &&
*local_str != '{' && *local_str != '}')
{
if (*local_str == '\\' && local_str[1] != '\0')
local_str += 2;
else
local_str++;
}
}
*length = local_str - ret_str;
/* Recognize special case for "empty" token */
if (*length == 2 && ret_str[0] == '<' && ret_str[1] == '>')
*length = 0;
citus_pg_strtok_ptr = local_str;
return ret_str;
}
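/*
 * Tokenization example (worked by hand): for the input "(i 42 7)",
 * successive citus_pg_strtok calls return "(", "i", "42", "7", ")" and
 * finally NULL. The special token "<>" is returned with *length set to 0,
 * which callers interpret as a NULL pointer.
 */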
#define RIGHT_PAREN (1000000 + 1)
#define LEFT_PAREN (1000000 + 2)
#define LEFT_BRACE (1000000 + 3)
#define OTHER_TOKEN (1000000 + 4)
/*
* nodeTokenType -
* returns the type of the node token contained in token.
* It returns one of the following valid NodeTags:
* T_Integer, T_Float, T_String, T_BitString
* and some of its own:
* RIGHT_PAREN, LEFT_PAREN, LEFT_BRACE, OTHER_TOKEN
*
* Assumption: the ascii representation is legal
*/
static NodeTag
nodeTokenType(char *token, int length)
{
NodeTag retval;
char *numptr;
int numlen;
/*
* Check if the token is a number
*/
numptr = token;
numlen = length;
if (*numptr == '+' || *numptr == '-')
numptr++, numlen--;
if ((numlen > 0 && isdigit((unsigned char) *numptr)) ||
(numlen > 1 && *numptr == '.' && isdigit((unsigned char) numptr[1])))
{
/*
* Yes. Figure out whether it is integral or float; this requires
* both a syntax check and a range check. strtol() can do both for us.
* We know the token will end at a character that strtol will stop at,
* so we do not need to modify the string.
*/
long val;
char *endptr;
errno = 0;
val = strtol(token, &endptr, 10);
(void) val; /* avoid compiler warning if unused */
if (endptr != token + length || errno == ERANGE
#ifdef HAVE_LONG_INT_64
/* if long > 32 bits, check for overflow of int4 */
|| val != (long) ((int32) val)
#endif
)
return T_Float;
return T_Integer;
}
/*
* these three cases do not need length checks, since citus_pg_strtok() will
* always treat them as single-byte tokens
*/
else if (*token == '(')
retval = LEFT_PAREN;
else if (*token == ')')
retval = RIGHT_PAREN;
else if (*token == '{')
retval = LEFT_BRACE;
else if (*token == '\"' && length > 1 && token[length - 1] == '\"')
retval = T_String;
else if (*token == 'b')
retval = T_BitString;
else
retval = OTHER_TOKEN;
return retval;
}
/*
* CitusNodeRead is an adapted copy of postgres' nodeRead routine, using
* citus_pg_strtok_ptr instead of pg_strtok_ptr.
*/
void *
CitusNodeRead(char *token, int tok_len)
{
Node *result;
NodeTag type;
if (token == NULL) /* need to read a token? */
{
token = citus_pg_strtok(&tok_len);
if (token == NULL) /* end of input */
return NULL;
}
type = nodeTokenType(token, tok_len);
switch ((int) type)
{
case LEFT_BRACE:
result = CitusParseNodeString();
token = citus_pg_strtok(&tok_len);
if (token == NULL || token[0] != '}')
elog(ERROR, "did not find '}' at end of input node");
break;
case LEFT_PAREN:
{
List *l = NIL;
/*----------
* Could be an integer list: (i int int ...)
* or an OID list: (o int int ...)
* or a list of nodes/values: (node node ...)
*----------
*/
token = citus_pg_strtok(&tok_len);
if (token == NULL)
elog(ERROR, "unterminated List structure");
if (tok_len == 1 && token[0] == 'i')
{
/* List of integers */
for (;;)
{
int val;
char *endptr;
token = citus_pg_strtok(&tok_len);
if (token == NULL)
elog(ERROR, "unterminated List structure");
if (token[0] == ')')
break;
val = (int) strtol(token, &endptr, 10);
if (endptr != token + tok_len)
elog(ERROR, "unrecognized integer: \"%.*s\"",
tok_len, token);
l = lappend_int(l, val);
}
}
else if (tok_len == 1 && token[0] == 'o')
{
/* List of OIDs */
for (;;)
{
Oid val;
char *endptr;
token = citus_pg_strtok(&tok_len);
if (token == NULL)
elog(ERROR, "unterminated List structure");
if (token[0] == ')')
break;
val = (Oid) strtoul(token, &endptr, 10);
if (endptr != token + tok_len)
elog(ERROR, "unrecognized OID: \"%.*s\"",
tok_len, token);
l = lappend_oid(l, val);
}
}
else
{
/* List of other node types */
for (;;)
{
/* We have already scanned next token... */
if (token[0] == ')')
break;
l = lappend(l, CitusNodeRead(token, tok_len));
token = citus_pg_strtok(&tok_len);
if (token == NULL)
elog(ERROR, "unterminated List structure");
}
}
result = (Node *) l;
break;
}
case RIGHT_PAREN:
elog(ERROR, "unexpected right parenthesis");
result = NULL; /* keep compiler happy */
break;
case OTHER_TOKEN:
if (tok_len == 0)
{
/* must be "<>" --- represents a null pointer */
result = NULL;
}
else
{
elog(ERROR, "unrecognized token: \"%.*s\"", tok_len, token);
result = NULL; /* keep compiler happy */
}
break;
case T_Integer:
/*
* we know that the token terminates on a char atol will stop at
*/
result = (Node *) makeInteger(atol(token));
break;
case T_Float:
{
char *fval = (char *) palloc(tok_len + 1);
memcpy(fval, token, tok_len);
fval[tok_len] = '\0';
result = (Node *) makeFloat(fval);
}
break;
case T_String:
/* need to remove leading and trailing quotes, and backslashes */
result = (Node *) makeString(debackslash(token + 1, tok_len - 2));
break;
case T_BitString:
{
char *val = palloc(tok_len);
/* skip leading 'b' */
memcpy(val, token + 1, tok_len - 1);
val[tok_len - 1] = '\0';
result = (Node *) makeBitString(val);
break;
}
default:
elog(ERROR, "unrecognized node type: %d", (int) type);
result = NULL; /* keep compiler happy */
break;
}
return (void *) result;
}
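/*
 * Round-trip sketch (hypothetical usage, not exercised in this file):
 *
 *     char *serialized = CitusNodeToString(node);
 *     void *copy = CitusStringToNode(serialized);
 *
 * For the node types handled in this directory, the copy is equivalent to
 * the original node.
 */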

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,560 @@
/*-------------------------------------------------------------------------
*
* citus_ruleutils.c
* Version independent ruleutils wrapper
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <unistd.h>
#include <fcntl.h>
#include "access/htup_details.h"
#include "access/sysattr.h"
#include "catalog/dependency.h"
#include "catalog/indexing.h"
#include "catalog/pg_aggregate.h"
#include "catalog/pg_extension.h"
#include "catalog/pg_foreign_data_wrapper.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/citus_ruleutils.h"
#include "commands/defrem.h"
#include "commands/extension.h"
#include "foreign/foreign.h"
#include "funcapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/tlist.h"
#include "parser/keywords.h"
#include "parser/parse_agg.h"
#include "parser/parse_func.h"
#include "parser/parse_oper.h"
#include "parser/parser.h"
#include "parser/parsetree.h"
#include "rewrite/rewriteHandler.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#if (PG_VERSION_NUM >= 90500)
#include "utils/ruleutils.h"
#endif
#include "utils/syscache.h"
#include "utils/typcache.h"
#include "utils/xml.h"
static Oid get_extension_schema(Oid ext_oid);
static void AppendOptionListToString(StringInfo stringData, List *options);
/*
* pg_get_extensiondef_string finds the foreign data wrapper that corresponds to
* the given foreign tableId, and checks if an extension owns this foreign data
* wrapper. If it does, the function returns the extension's definition. If not,
* the function returns null.
*/
char *
pg_get_extensiondef_string(Oid tableRelationId)
{
ForeignTable *foreignTable = GetForeignTable(tableRelationId);
ForeignServer *server = GetForeignServer(foreignTable->serverid);
ForeignDataWrapper *foreignDataWrapper = GetForeignDataWrapper(server->fdwid);
StringInfoData buffer = { NULL, 0, 0, 0 };
Oid classId = ForeignDataWrapperRelationId;
Oid objectId = server->fdwid;
Oid extensionId = getExtensionOfObject(classId, objectId);
if (OidIsValid(extensionId))
{
char *extensionName = get_extension_name(extensionId);
Oid extensionSchemaId = get_extension_schema(extensionId);
char *extensionSchema = get_namespace_name(extensionSchemaId);
initStringInfo(&buffer);
appendStringInfo(&buffer, "CREATE EXTENSION IF NOT EXISTS %s WITH SCHEMA %s",
quote_identifier(extensionName),
quote_identifier(extensionSchema));
}
else
{
ereport(NOTICE, (errmsg("foreign-data wrapper \"%s\" does not have an "
"extension defined", foreignDataWrapper->fdwname)));
}
return (buffer.data);
}
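/*
 * For a foreign table backed by, say, cstore_fdw installed in schema public
 * (an illustrative example), this function would return something like:
 *
 *     CREATE EXTENSION IF NOT EXISTS cstore_fdw WITH SCHEMA public
 */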
/*
* get_extension_schema - given an extension OID, fetch its extnamespace
*
* Returns InvalidOid if no such extension.
*/
static Oid
get_extension_schema(Oid ext_oid)
{
Oid result;
Relation rel;
SysScanDesc scandesc;
HeapTuple tuple;
ScanKeyData entry[1];
rel = heap_open(ExtensionRelationId, AccessShareLock);
ScanKeyInit(&entry[0],
ObjectIdAttributeNumber,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(ext_oid));
scandesc = systable_beginscan(rel, ExtensionOidIndexId, true,
NULL, 1, entry);
tuple = systable_getnext(scandesc);
/* We assume that there can be at most one matching tuple */
if (HeapTupleIsValid(tuple))
result = ((Form_pg_extension) GETSTRUCT(tuple))->extnamespace;
else
result = InvalidOid;
systable_endscan(scandesc);
heap_close(rel, AccessShareLock);
return result;
}
/*
* pg_get_serverdef_string finds the foreign server that corresponds to the
* given foreign tableId, and returns this server's definition.
*/
char *
pg_get_serverdef_string(Oid tableRelationId)
{
ForeignTable *foreignTable = GetForeignTable(tableRelationId);
ForeignServer *server = GetForeignServer(foreignTable->serverid);
ForeignDataWrapper *foreignDataWrapper = GetForeignDataWrapper(server->fdwid);
StringInfoData buffer = { NULL, 0, 0, 0 };
initStringInfo(&buffer);
appendStringInfo(&buffer, "CREATE SERVER %s", quote_identifier(server->servername));
if (server->servertype != NULL)
{
appendStringInfo(&buffer, " TYPE %s",
quote_literal_cstr(server->servertype));
}
if (server->serverversion != NULL)
{
appendStringInfo(&buffer, " VERSION %s",
quote_literal_cstr(server->serverversion));
}
appendStringInfo(&buffer, " FOREIGN DATA WRAPPER %s",
quote_identifier(foreignDataWrapper->fdwname));
/* append server options, if any */
AppendOptionListToString(&buffer, server->options);
return (buffer.data);
}
/*
* AppendOptionListToString converts the option list to its textual format, and
* appends this text to the given string buffer.
*/
static void
AppendOptionListToString(StringInfo stringBuffer, List *optionList)
{
if (optionList != NIL)
{
ListCell *optionCell = NULL;
bool firstOptionPrinted = false;
appendStringInfo(stringBuffer, " OPTIONS (");
foreach(optionCell, optionList)
{
DefElem *option = (DefElem*) lfirst(optionCell);
char *optionName = option->defname;
char *optionValue = defGetString(option);
if (firstOptionPrinted)
{
appendStringInfo(stringBuffer, ", ");
}
firstOptionPrinted = true;
appendStringInfo(stringBuffer, "%s ", quote_identifier(optionName));
appendStringInfo(stringBuffer, "%s", quote_literal_cstr(optionValue));
}
appendStringInfo(stringBuffer, ")");
}
}
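/*
 * For example (made-up options), a list containing (host 'localhost') and
 * (port '5432') renders as:
 *
 *     OPTIONS (host 'localhost', port '5432')
 */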
/*
* pg_get_tableschemadef_string returns the definition of a given table. This
* definition includes table's schema, default column values, not null and check
* constraints. The definition does not include constraints that trigger index
* creations; specifically, unique and primary key constraints are excluded.
*/
char *
pg_get_tableschemadef_string(Oid tableRelationId)
{
Relation relation = NULL;
char *relationName = NULL;
char relationKind = 0;
TupleDesc tupleDescriptor = NULL;
TupleConstr *tupleConstraints = NULL;
int attributeIndex = 0;
bool firstAttributePrinted = false;
AttrNumber defaultValueIndex = 0;
AttrNumber constraintIndex = 0;
AttrNumber constraintCount = 0;
StringInfoData buffer = { NULL, 0, 0, 0 };
/*
* Instead of retrieving values from system catalogs as other functions in
* ruleutils.c do, we follow an unusual approach here: we open the relation,
* and fetch the relation's tuple descriptor. We do this because the tuple
* descriptor already contains information harnessed from pg_attrdef,
* pg_attribute, pg_constraint, and pg_class; and therefore using the
* descriptor saves us from a lot of additional work.
*/
relation = relation_open(tableRelationId, AccessShareLock);
relationName = generate_relation_name(tableRelationId, NIL);
relationKind = relation->rd_rel->relkind;
if (relationKind != RELKIND_RELATION && relationKind != RELKIND_FOREIGN_TABLE)
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("%s is not a regular or foreign table", relationName)));
}
initStringInfo(&buffer);
if (relationKind == RELKIND_RELATION)
{
appendStringInfo(&buffer, "CREATE TABLE %s (", relationName);
}
else
{
appendStringInfo(&buffer, "CREATE FOREIGN TABLE %s (", relationName);
}
/*
* Iterate over the table's columns. If a particular column is not dropped
* and is not inherited from another table, print the column's name and its
* formatted type.
*/
tupleDescriptor = RelationGetDescr(relation);
tupleConstraints = tupleDescriptor->constr;
for (attributeIndex = 0; attributeIndex < tupleDescriptor->natts; attributeIndex++)
{
Form_pg_attribute attributeForm = tupleDescriptor->attrs[attributeIndex];
const char *attributeName = NULL;
const char *attributeTypeName = NULL;
if (!attributeForm->attisdropped && attributeForm->attinhcount == 0)
{
if (firstAttributePrinted)
{
appendStringInfoString(&buffer, ", ");
}
firstAttributePrinted = true;
attributeName = NameStr(attributeForm->attname);
appendStringInfo(&buffer, "%s ", quote_identifier(attributeName));
attributeTypeName = format_type_with_typemod(attributeForm->atttypid,
attributeForm->atttypmod);
appendStringInfoString(&buffer, attributeTypeName);
/* if this column has a default value, append the default value */
if (attributeForm->atthasdef)
{
AttrDefault *defaultValueList = NULL;
AttrDefault *defaultValue = NULL;
Node *defaultNode = NULL;
List *defaultContext = NULL;
char *defaultString = NULL;
Assert(tupleConstraints != NULL);
defaultValueList = tupleConstraints->defval;
Assert(defaultValueList != NULL);
defaultValue = &(defaultValueList[defaultValueIndex]);
defaultValueIndex++;
Assert(defaultValue->adnum == (attributeIndex + 1));
Assert(defaultValueIndex <= tupleConstraints->num_defval);
/* convert expression to node tree, and prepare deparse context */
defaultNode = (Node *) stringToNode(defaultValue->adbin);
defaultContext = deparse_context_for(relationName, tableRelationId);
/* deparse default value string */
defaultString = deparse_expression(defaultNode, defaultContext,
false, false);
appendStringInfo(&buffer, " DEFAULT %s", defaultString);
}
/* if this column has a not null constraint, append the constraint */
if (attributeForm->attnotnull)
{
appendStringInfoString(&buffer, " NOT NULL");
}
}
}
/*
* Now check if the table has any constraints. If it does, set the number of
* check constraints here. Then iterate over all check constraints and print
* them.
*/
if (tupleConstraints != NULL)
{
constraintCount = tupleConstraints->num_check;
}
for (constraintIndex = 0; constraintIndex < constraintCount; constraintIndex++)
{
ConstrCheck *checkConstraintList = tupleConstraints->check;
ConstrCheck *checkConstraint = &(checkConstraintList[constraintIndex]);
Node *checkNode = NULL;
List *checkContext = NULL;
char *checkString = NULL;
/* if an attribute or constraint has been printed, format properly */
if (firstAttributePrinted || constraintIndex > 0)
{
appendStringInfoString(&buffer, ", ");
}
appendStringInfo(&buffer, "CONSTRAINT %s CHECK ",
quote_identifier(checkConstraint->ccname));
/* convert expression to node tree, and prepare deparse context */
checkNode = (Node *) stringToNode(checkConstraint->ccbin);
checkContext = deparse_context_for(relationName, tableRelationId);
/* deparse check constraint string */
checkString = deparse_expression(checkNode, checkContext, false, false);
appendStringInfoString(&buffer, checkString);
}
/* close create table's outer parentheses */
appendStringInfoString(&buffer, ")");
/*
* If the relation is a foreign table, append the server name and options to
* the create table statement.
*/
if (relationKind == RELKIND_FOREIGN_TABLE)
{
ForeignTable *foreignTable = GetForeignTable(tableRelationId);
ForeignServer *foreignServer = GetForeignServer(foreignTable->serverid);
char *serverName = foreignServer->servername;
appendStringInfo(&buffer, " SERVER %s", quote_identifier(serverName));
AppendOptionListToString(&buffer, foreignTable->options);
}
relation_close(relation, AccessShareLock);
return (buffer.data);
}
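/*
 * As an illustration (hypothetical table), for
 *
 *     CREATE TABLE orders (id bigint NOT NULL,
 *                          price numeric DEFAULT 0.0 CHECK (price >= 0.0));
 *
 * this function returns roughly
 *
 *     CREATE TABLE orders (id bigint NOT NULL, price numeric DEFAULT 0.0,
 *     CONSTRAINT orders_price_check CHECK (price >= 0.0))
 *
 * with no PRIMARY KEY or UNIQUE clauses, as noted above.
 */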
/*
* pg_get_tablecolumnoptionsdef_string returns column storage type and column
* statistics definitions for given table, _if_ these definitions differ from
* their default values. The function returns null if all columns use default
* values for their storage types and statistics.
*/
char *
pg_get_tablecolumnoptionsdef_string(Oid tableRelationId)
{
Relation relation = NULL;
char *relationName = NULL;
char relationKind = 0;
TupleDesc tupleDescriptor = NULL;
AttrNumber attributeIndex = 0;
char *columnOptionStatement = NULL;
List *columnOptionList = NIL;
ListCell *columnOptionCell = NULL;
bool firstOptionPrinted = false;
StringInfoData buffer = { NULL, 0, 0, 0 };
/*
* Instead of retrieving values from system catalogs, we open the relation,
* and use the relation's tuple descriptor to access attribute information.
* This is primarily to maintain symmetry with pg_get_tableschemadef.
*/
relation = relation_open(tableRelationId, AccessShareLock);
relationName = generate_relation_name(tableRelationId, NIL);
relationKind = relation->rd_rel->relkind;
if (relationKind != RELKIND_RELATION && relationKind != RELKIND_FOREIGN_TABLE)
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("%s is not a regular or foreign table", relationName)));
}
/*
* Iterate over the table's columns. If a particular column is not dropped
* and is not inherited from another table, check if column storage or
* statistics statements need to be printed.
*/
tupleDescriptor = RelationGetDescr(relation);
for (attributeIndex = 0; attributeIndex < tupleDescriptor->natts; attributeIndex++)
{
Form_pg_attribute attributeForm = tupleDescriptor->attrs[attributeIndex];
char *attributeName = NameStr(attributeForm->attname);
char defaultStorageType = get_typstorage(attributeForm->atttypid);
if (!attributeForm->attisdropped && attributeForm->attinhcount == 0)
{
/*
* If the user changed the column's default storage type, create
* alter statement and add statement to a list for later processing.
*/
if (attributeForm->attstorage != defaultStorageType)
{
char *storageName = 0;
StringInfoData statement = { NULL, 0, 0, 0 };
initStringInfo(&statement);
switch (attributeForm->attstorage)
{
case 'p':
storageName = "PLAIN";
break;
case 'e':
storageName = "EXTERNAL";
break;
case 'm':
storageName = "MAIN";
break;
case 'x':
storageName = "EXTENDED";
break;
default:
ereport(ERROR, (errmsg("unrecognized storage type: %c",
attributeForm->attstorage)));
break;
}
appendStringInfo(&statement, "ALTER COLUMN %s ",
quote_identifier(attributeName));
appendStringInfo(&statement, "SET STORAGE %s", storageName);
columnOptionList = lappend(columnOptionList, statement.data);
}
/*
* If the user changed the column's statistics target, create
* alter statement and add statement to a list for later processing.
*/
if (attributeForm->attstattarget >= 0)
{
StringInfoData statement = { NULL, 0, 0, 0 };
initStringInfo(&statement);
appendStringInfo(&statement, "ALTER COLUMN %s ",
quote_identifier(attributeName));
appendStringInfo(&statement, "SET STATISTICS %d",
attributeForm->attstattarget);
columnOptionList = lappend(columnOptionList, statement.data);
}
}
}
/*
* Iterate over column storage and statistics statements that we created,
* and append them to a single alter table statement.
*/
foreach(columnOptionCell, columnOptionList)
{
if (!firstOptionPrinted)
{
initStringInfo(&buffer);
appendStringInfo(&buffer, "ALTER TABLE ONLY %s ",
generate_relation_name(tableRelationId, NIL));
}
else
{
appendStringInfoString(&buffer, ", ");
}
firstOptionPrinted = true;
columnOptionStatement = (char *) lfirst(columnOptionCell);
appendStringInfoString(&buffer, columnOptionStatement);
pfree(columnOptionStatement);
}
list_free(columnOptionList);
relation_close(relation, AccessShareLock);
return (buffer.data);
}
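/*
 * For example (hypothetical settings), a table whose payload column was
 * switched to EXTERNAL storage and whose id column has a statistics target
 * of 1000 yields:
 *
 *     ALTER TABLE ONLY orders ALTER COLUMN payload SET STORAGE EXTERNAL,
 *     ALTER COLUMN id SET STATISTICS 1000
 */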
/*
* pg_get_indexclusterdef_string returns the definition of a cluster statement
* for given index. The function returns null if the table is not clustered on
* given index.
*/
char *
pg_get_indexclusterdef_string(Oid indexRelationId)
{
HeapTuple indexTuple = NULL;
Form_pg_index indexForm = NULL;
Oid tableRelationId = InvalidOid;
StringInfoData buffer = { NULL, 0, 0, 0 };
indexTuple = SearchSysCache(INDEXRELID, ObjectIdGetDatum(indexRelationId), 0, 0, 0);
if (!HeapTupleIsValid(indexTuple))
{
ereport(ERROR, (errmsg("cache lookup failed for index %u", indexRelationId)));
}
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
tableRelationId = indexForm->indrelid;
/* check if the table is clustered on this index */
if (indexForm->indisclustered)
{
char *tableName = generate_relation_name(tableRelationId, NIL);
char *indexName = get_rel_name(indexRelationId); /* needs to be quoted */
initStringInfo(&buffer);
appendStringInfo(&buffer, "ALTER TABLE %s CLUSTER ON %s",
tableName, quote_identifier(indexName));
}
ReleaseSysCache(indexTuple);
return (buffer.data);
}
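/*
 * For a table clustered via "CLUSTER orders USING orders_pkey", for
 * instance, this returns "ALTER TABLE orders CLUSTER ON orders_pkey"
 * (names are illustrative).
 */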


@ -0,0 +1,334 @@
/*-------------------------------------------------------------------------
*
* connection_cache.c
*
* This file contains functions to implement a connection hash.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h" /* IWYU pragma: keep */
#include "c.h"
#include "libpq-fe.h"
#include "miscadmin.h"
#include <stddef.h>
#include <string.h>
#include "commands/dbcommands.h"
#include "distributed/connection_cache.h"
#include "lib/stringinfo.h"
#include "mb/pg_wchar.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/palloc.h"
/*
* NodeConnectionHash is the connection hash itself. It begins uninitialized.
* The first call to GetConnection triggers hash creation.
*/
static HTAB *NodeConnectionHash = NULL;
/* local function forward declarations */
static HTAB * CreateNodeConnectionHash(void);
static PGconn * ConnectToNode(char *nodeName, char *nodePort);
static char * ConnectionGetOptionValue(PGconn *connection, char *optionKeyword);
/*
* GetConnection returns a PGconn which can be used to execute queries on a
* remote PostgreSQL server. If no suitable connection to the specified node on
* the specified port yet exists, the function establishes a new connection and
* returns that.
*
* Returned connections are guaranteed to be in the CONNECTION_OK state. If the
* requested connection cannot be established, or if it was previously created
* but is now in an unrecoverable bad state, this function returns NULL.
*
* This function throws an error if a hostname over 255 characters is provided.
*/
PGconn *
GetConnection(char *nodeName, int32 nodePort)
{
PGconn *connection = NULL;
NodeConnectionKey nodeConnectionKey;
NodeConnectionEntry *nodeConnectionEntry = NULL;
bool entryFound = false;
bool needNewConnection = true;
/* check input */
if (strnlen(nodeName, MAX_NODE_LENGTH + 1) > MAX_NODE_LENGTH)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("hostname exceeds the maximum length of %d",
MAX_NODE_LENGTH)));
}
/* if first call, initialize the connection hash */
if (NodeConnectionHash == NULL)
{
NodeConnectionHash = CreateNodeConnectionHash();
}
memset(&nodeConnectionKey, 0, sizeof(nodeConnectionKey));
strncpy(nodeConnectionKey.nodeName, nodeName, MAX_NODE_LENGTH);
nodeConnectionKey.nodePort = nodePort;
nodeConnectionEntry = hash_search(NodeConnectionHash, &nodeConnectionKey,
HASH_FIND, &entryFound);
if (entryFound)
{
connection = nodeConnectionEntry->connection;
if (PQstatus(connection) == CONNECTION_OK)
{
needNewConnection = false;
}
else
{
PurgeConnection(connection);
}
}
if (needNewConnection)
{
StringInfo nodePortString = makeStringInfo();
appendStringInfo(nodePortString, "%d", nodePort);
connection = ConnectToNode(nodeName, nodePortString->data);
if (connection != NULL)
{
nodeConnectionEntry = hash_search(NodeConnectionHash, &nodeConnectionKey,
HASH_ENTER, &entryFound);
nodeConnectionEntry->connection = connection;
}
}
return connection;
}
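/*
 * Typical call pattern (a sketch; the worker name and port are made up):
 *
 *     PGconn *connection = GetConnection("worker-1", 5432);
 *     if (connection == NULL)
 *         ereport(ERROR, (errmsg("could not connect to worker")));
 *
 * Callers that later detect a broken connection hand it back to
 * PurgeConnection() so that the stale cache entry is dropped.
 */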
/*
* PurgeConnection removes the given connection from the connection hash and
* closes it using PQfinish. If our hash does not contain the given connection,
* this method simply prints a warning and exits.
*/
void
PurgeConnection(PGconn *connection)
{
NodeConnectionKey nodeConnectionKey;
NodeConnectionEntry *nodeConnectionEntry = NULL;
bool entryFound = false;
char *nodeNameString = NULL;
char *nodePortString = NULL;
nodeNameString = ConnectionGetOptionValue(connection, "host");
if (nodeNameString == NULL)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("connection is missing host option")));
}
nodePortString = ConnectionGetOptionValue(connection, "port");
if (nodePortString == NULL)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("connection is missing port option")));
}
memset(&nodeConnectionKey, 0, sizeof(nodeConnectionKey));
strncpy(nodeConnectionKey.nodeName, nodeNameString, MAX_NODE_LENGTH);
nodeConnectionKey.nodePort = pg_atoi(nodePortString, sizeof(int32), 0);
pfree(nodeNameString);
pfree(nodePortString);
nodeConnectionEntry = hash_search(NodeConnectionHash, &nodeConnectionKey,
HASH_REMOVE, &entryFound);
if (entryFound)
{
/*
* It's possible the provided connection matches the host and port for
* an entry in the hash without being precisely the same connection. In
* that case, we will want to close the hash's connection (because the
* entry has already been removed) in addition to the provided one.
*/
if (nodeConnectionEntry->connection != connection)
{
ereport(WARNING, (errmsg("hash entry for \"%s:%d\" contained different "
"connection than that provided by caller",
nodeConnectionKey.nodeName,
nodeConnectionKey.nodePort)));
PQfinish(nodeConnectionEntry->connection);
}
}
else
{
ereport(WARNING, (errcode(ERRCODE_NO_DATA),
errmsg("could not find hash entry for connection to \"%s:%d\"",
nodeConnectionKey.nodeName,
nodeConnectionKey.nodePort)));
}
PQfinish(connection);
}
/*
 * ReportRemoteError retrieves various error fields from a remote result and
* produces an error report at the WARNING level.
*/
void
ReportRemoteError(PGconn *connection, PGresult *result)
{
char *sqlStateString = PQresultErrorField(result, PG_DIAG_SQLSTATE);
char *remoteMessage = PQresultErrorField(result, PG_DIAG_MESSAGE_PRIMARY);
char *nodeName = ConnectionGetOptionValue(connection, "host");
char *nodePort = ConnectionGetOptionValue(connection, "port");
char *errorPrefix = "Connection failed to";
int sqlState = ERRCODE_CONNECTION_FAILURE;
if (sqlStateString != NULL)
{
sqlState = MAKE_SQLSTATE(sqlStateString[0], sqlStateString[1], sqlStateString[2],
sqlStateString[3], sqlStateString[4]);
/* use more specific error prefix for result failures */
if (sqlState != ERRCODE_CONNECTION_FAILURE)
{
errorPrefix = "Bad result from";
}
}
/*
* If the PGresult did not contain a message, the connection may provide a
* suitable top level one. At worst, this is an empty string.
*/
if (remoteMessage == NULL)
{
char *lastNewlineIndex = NULL;
remoteMessage = PQerrorMessage(connection);
lastNewlineIndex = strrchr(remoteMessage, '\n');
/* trim trailing newline, if any */
if (lastNewlineIndex != NULL)
{
*lastNewlineIndex = '\0';
}
}
ereport(WARNING, (errcode(sqlState),
errmsg("%s %s:%s", errorPrefix, nodeName, nodePort),
errdetail("Remote message: %s", remoteMessage)));
}
/*
* CreateNodeConnectionHash returns a newly created hash table suitable for
* storing unlimited connections indexed by node name and port.
*/
static HTAB *
CreateNodeConnectionHash(void)
{
HTAB *nodeConnectionHash = NULL;
HASHCTL info;
int hashFlags = 0;
memset(&info, 0, sizeof(info));
info.keysize = sizeof(NodeConnectionKey);
info.entrysize = sizeof(NodeConnectionEntry);
info.hash = tag_hash;
info.hcxt = CacheMemoryContext;
hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
nodeConnectionHash = hash_create("citusdb connection cache", 32, &info, hashFlags);
return nodeConnectionHash;
}
/*
* ConnectToNode opens a connection to a remote PostgreSQL server. The function
* configures the connection's fallback application name to 'citusdb' and sets
* the remote encoding to match the local one. This function requires that the
* port be specified as a string for easier use with libpq functions.
*
* We attempt to connect up to MAX_CONNECT_ATTEMPT times. After that we give up
* and return NULL.
*/
static PGconn *
ConnectToNode(char *nodeName, char *nodePort)
{
PGconn *connection = NULL;
const char *clientEncoding = GetDatabaseEncodingName();
const char *dbname = get_database_name(MyDatabaseId);
int attemptIndex = 0;
const char *keywordArray[] = {
"host", "port", "fallback_application_name",
"client_encoding", "connect_timeout", "dbname", NULL
};
const char *valueArray[] = {
nodeName, nodePort, "citusdb", clientEncoding,
CLIENT_CONNECT_TIMEOUT_SECONDS, dbname, NULL
};
Assert(sizeof(keywordArray) == sizeof(valueArray));
for (attemptIndex = 0; attemptIndex < MAX_CONNECT_ATTEMPTS; attemptIndex++)
{
connection = PQconnectdbParams(keywordArray, valueArray, false);
if (PQstatus(connection) == CONNECTION_OK)
{
break;
}
else
{
/* warn if still erroring on final attempt */
if (attemptIndex == MAX_CONNECT_ATTEMPTS - 1)
{
ReportRemoteError(connection, NULL);
}
PQfinish(connection);
connection = NULL;
}
}
return connection;
}
/*
* ConnectionGetOptionValue inspects the provided connection for an option with
 * a given keyword and returns a new palloc'd string with that option's value.
* The function returns NULL if the connection has no setting for an option with
* the provided keyword.
*/
static char *
ConnectionGetOptionValue(PGconn *connection, char *optionKeyword)
{
char *optionValue = NULL;
PQconninfoOption *conninfoOptions = PQconninfo(connection);
PQconninfoOption *option = NULL;
for (option = conninfoOptions; option->keyword != NULL; option++)
{
if (strncmp(option->keyword, optionKeyword, NAMEDATALEN) == 0)
{
optionValue = pstrdup(option->val);
}
}
PQconninfoFree(conninfoOptions);
return optionValue;
}


@ -0,0 +1,62 @@
/*-------------------------------------------------------------------------
*
* listutils.c
*
* This file contains functions to perform useful operations on lists.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "port.h"
#include "distributed/listutils.h"
#include "nodes/pg_list.h"
#include "utils/memutils.h"
/*
* SortList takes in a list of void pointers, and sorts these pointers (and the
* values they point to) by applying the given comparison function. The function
* then returns the sorted list of pointers.
*
* Because the input list is a list of pointers, and because qsort expects to
* compare pointers to the list elements, the provided comparison function must
* compare pointers to pointers to elements. In addition, this sort function
 * naturally shares qsort's lack of stability. See that function's man page
 * for more details.
*/
List *
SortList(List *pointerList, int (*comparisonFunction)(const void *, const void *))
{
List *sortedList = NIL;
uint32 arrayIndex = 0;
uint32 arraySize = (uint32) list_length(pointerList);
void **array = (void **) palloc0(arraySize * sizeof(void *));
ListCell *pointerCell = NULL;
foreach(pointerCell, pointerList)
{
void *pointer = lfirst(pointerCell);
array[arrayIndex] = pointer;
arrayIndex++;
}
/* sort the array of pointers using the comparison function */
qsort(array, arraySize, sizeof(void *), comparisonFunction);
/* convert the sorted array of pointers back to a sorted list */
for (arrayIndex = 0; arrayIndex < arraySize; arrayIndex++)
{
void *sortedPointer = array[arrayIndex];
sortedList = lappend(sortedList, sortedPointer);
}
pfree(array);
return sortedList;
}
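/*
 * Example comparator (a sketch, not used in this file): because SortList
 * hands qsort an array of pointers, the comparator receives pointers to
 * those pointers and must dereference twice. The LISTUTILS_EXAMPLES guard is
 * hypothetical. A call then looks like
 * SortList(shardIdPointerList, CompareShardIds).
 */
#ifdef LISTUTILS_EXAMPLES
static int
CompareShardIds(const void *leftElement, const void *rightElement)
{
	const uint64 *leftShardId = *((const uint64 **) leftElement);
	const uint64 *rightShardId = *((const uint64 **) rightElement);

	if (*leftShardId < *rightShardId)
	{
		return -1;
	}
	else if (*leftShardId > *rightShardId)
	{
		return 1;
	}
	else
	{
		return 0;
	}
}
#endif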


@ -0,0 +1,929 @@
/*-------------------------------------------------------------------------
*
* metadata_cache.c
* Distributed table metadata cache
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "catalog/indexing.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_type.h"
#include "commands/extension.h"
#include "commands/trigger.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "parser/parse_func.h"
#include "utils/builtins.h"
#include "utils/catcache.h"
#include "utils/datum.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/relfilenodemap.h"
#include "utils/relmapper.h"
#include "utils/syscache.h"
/* Hash table for information about each partition */
static HTAB *DistTableCacheHash = NULL;
/* built first time through in InitializeDistTableCache */
static ScanKeyData DistPartitionScanKey[1];
static ScanKeyData DistShardScanKey[1];
/* local function forward declarations */
static DistTableCacheEntry * LookupDistTableCacheEntry(Oid relationId);
static void InitializeDistTableCache(void);
static void ResetDistTableCacheEntry(DistTableCacheEntry *cacheEntry);
static void InvalidateDistRelationCacheCallback(Datum argument, Oid relationId);
static HeapTuple LookupDistPartitionTuple(Oid relationId);
static List * LookupDistShardTuples(Oid relationId);
static void GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
Oid *intervalTypeId, int32 *intervalTypeMod);
static ShardInterval * TupleToShardInterval(HeapTuple heapTuple,
TupleDesc tupleDescriptor, Oid intervalTypeId,
int32 intervalTypeMod);
static void CachedRelationLookup(const char *relationName, Oid *cachedOid);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_dist_partition_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_shard_cache_invalidate);
/*
* IsDistributedTable returns whether relationId is a distributed relation or
* not.
*/
bool
IsDistributedTable(Oid relationId)
{
DistTableCacheEntry *cacheEntry = NULL;
/*
* Can't be a distributed relation if the extension hasn't been loaded
* yet. As we can't do lookups in nonexistent tables, directly return
* false.
*/
if (!CitusDBHasBeenLoaded())
{
return false;
}
cacheEntry = LookupDistTableCacheEntry(relationId);
return cacheEntry->isDistributedTable;
}
/*
* LoadShardInterval reads shard metadata for given shardId from pg_dist_shard,
* and converts min/max values in these metadata to their properly typed datum
* representations. The function then allocates a structure that stores the read
* and converted values, and returns this structure.
*/
ShardInterval *
LoadShardInterval(uint64 shardId)
{
ShardInterval *shardInterval;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
Form_pg_dist_shard shardForm = NULL;
DistTableCacheEntry *partitionEntry;
Oid intervalTypeId = InvalidOid;
int32 intervalTypeMod = -1;
Relation pgDistShard = heap_open(DistShardRelationId(), AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(pgDistShard);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgDistShard,
DistShardShardidIndexId(), true,
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
if (!HeapTupleIsValid(heapTuple))
{
ereport(ERROR, (errmsg("could not find valid entry for shard "
UINT64_FORMAT, shardId)));
}
shardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
partitionEntry = DistributedTableCacheEntry(shardForm->logicalrelid);
GetPartitionTypeInputInfo(partitionEntry->partitionKeyString,
partitionEntry->partitionMethod, &intervalTypeId,
&intervalTypeMod);
shardInterval = TupleToShardInterval(heapTuple, tupleDescriptor, intervalTypeId,
intervalTypeMod);
systable_endscan(scanDescriptor);
heap_close(pgDistShard, AccessShareLock);
return shardInterval;
}
/*
* DistributedTableCacheEntry looks up a pg_dist_partition entry for a
* relation.
*
* Errors out if no relation matching the criteria could be found.
*/
DistTableCacheEntry *
DistributedTableCacheEntry(Oid distributedRelationId)
{
DistTableCacheEntry *cacheEntry = NULL;
/*
* Can't be a distributed relation if the extension hasn't been loaded
* yet. As we can't do lookups in nonexistent tables, directly return NULL
* here.
*/
if (!CitusDBHasBeenLoaded())
{
return NULL;
}
cacheEntry = LookupDistTableCacheEntry(distributedRelationId);
if (cacheEntry->isDistributedTable)
{
return cacheEntry;
}
else
{
ereport(ERROR, (errmsg("relation %u is not distributed",
distributedRelationId)));
}
}
/*
* LookupDistTableCacheEntry returns the distributed table metadata for the
* passed relationId. For efficiency it caches lookups.
*/
static DistTableCacheEntry *
LookupDistTableCacheEntry(Oid relationId)
{
DistTableCacheEntry *cacheEntry = NULL;
bool foundInCache = false;
HeapTuple distPartitionTuple = NULL;
char *partitionKeyString = NULL;
char partitionMethod = 0;
List *distShardTupleList = NIL;
int shardIntervalArrayLength = 0;
ShardInterval *shardIntervalArray = NULL;
void *hashKey = (void *) &relationId;
if (DistTableCacheHash == NULL)
{
InitializeDistTableCache();
}
cacheEntry = hash_search(DistTableCacheHash, hashKey, HASH_FIND, &foundInCache);
/* return valid matches */
if ((cacheEntry != NULL) && (cacheEntry->isValid))
{
return cacheEntry;
}
/* free the content of old, invalid, entries */
if (cacheEntry != NULL)
{
ResetDistTableCacheEntry(cacheEntry);
}
distPartitionTuple = LookupDistPartitionTuple(relationId);
if (distPartitionTuple != NULL)
{
Form_pg_dist_partition partitionForm =
(Form_pg_dist_partition) GETSTRUCT(distPartitionTuple);
Datum partitionKeyDatum = PointerGetDatum(&partitionForm->partkey);
MemoryContext oldContext = MemoryContextSwitchTo(CacheMemoryContext);
partitionKeyString = TextDatumGetCString(partitionKeyDatum);
partitionMethod = partitionForm->partmethod;
MemoryContextSwitchTo(oldContext);
heap_freetuple(distPartitionTuple);
}
distShardTupleList = LookupDistShardTuples(relationId);
shardIntervalArrayLength = list_length(distShardTupleList);
if (shardIntervalArrayLength > 0)
{
Relation distShardRelation = heap_open(DistShardRelationId(), AccessShareLock);
TupleDesc distShardTupleDesc = RelationGetDescr(distShardRelation);
ListCell *distShardTupleCell = NULL;
int arrayIndex = 0;
Oid intervalTypeId = InvalidOid;
int32 intervalTypeMod = -1;
GetPartitionTypeInputInfo(partitionKeyString, partitionMethod, &intervalTypeId,
&intervalTypeMod);
shardIntervalArray = MemoryContextAllocZero(CacheMemoryContext,
shardIntervalArrayLength *
sizeof(ShardInterval));
foreach(distShardTupleCell, distShardTupleList)
{
HeapTuple shardTuple = lfirst(distShardTupleCell);
ShardInterval *shardInterval = TupleToShardInterval(shardTuple,
distShardTupleDesc,
intervalTypeId,
intervalTypeMod);
MemoryContext oldContext = MemoryContextSwitchTo(CacheMemoryContext);
CopyShardInterval(shardInterval, &shardIntervalArray[arrayIndex]);
MemoryContextSwitchTo(oldContext);
heap_freetuple(shardTuple);
arrayIndex++;
}
heap_close(distShardRelation, AccessShareLock);
}
cacheEntry = hash_search(DistTableCacheHash, hashKey, HASH_ENTER, NULL);
/* zero out entry, but not the key part */
memset(((char *) cacheEntry) + sizeof(Oid), 0,
sizeof(DistTableCacheEntry) - sizeof(Oid));
if (distPartitionTuple == NULL)
{
cacheEntry->isValid = true;
cacheEntry->isDistributedTable = false;
}
else
{
cacheEntry->isValid = true;
cacheEntry->isDistributedTable = true;
cacheEntry->partitionKeyString = partitionKeyString;
cacheEntry->partitionMethod = partitionMethod;
cacheEntry->shardIntervalArrayLength = shardIntervalArrayLength;
cacheEntry->shardIntervalArray = shardIntervalArray;
}
return cacheEntry;
}
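/*
 * Illustrative caller pattern (not part of this file): consumers go through
 * the public wrappers above rather than calling this function directly, e.g.
 *
 *     if (IsDistributedTable(relationId))
 *     {
 *         DistTableCacheEntry *entry = DistributedTableCacheEntry(relationId);
 *         char partitionMethod = entry->partitionMethod;
 *         ...
 *     }
 *
 * Entries remain valid until InvalidateDistRelationCacheCallback marks them
 * invalid; the next lookup then rebuilds them from the catalogs.
 */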
/*
* CitusDBHasBeenLoaded returns true if the citusdb extension has been created
* in the current database and the extension script has been executed. Otherwise,
* it returns false. The result is cached as this is called very frequently.
*
* NB: The way this is cached means the result will be wrong after the
* extension is dropped. A reconnect fixes that though, so that seems
* acceptable.
*/
bool
CitusDBHasBeenLoaded(void)
{
static bool extensionLoaded = false;
/* recheck presence until citusdb has been loaded */
if (!extensionLoaded)
{
bool extensionPresent = false;
bool extensionScriptExecuted = true;
Oid extensionOid = get_extension_oid("citusdb", true);
if (extensionOid != InvalidOid)
{
extensionPresent = true;
}
if (extensionPresent)
{
/* check if CitusDB extension objects are still being created */
if (creating_extension && CurrentExtensionObject == extensionOid)
{
extensionScriptExecuted = false;
}
}
extensionLoaded = extensionPresent && extensionScriptExecuted;
}
return extensionLoaded;
}
/* return oid of pg_dist_shard relation */
Oid
DistShardRelationId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_shard_placement relation */
Oid
DistShardPlacementRelationId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard_placement", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_partition relation */
Oid
DistPartitionRelationId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_partition", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_partition_logical_relid_index index */
Oid
DistPartitionLogicalRelidIndexId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_partition_logical_relid_index", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_shard_logical_relid_index index */
Oid
DistShardLogicalRelidIndexId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard_logical_relid_index", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_shard_shardid_index index */
Oid
DistShardShardidIndexId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard_shardid_index", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_shard_placement_shardid_index */
Oid
DistShardPlacementShardidIndexId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard_placement_shardid_index", &cachedOid);
return cachedOid;
}
/* return oid of the citusdb_extradata_container(internal) function */
Oid
CitusExtraDataContainerFuncId(void)
{
	static Oid cachedOid = InvalidOid;
List *nameList = NIL;
Oid paramOids[1] = { INTERNALOID };
if (cachedOid == InvalidOid)
{
nameList = list_make2(makeString("pg_catalog"),
makeString("citusdb_extradata_container"));
cachedOid = LookupFuncName(nameList, 1, paramOids, false);
}
return cachedOid;
}
/*
* master_dist_partition_cache_invalidate is a trigger function that performs
* relcache invalidations when the contents of pg_dist_partition are changed
* on the SQL level.
*/
Datum
master_dist_partition_cache_invalidate(PG_FUNCTION_ARGS)
{
TriggerData *triggerData = (TriggerData *) fcinfo->context;
HeapTuple newTuple = NULL;
HeapTuple oldTuple = NULL;
Oid oldLogicalRelationId = InvalidOid;
Oid newLogicalRelationId = InvalidOid;
if (!CALLED_AS_TRIGGER(fcinfo))
{
ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
errmsg("must be called as trigger")));
}
newTuple = triggerData->tg_newtuple;
oldTuple = triggerData->tg_trigtuple;
/* collect logicalrelid for OLD and NEW tuple */
if (oldTuple != NULL)
{
Form_pg_dist_partition distPart = (Form_pg_dist_partition) GETSTRUCT(oldTuple);
oldLogicalRelationId = distPart->logicalrelid;
}
if (newTuple != NULL)
{
Form_pg_dist_partition distPart = (Form_pg_dist_partition) GETSTRUCT(newTuple);
newLogicalRelationId = distPart->logicalrelid;
}
/*
* Invalidate relcache for the relevant relation(s). In theory
* logicalrelid should never change, but it doesn't hurt to be
* paranoid. We ignore the case that there's no corresponding pg_class
* entry - that happens if the pg_dist_partition tuple is deleted after
* the relation has been dropped.
*/
if (oldLogicalRelationId != InvalidOid &&
oldLogicalRelationId != newLogicalRelationId)
{
HeapTuple oldClassTuple =
SearchSysCache1(RELOID, ObjectIdGetDatum(oldLogicalRelationId));
if (HeapTupleIsValid(oldClassTuple))
{
CacheInvalidateRelcacheByTuple(oldClassTuple);
ReleaseSysCache(oldClassTuple);
}
}
if (newLogicalRelationId != InvalidOid)
{
HeapTuple newClassTuple =
SearchSysCache1(RELOID, ObjectIdGetDatum(newLogicalRelationId));
if (HeapTupleIsValid(newClassTuple))
{
CacheInvalidateRelcacheByTuple(newClassTuple);
ReleaseSysCache(newClassTuple);
}
}
PG_RETURN_DATUM(PointerGetDatum(NULL));
}
/*
* master_dist_shard_cache_invalidate is a trigger function that performs
* relcache invalidations when the contents of pg_dist_shard are changed
* on the SQL level.
*/
Datum
master_dist_shard_cache_invalidate(PG_FUNCTION_ARGS)
{
TriggerData *triggerData = (TriggerData *) fcinfo->context;
HeapTuple newTuple = NULL;
HeapTuple oldTuple = NULL;
Oid oldLogicalRelationId = InvalidOid;
Oid newLogicalRelationId = InvalidOid;
if (!CALLED_AS_TRIGGER(fcinfo))
{
ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
errmsg("must be called as trigger")));
}
newTuple = triggerData->tg_newtuple;
oldTuple = triggerData->tg_trigtuple;
/* collect logicalrelid for OLD and NEW tuple */
if (oldTuple != NULL)
{
Form_pg_dist_shard distShard = (Form_pg_dist_shard) GETSTRUCT(oldTuple);
oldLogicalRelationId = distShard->logicalrelid;
}
if (newTuple != NULL)
{
Form_pg_dist_shard distShard = (Form_pg_dist_shard) GETSTRUCT(newTuple);
newLogicalRelationId = distShard->logicalrelid;
}
/*
* Invalidate relcache for the relevant relation(s). In theory
* logicalrelid should never change, but it doesn't hurt to be
* paranoid. We ignore the case that there's no corresponding pg_class
* entry - that happens if the pg_dist_shard tuple is deleted after
* the relation has been dropped.
*/
if (oldLogicalRelationId != InvalidOid &&
oldLogicalRelationId != newLogicalRelationId)
{
HeapTuple oldClassTuple =
SearchSysCache1(RELOID, ObjectIdGetDatum(oldLogicalRelationId));
if (HeapTupleIsValid(oldClassTuple))
{
CacheInvalidateRelcacheByTuple(oldClassTuple);
ReleaseSysCache(oldClassTuple);
}
}
if (newLogicalRelationId != InvalidOid)
{
HeapTuple newClassTuple =
SearchSysCache1(RELOID, ObjectIdGetDatum(newLogicalRelationId));
if (HeapTupleIsValid(newClassTuple))
{
CacheInvalidateRelcacheByTuple(newClassTuple);
ReleaseSysCache(newClassTuple);
}
}
PG_RETURN_DATUM(PointerGetDatum(NULL));
}
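/*
 * A sketch of how these trigger functions are presumably wired up in the
 * extension's SQL script (the exact DDL lives there, not in this file):
 *
 *     CREATE TRIGGER dist_partition_cache_invalidate
 *     AFTER INSERT OR UPDATE OR DELETE ON pg_catalog.pg_dist_partition
 *     FOR EACH ROW EXECUTE PROCEDURE master_dist_partition_cache_invalidate();
 *
 * and similarly for pg_dist_shard with master_dist_shard_cache_invalidate().
 */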
/* initialize the infrastructure for the metadata cache */
static void
InitializeDistTableCache(void)
{
HASHCTL info;
/* make sure we've initialized CacheMemoryContext */
if (CacheMemoryContext == NULL)
{
CreateCacheMemoryContext();
}
/* build initial scan keys, copied for every relation scan */
memset(&DistPartitionScanKey, 0, sizeof(DistPartitionScanKey));
fmgr_info_cxt(F_OIDEQ,
&DistPartitionScanKey[0].sk_func,
CacheMemoryContext);
DistPartitionScanKey[0].sk_strategy = BTEqualStrategyNumber;
DistPartitionScanKey[0].sk_subtype = InvalidOid;
DistPartitionScanKey[0].sk_collation = InvalidOid;
DistPartitionScanKey[0].sk_attno = Anum_pg_dist_partition_logicalrelid;
memset(&DistShardScanKey, 0, sizeof(DistShardScanKey));
fmgr_info_cxt(F_OIDEQ,
&DistShardScanKey[0].sk_func,
CacheMemoryContext);
DistShardScanKey[0].sk_strategy = BTEqualStrategyNumber;
DistShardScanKey[0].sk_subtype = InvalidOid;
DistShardScanKey[0].sk_collation = InvalidOid;
DistShardScanKey[0].sk_attno = Anum_pg_dist_shard_logicalrelid;
/* initialize the hash table */
MemSet(&info, 0, sizeof(info));
info.keysize = sizeof(Oid);
info.entrysize = sizeof(DistTableCacheEntry);
info.hash = tag_hash;
DistTableCacheHash =
hash_create("Distributed Relation Cache", 32, &info,
HASH_ELEM | HASH_FUNCTION);
/* Watch for invalidation events. */
CacheRegisterRelcacheCallback(InvalidateDistRelationCacheCallback,
(Datum) 0);
}
/*
* ResetDistTableCacheEntry frees any out-of-band memory used by a cache entry,
* but does not free the entry itself.
*/
static void
ResetDistTableCacheEntry(DistTableCacheEntry *cacheEntry)
{
if (cacheEntry->partitionKeyString != NULL)
{
pfree(cacheEntry->partitionKeyString);
cacheEntry->partitionKeyString = NULL;
}
if (cacheEntry->shardIntervalArrayLength > 0)
{
int i = 0;
for (i = 0; i < cacheEntry->shardIntervalArrayLength; i++)
{
ShardInterval *shardInterval = &cacheEntry->shardIntervalArray[i];
bool valueByVal = shardInterval->valueByVal;
if (!valueByVal)
{
if (shardInterval->minValueExists)
{
pfree(DatumGetPointer(shardInterval->minValue));
}
if (shardInterval->maxValueExists)
{
pfree(DatumGetPointer(shardInterval->maxValue));
}
}
}
pfree(cacheEntry->shardIntervalArray);
cacheEntry->shardIntervalArray = NULL;
cacheEntry->shardIntervalArrayLength = 0;
}
}
/*
* InvalidateDistRelationCacheCallback flushes cache entries when a relation
* is updated (or flushes the entire cache).
*/
static void
InvalidateDistRelationCacheCallback(Datum argument, Oid relationId)
{
/* invalidate either entire cache or a specific entry */
if (relationId == InvalidOid)
{
DistTableCacheEntry *cacheEntry = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, DistTableCacheHash);
while ((cacheEntry = (DistTableCacheEntry *) hash_seq_search(&status)) != NULL)
{
cacheEntry->isValid = false;
}
}
else
{
void *hashKey = (void *) &relationId;
bool foundInCache = false;
DistTableCacheEntry *cacheEntry = hash_search(DistTableCacheHash, hashKey,
HASH_FIND, &foundInCache);
if (foundInCache)
{
cacheEntry->isValid = false;
}
}
}
/*
* LookupDistPartitionTuple searches pg_dist_partition for relationId's entry
* and returns that or, if no matching entry was found, NULL.
*/
static HeapTuple
LookupDistPartitionTuple(Oid relationId)
{
Relation pgDistPartition = NULL;
HeapTuple distPartitionTuple = NULL;
HeapTuple currentPartitionTuple = NULL;
SysScanDesc scanDescriptor;
ScanKeyData scanKey[1];
pgDistPartition = heap_open(DistPartitionRelationId(), AccessShareLock);
/* copy scankey to local copy, it will be modified during the scan */
memcpy(scanKey, DistPartitionScanKey, sizeof(DistPartitionScanKey));
/* set scan arguments */
scanKey[0].sk_argument = ObjectIdGetDatum(relationId);
scanDescriptor = systable_beginscan(pgDistPartition,
DistPartitionLogicalRelidIndexId(),
true, NULL, 1, scanKey);
currentPartitionTuple = systable_getnext(scanDescriptor);
if (HeapTupleIsValid(currentPartitionTuple))
{
Assert(!HeapTupleHasNulls(currentPartitionTuple));
distPartitionTuple = heap_copytuple(currentPartitionTuple);
}
systable_endscan(scanDescriptor);
heap_close(pgDistPartition, NoLock);
return distPartitionTuple;
}
/*
* LookupDistShardTuples returns a list of all dist_shard tuples for the
* specified relation.
*/
static List *
LookupDistShardTuples(Oid relationId)
{
Relation pgDistShard = NULL;
List *distShardTupleList = NIL;
HeapTuple currentShardTuple = NULL;
SysScanDesc scanDescriptor;
ScanKeyData scanKey[1];
pgDistShard = heap_open(DistShardRelationId(), AccessShareLock);
/* copy scankey to local copy, it will be modified during the scan */
memcpy(scanKey, DistShardScanKey, sizeof(DistShardScanKey));
/* set scan arguments */
scanKey[0].sk_argument = ObjectIdGetDatum(relationId);
scanDescriptor = systable_beginscan(pgDistShard, DistShardLogicalRelidIndexId(), true,
NULL, 1, scanKey);
currentShardTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(currentShardTuple))
{
HeapTuple shardTupleCopy = heap_copytuple(currentShardTuple);
distShardTupleList = lappend(distShardTupleList, shardTupleCopy);
currentShardTuple = systable_getnext(scanDescriptor);
}
systable_endscan(scanDescriptor);
heap_close(pgDistShard, AccessShareLock);
return distShardTupleList;
}
/*
* GetPartitionTypeInputInfo populates output parameters with the interval type
* identifier and modifier for the specified partition key/method combination.
*/
static void
GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
Oid *intervalTypeId, int32 *intervalTypeMod)
{
*intervalTypeId = InvalidOid;
*intervalTypeMod = -1;
switch (partitionMethod)
{
case DISTRIBUTE_BY_APPEND:
case DISTRIBUTE_BY_RANGE:
{
Node *partitionNode = stringToNode(partitionKeyString);
Var *partitionColumn = (Var *) partitionNode;
Assert(IsA(partitionNode, Var));
*intervalTypeId = partitionColumn->vartype;
*intervalTypeMod = partitionColumn->vartypmod;
break;
}
case DISTRIBUTE_BY_HASH:
{
*intervalTypeId = INT4OID;
break;
}
default:
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("unsupported table partition type: %c",
partitionMethod)));
}
}
}
/*
* TupleToShardInterval transforms the specified dist_shard tuple into a new
* ShardInterval using the provided descriptor and partition type information.
*/
static ShardInterval *
TupleToShardInterval(HeapTuple heapTuple, TupleDesc tupleDescriptor, Oid intervalTypeId,
int32 intervalTypeMod)
{
ShardInterval *shardInterval = NULL;
bool isNull = false;
bool minValueNull = false;
bool maxValueNull = false;
Oid inputFunctionId = InvalidOid;
Oid typeIoParam = InvalidOid;
Datum relationIdDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_logicalrelid,
tupleDescriptor, &isNull);
Datum shardIdDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardid,
tupleDescriptor, &isNull);
Datum storageTypeDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardstorage,
tupleDescriptor, &isNull);
Datum minValueTextDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardminvalue,
tupleDescriptor, &minValueNull);
Datum maxValueTextDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardmaxvalue,
tupleDescriptor, &maxValueNull);
Oid relationId = DatumGetObjectId(relationIdDatum);
int64 shardId = DatumGetInt64(shardIdDatum);
char storageType = DatumGetChar(storageTypeDatum);
Datum minValue = 0;
Datum maxValue = 0;
bool minValueExists = false;
bool maxValueExists = false;
int16 intervalTypeLen = 0;
bool intervalByVal = false;
char intervalAlign = '0';
char intervalDelim = '0';
if (!minValueNull && !maxValueNull)
{
char *minValueString = TextDatumGetCString(minValueTextDatum);
char *maxValueString = TextDatumGetCString(maxValueTextDatum);
/* TODO: move this up the call stack to avoid per-tuple invocation? */
get_type_io_data(intervalTypeId, IOFunc_input, &intervalTypeLen, &intervalByVal,
&intervalAlign, &intervalDelim, &typeIoParam, &inputFunctionId);
/* finally convert min/max values to their actual types */
minValue = OidInputFunctionCall(inputFunctionId, minValueString,
typeIoParam, intervalTypeMod);
maxValue = OidInputFunctionCall(inputFunctionId, maxValueString,
typeIoParam, intervalTypeMod);
minValueExists = true;
maxValueExists = true;
}
shardInterval = CitusMakeNode(ShardInterval);
shardInterval->relationId = relationId;
shardInterval->storageType = storageType;
shardInterval->valueTypeId = intervalTypeId;
shardInterval->valueTypeLen = intervalTypeLen;
shardInterval->valueByVal = intervalByVal;
shardInterval->minValueExists = minValueExists;
shardInterval->maxValueExists = maxValueExists;
shardInterval->minValue = minValue;
shardInterval->maxValue = maxValue;
shardInterval->shardId = shardId;
return shardInterval;
}
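/*
 * As a concrete example of the conversion above: for a hash-partitioned
 * table GetPartitionTypeInputInfo reports INT4OID, so shard bounds stored
 * as the texts "-2147483648" and "-1073741825" are converted back into
 * int4 datums through the type's input function (values illustrative).
 */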
/*
* CachedRelationLookup performs a cached lookup for the relation
* relationName, with the result cached in *cachedOid.
*
* NB: The way this is cached means the result will be wrong after the
 * extension is dropped and recreated. A reconnect fixes that though, so that
* seems acceptable.
*/
static void
CachedRelationLookup(const char *relationName, Oid *cachedOid)
{
if (*cachedOid == InvalidOid)
{
*cachedOid = get_relname_relid(relationName, PG_CATALOG_NAMESPACE);
if (*cachedOid == InvalidOid)
{
ereport(ERROR, (errmsg("cache lookup failed for %s, called to early?",
relationName)));
}
}
}


@ -0,0 +1,148 @@
/*-------------------------------------------------------------------------
*
* multi_resowner.c
* CitusDB resource owner integration
*
* An extension can't directly add members to ResourceOwnerData. Instead we
* have to use the resource owner callback mechanism. Right now it's
 * sufficient to have an array of referenced resources - there basically are
* never more than a handful of entries, if that. If that changes we should
* probably rather use a hash table using the pointer value of the resource
* owner as key.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "distributed/multi_server_executor.h"
#include "utils/memutils.h"
#include "utils/resowner_private.h"
#include "distributed/multi_resowner.h"
typedef struct JobDirectoryEntry
{
	ResourceOwner owner;
	uint64 jobId;
} JobDirectoryEntry;
static bool RegisteredResownerCallback = false;
JobDirectoryEntry *RegisteredJobDirectories = NULL;
size_t NumRegisteredJobDirectories = 0;
size_t NumAllocatedJobDirectories = 0;
/*
* Resource owner callback - release resources still held by the resource
* owner.
*/
static void
MultiResourceOwnerReleaseCallback(ResourceReleasePhase phase,
bool isCommit,
bool isTopLevel,
void *arg)
{
int lastJobIndex = NumRegisteredJobDirectories - 1;
int jobIndex = 0;
if (phase == RESOURCE_RELEASE_AFTER_LOCKS)
{
/*
* Remove all remaining job directories, after locks have been
* released.
*/
for (jobIndex = lastJobIndex; jobIndex >= 0; jobIndex--)
{
JobDirectoryEntry *entry = &RegisteredJobDirectories[jobIndex];
if (entry->owner == CurrentResourceOwner)
{
RemoveJobDirectory(entry->jobId);
}
}
}
}
/*
* ResourceOwnerEnlargeJobDirectories makes sure that there is space to
* reference at least one more job directory for the resource owner. Note that
* we only expect one job directory per portal, but we still use an array
* here.
*
* This function is separate from the one actually inserting an entry because
* if we run out of memory, it's critical to do so *before* acquiring the
* resource.
*/
void
ResourceOwnerEnlargeJobDirectories(ResourceOwner owner)
{
int newMax = 0;
/* ensure callback is registered */
if (!RegisteredResownerCallback)
{
RegisterResourceReleaseCallback(MultiResourceOwnerReleaseCallback, NULL);
RegisteredResownerCallback = true;
}
if (RegisteredJobDirectories == NULL)
{
newMax = 16;
RegisteredJobDirectories = (JobDirectoryEntry *)
MemoryContextAlloc(TopMemoryContext, newMax * sizeof(JobDirectoryEntry));
NumAllocatedJobDirectories = newMax;
}
else if (NumRegisteredJobDirectories + 1 > NumAllocatedJobDirectories)
{
newMax = NumAllocatedJobDirectories * 2;
RegisteredJobDirectories = (JobDirectoryEntry *)
repalloc(RegisteredJobDirectories, newMax * sizeof(JobDirectoryEntry));
NumAllocatedJobDirectories = newMax;
}
}
/* Remembers that a temporary job directory is owned by a resource owner. */
void
ResourceOwnerRememberJobDirectory(ResourceOwner owner, uint64 jobId)
{
JobDirectoryEntry *entry = NULL;
Assert(NumRegisteredJobDirectories + 1 <= NumAllocatedJobDirectories);
entry = &RegisteredJobDirectories[NumRegisteredJobDirectories];
entry->owner = owner;
entry->jobId = jobId;
NumRegisteredJobDirectories++;
}
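/*
 * Usage sketch (illustrative; the directory-creating caller is hypothetical):
 * space is reserved while running out of memory is still harmless, and the
 * resource is only recorded once it actually exists:
 *
 *     ResourceOwnerEnlargeJobDirectories(CurrentResourceOwner);
 *     CreateJobDirectory(jobId);
 *     ResourceOwnerRememberJobDirectory(CurrentResourceOwner, jobId);
 */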
/* Forgets that a temporary job directory is owned by a resource owner. */
void
ResourceOwnerForgetJobDirectory(ResourceOwner owner, uint64 jobId)
{
int lastJobIndex = NumRegisteredJobDirectories - 1;
int jobIndex = 0;
for (jobIndex = lastJobIndex; jobIndex >= 0; jobIndex--)
{
JobDirectoryEntry *entry = &RegisteredJobDirectories[jobIndex];
if (entry->owner == owner && entry->jobId == jobId)
{
/* move all later entries one up */
while (jobIndex < lastJobIndex)
{
RegisteredJobDirectories[jobIndex] = RegisteredJobDirectories[jobIndex + 1];
jobIndex++;
}
NumRegisteredJobDirectories = lastJobIndex;
return;
}
}
elog(ERROR, "jobId " UINT64_FORMAT " is not owned by resource owner %p",
jobId, owner);
}


@ -0,0 +1,118 @@
/*-------------------------------------------------------------------------
*
* resource_lock.c
* Locking Infrastructure for CitusDB.
*
* To avoid introducing a new type of locktag - that then could not be
* displayed by core functionality - we reuse advisory locks. If we'd just
 * reused them directly we'd run the risk of conflicting with user-defined
 * advisory locks, but luckily advisory locks use only two values for 'field4'
 * in the locktag.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "distributed/resource_lock.h"
#include "storage/lmgr.h"
/*
* LockShardDistributionMetadata returns after grabbing a lock for distribution
* metadata related to the specified shard, blocking if required. ExclusiveLock
* and ShareLock modes are supported. Any locks acquired using this method are
* released at transaction end.
*/
void
LockShardDistributionMetadata(int64 shardId, LOCKMODE lockMode)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
SET_LOCKTAG_SHARD_METADATA_RESOURCE(tag, MyDatabaseId, shardId);
(void) LockAcquire(&tag, lockMode, sessionLock, dontWait);
}
/*
 * LockRelationDistributionMetadata returns after getting the lock used for a
* relation's distribution metadata, blocking if required. Only ExclusiveLock
* and ShareLock modes are supported. Any locks acquired using this method are
* released at transaction end.
*/
void
LockRelationDistributionMetadata(Oid relationId, LOCKMODE lockMode)
{
Assert(lockMode == ExclusiveLock || lockMode == ShareLock);
(void) LockRelationOid(relationId, lockMode);
}
/*
* LockShardResource acquires a lock needed to modify data on a remote shard.
* This task may be assigned to multiple backends at the same time, so the lock
* manages any concurrency issues associated with shard file fetching and DML
* command execution.
*/
void
LockShardResource(uint64 shardId, LOCKMODE lockmode)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
SET_LOCKTAG_SHARD_RESOURCE(tag, MyDatabaseId, shardId);
(void) LockAcquire(&tag, lockmode, sessionLock, dontWait);
}
/* Releases the lock associated with the relay file fetching/DML task. */
void
UnlockShardResource(uint64 shardId, LOCKMODE lockmode)
{
LOCKTAG tag;
const bool sessionLock = false;
SET_LOCKTAG_SHARD_RESOURCE(tag, MyDatabaseId, shardId);
LockRelease(&tag, lockmode, sessionLock);
}
/*
* LockJobResource acquires a lock for creating resources associated with the
* given jobId. This resource is typically a job schema (namespace), and less
* commonly a partition task directory.
*/
void
LockJobResource(uint64 jobId, LOCKMODE lockmode)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
SET_LOCKTAG_JOB_RESOURCE(tag, MyDatabaseId, jobId);
(void) LockAcquire(&tag, lockmode, sessionLock, dontWait);
}
/* Releases the lock for resources associated with the given job id. */
void
UnlockJobResource(uint64 jobId, LOCKMODE lockmode)
{
LOCKTAG tag;
const bool sessionLock = false;
SET_LOCKTAG_JOB_RESOURCE(tag, MyDatabaseId, jobId);
LockRelease(&tag, lockmode, sessionLock);
}
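/*
 * Typical pairing (illustrative): callers bracket creation of job-scoped
 * resources with these functions, e.g.
 *
 *     LockJobResource(jobId, AccessExclusiveLock);
 *     ... create job schema or partition task directory ...
 *     UnlockJobResource(jobId, AccessExclusiveLock);
 */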

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,387 @@
/*-------------------------------------------------------------------------
*
* task_tracker_protocol.c
*
* The task tracker background process runs on every worker node. The following
* routines allow for the master node to assign tasks to the task tracker, check
* these tasks' statuses, and remove these tasks when they are no longer needed.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include <time.h>
#include "access/xact.h"
#include "commands/dbcommands.h"
#include "commands/schemacmds.h"
#include "distributed/multi_client_executor.h"
#include "distributed/multi_server_executor.h"
#include "distributed/resource_lock.h"
#include "distributed/task_tracker.h"
#include "distributed/task_tracker_protocol.h"
#include "distributed/worker_protocol.h"
#include "storage/lwlock.h"
#include "storage/pmsignal.h"
#include "utils/builtins.h"
/* Local functions forward declarations */
static bool TaskTrackerRunning(void);
static void CreateJobSchema(StringInfo schemaName);
static void CreateTask(uint64 jobId, uint32 taskId, char *taskCallString);
static void UpdateTask(WorkerTask *workerTask, char *taskCallString);
static void CleanupTask(WorkerTask *workerTask);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(task_tracker_assign_task);
PG_FUNCTION_INFO_V1(task_tracker_task_status);
PG_FUNCTION_INFO_V1(task_tracker_cleanup_job);
/*
* task_tracker_assign_task creates a new task in the shared hash or updates an
* already existing task. The function also creates a schema for the job if it
* doesn't already exist.
*/
Datum
task_tracker_assign_task(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
uint32 taskId = PG_GETARG_UINT32(1);
text *taskCallStringText = PG_GETARG_TEXT_P(2);
StringInfo jobSchemaName = JobSchemaName(jobId);
bool schemaExists = false;
WorkerTask *workerTask = NULL;
char *taskCallString = text_to_cstring(taskCallStringText);
uint32 taskCallStringLength = strlen(taskCallString);
/* check that we have a running task tracker on this host */
bool taskTrackerRunning = TaskTrackerRunning();
if (!taskTrackerRunning)
{
ereport(ERROR, (errcode(ERRCODE_CANNOT_CONNECT_NOW),
errmsg("the task tracker has been disabled or shut down")));
}
/* check that we have enough space in our shared hash for this string */
if (taskCallStringLength >= TASK_CALL_STRING_SIZE)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("task call string exceeds maximum assignable length")));
}
/*
* If the schema does not exist, we create it. However, the schema does not
* become visible to other processes until the transaction commits, and we
* therefore do not release the resource lock in this case. Otherwise, the
* schema is already visible, and we immediately release the resource lock.
*/
LockJobResource(jobId, AccessExclusiveLock);
schemaExists = JobSchemaExists(jobSchemaName);
if (!schemaExists)
{
/* lock gets automatically released upon return from this function */
CreateJobSchema(jobSchemaName);
}
else
{
UnlockJobResource(jobId, AccessExclusiveLock);
}
LWLockAcquire(WorkerTasksSharedState->taskHashLock, LW_EXCLUSIVE);
/* check if we already have the task in our shared hash */
workerTask = WorkerTasksHashFind(jobId, taskId);
if (workerTask == NULL)
{
CreateTask(jobId, taskId, taskCallString);
}
else
{
UpdateTask(workerTask, taskCallString);
}
LWLockRelease(WorkerTasksSharedState->taskHashLock);
PG_RETURN_VOID();
}
/* Returns the task status of an already existing task. */
Datum
task_tracker_task_status(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
uint32 taskId = PG_GETARG_UINT32(1);
WorkerTask *workerTask = NULL;
uint32 taskStatus = 0;
bool taskTrackerRunning = TaskTrackerRunning();
if (taskTrackerRunning)
{
LWLockAcquire(WorkerTasksSharedState->taskHashLock, LW_SHARED);
workerTask = WorkerTasksHashFind(jobId, taskId);
if (workerTask == NULL)
{
ereport(ERROR, (errmsg("could not find the worker task"),
errdetail("Task jobId: " UINT64_FORMAT " and taskId: %u",
jobId, taskId)));
}
taskStatus = (uint32) workerTask->taskStatus;
LWLockRelease(WorkerTasksSharedState->taskHashLock);
}
else
{
ereport(ERROR, (errcode(ERRCODE_CANNOT_CONNECT_NOW),
errmsg("the task tracker has been disabled or shut down")));
}
PG_RETURN_UINT32(taskStatus);
}
/*
* task_tracker_cleanup_job finds all tasks for the given job, and cleans up
 * files, connections, and shared hash entries associated with these tasks.
*/
Datum
task_tracker_cleanup_job(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
HASH_SEQ_STATUS status;
WorkerTask *currentTask = NULL;
StringInfo jobDirectoryName = NULL;
StringInfo jobSchemaName = NULL;
/*
* We first clean up any open connections, and remove tasks belonging to
* this job from the shared hash.
*/
LWLockAcquire(WorkerTasksSharedState->taskHashLock, LW_EXCLUSIVE);
hash_seq_init(&status, WorkerTasksSharedState->taskHash);
currentTask = (WorkerTask *) hash_seq_search(&status);
while (currentTask != NULL)
{
if (currentTask->jobId == jobId)
{
CleanupTask(currentTask);
}
currentTask = (WorkerTask *) hash_seq_search(&status);
}
LWLockRelease(WorkerTasksSharedState->taskHashLock);
/*
* We then delete the job directory and schema, if they exist. This cleans
* up all intermediate files and tables allocated for the job. Note that the
* schema drop call can block if another process is creating the schema or
* writing to a table within the schema.
*/
jobDirectoryName = JobDirectoryName(jobId);
RemoveDirectory(jobDirectoryName);
LockJobResource(jobId, AccessExclusiveLock);
jobSchemaName = JobSchemaName(jobId);
RemoveJobSchema(jobSchemaName);
UnlockJobResource(jobId, AccessExclusiveLock);
PG_RETURN_VOID();
}
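/*
 * Illustrative master-side usage of the three UDFs above (the jobId and
 * taskId values are made up; in practice the master issues these remotely):
 *
 *     SELECT task_tracker_assign_task(6, 101, 'SELECT 1');
 *     SELECT task_tracker_task_status(6, 101);
 *     SELECT task_tracker_cleanup_job(6);
 */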
/*
* TaskTrackerRunning checks if the task tracker process is running. To do this,
* the function checks if the task tracker is configured to start up, and infers
* from shared memory that the tracker hasn't received a shut down request.
*/
static bool
TaskTrackerRunning(void)
{
WorkerTask *workerTask = NULL;
bool postmasterAlive = true;
bool taskTrackerRunning = true;
/* if postmaster shut down, infer task tracker shut down from it */
postmasterAlive = PostmasterIsAlive();
if (!postmasterAlive)
{
return false;
}
/*
* When the task tracker receives a termination signal, it inserts a special
* marker task to the shared hash. We need to look up this marker task since
* the postmaster doesn't send a terminate signal to running backends.
*/
LWLockAcquire(WorkerTasksSharedState->taskHashLock, LW_SHARED);
workerTask = WorkerTasksHashFind(RESERVED_JOB_ID, SHUTDOWN_MARKER_TASK_ID);
if (workerTask != NULL)
{
taskTrackerRunning = false;
}
LWLockRelease(WorkerTasksSharedState->taskHashLock);
return taskTrackerRunning;
}
/*
* CreateJobSchema creates a job schema with the given schema name. Note that
* this function ensures that our pg_ prefixed schema names can be created.
* Further note that the created schema does not become visible to other
* processes until the transaction commits.
*/
static void
CreateJobSchema(StringInfo schemaName)
{
const char *queryString = NULL;
bool oldAllowSystemTableMods = false;
CreateSchemaStmt *createSchemaStmt = makeNode(CreateSchemaStmt);
createSchemaStmt->schemaname = schemaName->data;
#if (PG_VERSION_NUM >= 90500)
createSchemaStmt->authrole = NULL;
#else
createSchemaStmt->authid = NULL;
#endif
createSchemaStmt->schemaElts = NIL;
/* allow schema names that start with pg_ */
oldAllowSystemTableMods = allowSystemTableMods;
allowSystemTableMods = true;
CreateSchemaCommand(createSchemaStmt, queryString);
CommandCounterIncrement();
allowSystemTableMods = oldAllowSystemTableMods;
}
/*
* CreateTask creates a new task in shared hash, initializes the task, and sets
* the task to assigned state. Note that this function expects the caller to
* hold an exclusive lock over the shared hash.
*/
static void
CreateTask(uint64 jobId, uint32 taskId, char *taskCallString)
{
WorkerTask *workerTask = NULL;
uint32 assignmentTime = 0;
char *databaseName = get_database_name(MyDatabaseId);
/* increase task priority for cleanup tasks */
assignmentTime = (uint32) time(NULL);
if (taskId == JOB_CLEANUP_TASK_ID)
{
assignmentTime = HIGH_PRIORITY_TASK_TIME;
}
/* enter the worker task into shared hash and initialize the task */
workerTask = WorkerTasksHashEnter(jobId, taskId);
workerTask->assignedAt = assignmentTime;
strncpy(workerTask->taskCallString, taskCallString, TASK_CALL_STRING_SIZE);
workerTask->taskStatus = TASK_ASSIGNED;
workerTask->connectionId = INVALID_CONNECTION_ID;
workerTask->failureCount = 0;
strncpy(workerTask->databaseName, databaseName, NAMEDATALEN);
}
/*
* UpdateTask updates the call string text for an already existing task. Note
* that this function expects the caller to hold an exclusive lock over the
* shared hash.
*/
static void
UpdateTask(WorkerTask *workerTask, char *taskCallString)
{
TaskStatus taskStatus = TASK_STATUS_INVALID_FIRST;
taskStatus = workerTask->taskStatus;
Assert(taskStatus != TASK_STATUS_INVALID_FIRST);
/*
* 1. If the task has succeeded or has been canceled, we don't do anything.
* 2. If the task has permanently failed, we update the task call string,
* reset the failure count, and change the task's status to schedulable.
	 * 3. If the task is still in progress, we update the task call string, and reset
* the failure count.
*/
if (taskStatus == TASK_SUCCEEDED || taskStatus == TASK_CANCEL_REQUESTED ||
taskStatus == TASK_CANCELED)
{
; /* nothing to do */
}
else if (taskStatus == TASK_PERMANENTLY_FAILED)
{
strncpy(workerTask->taskCallString, taskCallString, TASK_CALL_STRING_SIZE);
workerTask->failureCount = 0;
workerTask->taskStatus = TASK_ASSIGNED;
}
else
{
strncpy(workerTask->taskCallString, taskCallString, TASK_CALL_STRING_SIZE);
workerTask->failureCount = 0;
}
}
/* Cleans up connection and shared hash entry associated with the given task. */
static void
CleanupTask(WorkerTask *workerTask)
{
WorkerTask *taskRemoved = NULL;
void *hashKey = (void *) workerTask;
/*
* If the connection is still valid, the master node decided to terminate
* the task prematurely. This can happen when the user wants to cancel the
* query, or when a speculatively executed task finishes elsewhere and the
* query completes.
*/
if (workerTask->connectionId != INVALID_CONNECTION_ID)
{
/*
* The task tracker process owns the connections to local backends, and
		 * we cannot interfere with those connections from another process. We
* therefore ask the task tracker to clean up the connection and to
* remove the task from the shared hash. Note that one of the cleaned up
* tasks will always be the clean-up task itself.
*/
ereport(DEBUG3, (errmsg("requesting cancel for worker task"),
errdetail("Task jobId: " UINT64_FORMAT " and taskId: %u",
workerTask->jobId, workerTask->taskId)));
workerTask->taskStatus = TASK_CANCEL_REQUESTED;
return;
}
/* remove the task from the shared hash */
taskRemoved = hash_search(WorkerTasksSharedState->taskHash, hashKey, HASH_REMOVE,
NULL);
if (taskRemoved == NULL)
{
ereport(FATAL, (errmsg("worker task hash corrupted")));
}
}

File diff suppressed because it is too large


@ -0,0 +1,87 @@
/*-------------------------------------------------------------------------
*
* worker_file_access_protocol.c
*
* Routines for accessing file related information on this worker node.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "commands/defrem.h"
#include "distributed/master_protocol.h"
#include "distributed/worker_protocol.h"
#include "foreign/foreign.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(worker_foreign_file_path);
PG_FUNCTION_INFO_V1(worker_find_block_local_path);
/*
* worker_foreign_file_path resolves the foreign table for the given table name,
* and extracts and returns the file path associated with that foreign table.
*/
Datum
worker_foreign_file_path(PG_FUNCTION_ARGS)
{
text *foreignTableName = PG_GETARG_TEXT_P(0);
text *foreignFilePath = NULL;
Oid relationId = ResolveRelationId(foreignTableName);
ForeignTable *foreignTable = GetForeignTable(relationId);
ListCell *optionCell = NULL;
foreach(optionCell, foreignTable->options)
{
DefElem *option = (DefElem *) lfirst(optionCell);
char *optionName = option->defname;
int compareResult = strncmp(optionName, FOREIGN_FILENAME_OPTION, MAXPGPATH);
if (compareResult == 0)
{
char *optionValue = defGetString(option);
foreignFilePath = cstring_to_text(optionValue);
break;
}
}
/* check that we found the filename option */
if (foreignFilePath == NULL)
{
char *relationName = get_rel_name(relationId);
ereport(ERROR, (errmsg("could not find filename for foreign table: \"%s\"",
relationName)));
}
PG_RETURN_TEXT_P(foreignFilePath);
}
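/*
 * Example invocation (illustrative table name):
 *
 *     SELECT worker_foreign_file_path('lineitem_102009');
 *
 * returns the value of the foreign table's filename option.
 */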
/*
* Protocol declaration for a function whose future implementation will find the
* given HDFS block's local file path.
*/
Datum
worker_find_block_local_path(PG_FUNCTION_ARGS)
{
int64 blockId = PG_GETARG_INT64(0);
ArrayType *dataDirectoryObject = PG_GETARG_ARRAYTYPE_P(1);
/* keep the compiler silent */
(void) blockId;
(void) dataDirectoryObject;
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("called function is currently unsupported")));
PG_RETURN_TEXT_P(NULL);
}


@ -0,0 +1,547 @@
/*-------------------------------------------------------------------------
*
* worker_merge_protocol.c
*
* Routines for merging partitioned files into a single file or table. Merging
 * files is one of the three distributed execution primitives that we apply on
* worker nodes.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#ifdef HAVE_INTTYPES_H
#include <inttypes.h>
#endif
#include "access/htup_details.h"
#include "access/xact.h"
#include "catalog/dependency.h"
#include "catalog/pg_namespace.h"
#include "commands/copy.h"
#include "commands/tablecmds.h"
#include "distributed/worker_protocol.h"
#include "executor/spi.h"
#include "nodes/makefuncs.h"
#include "parser/parse_type.h"
#include "storage/lmgr.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
/* Local functions forward declarations */
static List * ArrayObjectToCStringList(ArrayType *arrayObject);
static void CreateTaskTable(StringInfo schemaName, StringInfo relationName,
List *columnNameList, List *columnTypeList);
static void CopyTaskFilesFromDirectory(StringInfo schemaName, StringInfo relationName,
StringInfo sourceDirectoryName);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(worker_merge_files_into_table);
PG_FUNCTION_INFO_V1(worker_merge_files_and_run_query);
PG_FUNCTION_INFO_V1(worker_cleanup_job_schema_cache);
/*
* worker_merge_files_into_table creates a task table within the job's schema,
* which should have already been created by the task tracker protocol, and
* copies files in its task directory into this table. If the schema doesn't
* exist, the function defaults to the 'public' schema. Note that, unlike
* partitioning functions, this function is not always idempotent. On success,
* the function creates the table and loads data, and subsequent calls to the
* function error out because the table already exist. On failure, the task
* table creation commands are rolled back, and the function can be called
* again.
*/
Datum
worker_merge_files_into_table(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
uint32 taskId = PG_GETARG_UINT32(1);
ArrayType *columnNameObject = PG_GETARG_ARRAYTYPE_P(2);
ArrayType *columnTypeObject = PG_GETARG_ARRAYTYPE_P(3);
StringInfo jobSchemaName = JobSchemaName(jobId);
StringInfo taskTableName = TaskTableName(taskId);
StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
bool schemaExists = false;
List *columnNameList = NIL;
List *columnTypeList = NIL;
/* we should have the same number of column names and types */
int32 columnNameCount = ArrayObjectCount(columnNameObject);
int32 columnTypeCount = ArrayObjectCount(columnTypeObject);
if (columnNameCount != columnTypeCount)
{
ereport(ERROR, (errmsg("column name array size: %d and type array size: %d"
" do not match", columnNameCount, columnTypeCount)));
}
/*
* If the schema for the job isn't already created by the task tracker
	 * protocol, we fall back to using the default 'public' schema.
*/
schemaExists = JobSchemaExists(jobSchemaName);
if (!schemaExists)
{
resetStringInfo(jobSchemaName);
appendStringInfoString(jobSchemaName, "public");
}
/* create the task table and copy files into the table */
columnNameList = ArrayObjectToCStringList(columnNameObject);
columnTypeList = ArrayObjectToCStringList(columnTypeObject);
CreateTaskTable(jobSchemaName, taskTableName, columnNameList, columnTypeList);
CopyTaskFilesFromDirectory(jobSchemaName, taskTableName, taskDirectoryName);
PG_RETURN_VOID();
}
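/*
 * Example invocation (illustrative values; the column name and type arrays
 * must have matching lengths):
 *
 *     SELECT worker_merge_files_into_table(6, 42,
 *                                          ARRAY['l_orderkey', 'l_quantity'],
 *                                          ARRAY['bigint', 'decimal(15,2)']);
 */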
/*
* worker_merge_files_and_run_query creates a merge task table within the job's
* schema, which should have already been created by the task tracker protocol.
* It copies files in its task directory into this table. Then it runs final
* query to create result table of the job.
*
 * Note that we follow a different approach here than worker_merge_files_into_table()
 * when creating the merge task table. In the future we should unify these two
 * approaches, for instance by creating a directory_fdw extension and using it;
 * files could then be merged through directory_fdw, with or without a query.
*/
Datum
worker_merge_files_and_run_query(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
uint32 taskId = PG_GETARG_UINT32(1);
text *createMergeTableQueryText = PG_GETARG_TEXT_P(2);
text *createIntermediateTableQueryText = PG_GETARG_TEXT_P(3);
const char *createMergeTableQuery = text_to_cstring(createMergeTableQueryText);
const char *createIntermediateTableQuery =
text_to_cstring(createIntermediateTableQueryText);
StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
StringInfo jobSchemaName = JobSchemaName(jobId);
StringInfo intermediateTableName = TaskTableName(taskId);
StringInfo mergeTableName = makeStringInfo();
StringInfo setSearchPathString = makeStringInfo();
bool schemaExists = false;
int connected = 0;
int setSearchPathResult = 0;
int createMergeTableResult = 0;
int createIntermediateTableResult = 0;
int finished = 0;
/*
* If the schema for the job isn't already created by the task tracker
	 * protocol, we fall back to using the default 'public' schema.
*/
schemaExists = JobSchemaExists(jobSchemaName);
if (!schemaExists)
{
resetStringInfo(jobSchemaName);
appendStringInfoString(jobSchemaName, "public");
}
appendStringInfo(setSearchPathString, SET_SEARCH_PATH_COMMAND, jobSchemaName->data);
connected = SPI_connect();
if (connected != SPI_OK_CONNECT)
{
ereport(ERROR, (errmsg("could not connect to SPI manager")));
}
setSearchPathResult = SPI_exec(setSearchPathString->data, 0);
if (setSearchPathResult < 0)
{
ereport(ERROR, (errmsg("execution was not successful \"%s\"",
setSearchPathString->data)));
}
createMergeTableResult = SPI_exec(createMergeTableQuery, 0);
if (createMergeTableResult < 0)
{
ereport(ERROR, (errmsg("execution was not successful \"%s\"",
createMergeTableQuery)));
}
appendStringInfo(mergeTableName, "%s%s", intermediateTableName->data,
MERGE_TABLE_SUFFIX);
CopyTaskFilesFromDirectory(jobSchemaName, mergeTableName, taskDirectoryName);
createIntermediateTableResult = SPI_exec(createIntermediateTableQuery, 0);
if (createIntermediateTableResult < 0)
{
ereport(ERROR, (errmsg("execution was not successful \"%s\"",
createIntermediateTableQuery)));
}
finished = SPI_finish();
if (finished != SPI_OK_FINISH)
{
ereport(ERROR, (errmsg("could not disconnect from SPI manager")));
}
PG_RETURN_VOID();
}
/*
* worker_cleanup_job_schema_cache walks over all schemas in the database, and
* removes schemas whose names start with the job schema prefix. Note that this
* function does not perform any locking; we expect it to be called at process
* start-up time before any merge tasks are run. Further note that this function
* runs within the scope of a particular database (template1, postgres) and can
* only delete schemas within that database.
*/
Datum
worker_cleanup_job_schema_cache(PG_FUNCTION_ARGS)
{
Relation pgNamespace = NULL;
HeapScanDesc scanDescriptor = NULL;
ScanKey scanKey = NULL;
int scanKeyCount = 0;
HeapTuple heapTuple = NULL;
pgNamespace = heap_open(NamespaceRelationId, AccessExclusiveLock);
scanDescriptor = heap_beginscan_catalog(pgNamespace, scanKeyCount, scanKey);
heapTuple = heap_getnext(scanDescriptor, ForwardScanDirection);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_namespace schemaForm = (Form_pg_namespace) GETSTRUCT(heapTuple);
char *schemaName = NameStr(schemaForm->nspname);
char *jobSchemaFound = strstr(schemaName, JOB_SCHEMA_PREFIX);
if (jobSchemaFound != NULL)
{
StringInfo jobSchemaName = makeStringInfo();
appendStringInfoString(jobSchemaName, schemaName);
RemoveJobSchema(jobSchemaName);
}
heapTuple = heap_getnext(scanDescriptor, ForwardScanDirection);
}
heap_endscan(scanDescriptor);
heap_close(pgNamespace, AccessExclusiveLock);
PG_RETURN_VOID();
}
/* Constructs a standardized job schema name for the given job id. */
StringInfo
JobSchemaName(uint64 jobId)
{
/*
* We need to apply padding on our 64-bit job id, and therefore cannot use
* UINT64_FORMAT here.
*/
#ifdef HAVE_INTTYPES_H
StringInfo jobSchemaName = makeStringInfo();
appendStringInfo(jobSchemaName, "%s%0*"PRIu64,
JOB_SCHEMA_PREFIX, MIN_JOB_DIRNAME_WIDTH, jobId);
#else
StringInfo jobSchemaName = makeStringInfo();
appendStringInfo(jobSchemaName, "%s%0*llu",
JOB_SCHEMA_PREFIX, MIN_JOB_DIRNAME_WIDTH, jobId);
#endif
return jobSchemaName;
}
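/*
 * For example, with a prefix of "pg_merge_job_" and a minimum width of six
 * (illustrative values; the real ones come from JOB_SCHEMA_PREFIX and
 * MIN_JOB_DIRNAME_WIDTH), jobId 42 formats as "pg_merge_job_000042".
 */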
/* Constructs a standardized task table name for the given task id. */
StringInfo
TaskTableName(uint32 taskId)
{
StringInfo taskTableName = makeStringInfo();
appendStringInfo(taskTableName, "%s%0*u",
TASK_TABLE_PREFIX, MIN_TASK_FILENAME_WIDTH, taskId);
return taskTableName;
}
/* Creates a list of cstrings from a single dimensional array object. */
static List *
ArrayObjectToCStringList(ArrayType *arrayObject)
{
List *cstringList = NIL;
Datum *datumArray = DeconstructArrayObject(arrayObject);
int32 arraySize = ArrayObjectCount(arrayObject);
int32 arrayIndex = 0;
for (arrayIndex = 0; arrayIndex < arraySize; arrayIndex++)
{
Datum datum = datumArray[arrayIndex];
char *cstring = TextDatumGetCString(datum);
cstringList = lappend(cstringList, cstring);
}
Assert(cstringList != NIL);
return cstringList;
}
/* Checks if a schema with the given schema name exists. */
bool
JobSchemaExists(StringInfo schemaName)
{
Datum schemaNameDatum = CStringGetDatum(schemaName->data);
bool schemaExists = SearchSysCacheExists(NAMESPACENAME, schemaNameDatum, 0, 0, 0);
return schemaExists;
}
/* Removes the schema and all tables within the schema, if the schema exists. */
void
RemoveJobSchema(StringInfo schemaName)
{
Datum schemaNameDatum = CStringGetDatum(schemaName->data);
Oid schemaId = InvalidOid;
schemaId = GetSysCacheOid(NAMESPACENAME, schemaNameDatum, 0, 0, 0);
if (OidIsValid(schemaId))
{
ObjectAddress schemaObject = { 0, 0, 0 };
bool showNotices = false;
bool permissionsOK = pg_namespace_ownercheck(schemaId, GetUserId());
if (!permissionsOK)
{
aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_NAMESPACE, schemaName->data);
}
schemaObject.classId = NamespaceRelationId;
schemaObject.objectId = schemaId;
schemaObject.objectSubId = 0;
/*
* We first delete all tables in this schema. Rather than relying on the
* schema command, we call the dependency mechanism directly so that we
* can suppress notice messages that are typically displayed during
* cascading deletes.
*/
deleteWhatDependsOn(&schemaObject, showNotices);
CommandCounterIncrement();
/* drop the empty schema */
performDeletion(&schemaObject, DROP_RESTRICT, 0);
CommandCounterIncrement();
}
else
{
ereport(DEBUG2, (errmsg("schema \"%s\" does not exist, skipping",
schemaName->data)));
}
}
/* Creates a simple table that only defines columns, in the given schema. */
static void
CreateTaskTable(StringInfo schemaName, StringInfo relationName,
List *columnNameList, List *columnTypeList)
{
CreateStmt *createStatement = NULL;
RangeVar *relation = NULL;
List *columnDefinitionList = NIL;
Oid relationId = InvalidOid;
#if (PG_VERSION_NUM >= 90500)
ObjectAddress relationObject;
#endif
Assert(schemaName != NULL);
Assert(relationName != NULL);
/*
* This new relation doesn't log to WAL, as the table creation and data copy
	 * statements occur in the same transaction. Still, we may want to make
	 * the relation explicitly unlogged in the future.
*/
relation = makeRangeVar(schemaName->data, relationName->data, -1);
columnDefinitionList = ColumnDefinitionList(columnNameList, columnTypeList);
createStatement = CreateStatement(relation, columnDefinitionList);
#if (PG_VERSION_NUM >= 90500)
relationObject = DefineRelation(createStatement, RELKIND_RELATION, InvalidOid, NULL);
relationId = relationObject.objectId;
#else
relationId = DefineRelation(createStatement, RELKIND_RELATION, InvalidOid);
#endif
Assert(relationId != InvalidOid);
CommandCounterIncrement();
}
/*
* ColumnDefinitionList creates and returns a list of column definition objects
* from two lists of column names and types. As an example, this function takes
* in two single elements lists: "l_quantity" and "decimal(15, 2)". The function
* then returns a list with one column definition, where the column's name is
* l_quantity, its type is numeric, and the type modifier represents (15, 2).
*/
List *
ColumnDefinitionList(List *columnNameList, List *columnTypeList)
{
List *columnDefinitionList = NIL;
ListCell *columnNameCell = NULL;
ListCell *columnTypeCell = NULL;
forboth(columnNameCell, columnNameList, columnTypeCell, columnTypeList)
{
const char *columnName = (const char *) lfirst(columnNameCell);
const char *columnType = (const char *) lfirst(columnTypeCell);
/*
* We should have a SQL compatible column type declaration; we first
* convert this type to PostgreSQL's type identifiers and modifiers.
*/
Oid columnTypeId = InvalidOid;
int32 columnTypeMod = -1;
bool missingOK = false;
TypeName *typeName = NULL;
ColumnDef *columnDefinition = NULL;
parseTypeString(columnType, &columnTypeId, &columnTypeMod, missingOK);
typeName = makeTypeNameFromOid(columnTypeId, columnTypeMod);
/* we then create the column definition */
columnDefinition = makeNode(ColumnDef);
columnDefinition->colname = (char *) columnName;
columnDefinition->typeName = typeName;
columnDefinition->is_local = true;
columnDefinition->is_not_null = false;
columnDefinition->raw_default = NULL;
columnDefinition->cooked_default = NULL;
columnDefinition->constraints = NIL;
columnDefinitionList = lappend(columnDefinitionList, columnDefinition);
}
return columnDefinitionList;
}
/*
* CreateStatement creates and initializes a simple table create statement that
* only has column definitions.
*/
CreateStmt *
CreateStatement(RangeVar *relation, List *columnDefinitionList)
{
CreateStmt *createStatement = makeNode(CreateStmt);
createStatement->relation = relation;
createStatement->tableElts = columnDefinitionList;
createStatement->inhRelations = NIL;
createStatement->constraints = NIL;
createStatement->options = NIL;
createStatement->oncommit = ONCOMMIT_NOOP;
createStatement->tablespacename = NULL;
createStatement->if_not_exists = false;
return createStatement;
}
/*
* CopyTaskFilesFromDirectory finds all files in the given directory, except for
* those having an attempt suffix. The function then copies these files into the
* database table identified by the given schema and table name.
*/
static void
CopyTaskFilesFromDirectory(StringInfo schemaName, StringInfo relationName,
StringInfo sourceDirectoryName)
{
const char *directoryName = sourceDirectoryName->data;
struct dirent *directoryEntry = NULL;
uint64 copiedRowTotal = 0;
DIR *directory = AllocateDir(directoryName);
if (directory == NULL)
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", directoryName)));
}
directoryEntry = ReadDir(directory, directoryName);
for (; directoryEntry != NULL; directoryEntry = ReadDir(directory, directoryName))
{
const char *baseFilename = directoryEntry->d_name;
const char *queryString = NULL;
StringInfo fullFilename = NULL;
RangeVar *relation = NULL;
CopyStmt *copyStatement = NULL;
uint64 copiedRowCount = 0;
/* if system file or lingering task file, skip it */
if (strncmp(baseFilename, ".", MAXPGPATH) == 0 ||
strncmp(baseFilename, "..", MAXPGPATH) == 0 ||
strstr(baseFilename, ATTEMPT_FILE_SUFFIX) != NULL)
{
continue;
}
fullFilename = makeStringInfo();
appendStringInfo(fullFilename, "%s/%s", directoryName, baseFilename);
/* build relation object and copy statement */
relation = makeRangeVar(schemaName->data, relationName->data, -1);
copyStatement = CopyStatement(relation, fullFilename->data);
if (BinaryWorkerCopyFormat)
{
DefElem *copyOption = makeDefElem("format", (Node *) makeString("binary"));
copyStatement->options = list_make1(copyOption);
}
DoCopy(copyStatement, queryString, &copiedRowCount);
copiedRowTotal += copiedRowCount;
CommandCounterIncrement();
}
ereport(DEBUG2, (errmsg("copied " UINT64_FORMAT " rows into table: \"%s.%s\"",
copiedRowTotal, schemaName->data, relationName->data)));
FreeDir(directory);
}
/*
* CopyStatement creates and initializes a copy statement to read the given
* file's contents into the given table, using copy's standard text format.
*/
CopyStmt *
CopyStatement(RangeVar *relation, char *sourceFilename)
{
CopyStmt *copyStatement = makeNode(CopyStmt);
copyStatement->relation = relation;
copyStatement->query = NULL;
copyStatement->attlist = NIL;
copyStatement->options = NIL;
copyStatement->is_from = true;
copyStatement->is_program = false;
copyStatement->filename = sourceFilename;
return copyStatement;
}
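/*
 * Illustration only (not part of the original source): a sketch of how
 * CopyStatement() feeds into DoCopy(), mirroring the loop in
 * CopyTaskFilesFromDirectory. The table name, file path, and csv format
 * option are made up for the example.
 */
static uint64
ExampleCopyFile(void)
{
	uint64 copiedRowCount = 0;
	RangeVar *relation = makeRangeVar("public", "example_table", -1);
	CopyStmt *copyStatement = CopyStatement(relation, "/tmp/example.csv");
	DefElem *copyOption = makeDefElem("format", (Node *) makeString("csv"));

	copyStatement->options = list_make1(copyOption);
	DoCopy(copyStatement, NULL, &copiedRowCount);

	return copiedRowCount;
}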

File diff suppressed because it is too large

2
src/bin/csql/.gitignore vendored Normal file

@ -0,0 +1,2 @@
/psqlscan.c
/csql

40
src/bin/csql/Makefile Normal file

@ -0,0 +1,40 @@
#-------------------------------------------------------------------------
#
# Makefile for src/bin/csql
#
# Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
# src/bin/csql/Makefile
#
#-------------------------------------------------------------------------
citusdb_subdir = src/bin/csql
citusdb_top_builddir = ../../..
PROGRAM = csql
PGFILEDESC = "csql - the CitusDB interactive terminal"
PGAPPICON=win32
OBJS = command.o common.o help.o input.o stringutils.o mainloop.o copy.o \
copy_options.o stage.o \
startup.o prompt.o variables.o large_obj.o print.o describe.o \
tab-complete.o mbprint.o dumputils.o keywords.o kwlookup.o \
sql_help.o \
$(WIN32RES)
PG_LIBS = $(libpq)
include $(citusdb_top_builddir)/Makefile.global
override CPPFLAGS += -I$(libpq_srcdir) -I$(top_srcdir)/src/bin/csql
# psqlscan is compiled as part of mainloop
mainloop.o: psqlscan.c
psqlscan.c: FLEXFLAGS = -Cfe -p -p
psqlscan.c: FLEX_NO_BACKUP=yes
clean: csql-clean
csql-clean:
rm -f csql$(X) $(OBJS) psqlscan.c lex.backup

3282
src/bin/csql/command.c Normal file

File diff suppressed because it is too large

43
src/bin/csql/command.h Normal file

@ -0,0 +1,43 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/command.h
*/
#ifndef COMMAND_H
#define COMMAND_H
#include "print.h"
#include "psqlscan.h"
typedef enum _backslashResult
{
PSQL_CMD_UNKNOWN = 0, /* not done parsing yet (internal only) */
PSQL_CMD_SEND, /* query complete; send off */
PSQL_CMD_SKIP_LINE, /* keep building query */
PSQL_CMD_TERMINATE, /* quit program */
PSQL_CMD_NEWEDIT, /* query buffer was changed (e.g., via \e) */
PSQL_CMD_ERROR /* the execution of the backslash command
* resulted in an error */
} backslashResult;
extern backslashResult HandleSlashCmds(PsqlScanState scan_state,
PQExpBuffer query_buf);
extern int process_file(char *filename, bool single_txn, bool use_relative_path);
extern bool do_pset(const char *param,
const char *value,
printQueryOpt *popt,
bool quiet);
extern void connection_warnings(bool in_startup);
extern void SyncVariables(void);
extern void UnsyncVariables(void);
#endif /* COMMAND_H */

1903
src/bin/csql/common.c Normal file

File diff suppressed because it is too large

59
src/bin/csql/common.h Normal file

@ -0,0 +1,59 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/common.h
*/
#ifndef COMMON_H
#define COMMON_H
#include "postgres_fe.h"
#include <setjmp.h>
#include "libpq-fe.h"
#include "print.h"
#define atooid(x) ((Oid) strtoul((x), NULL, 10))
extern bool openQueryOutputFile(const char *fname, FILE **fout, bool *is_pipe);
extern bool setQFout(const char *fname);
#if (PG_VERSION_NUM >= 90500)
extern void psql_error(const char *fmt,...) pg_attribute_printf(1, 2);
#else
extern void
psql_error(const char *fmt,...)
/* This lets gcc check the format string for consistency. */
__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
#endif
extern void NoticeProcessor(void *arg, const char *message);
extern volatile bool sigint_interrupt_enabled;
extern sigjmp_buf sigint_interrupt_jmp;
extern volatile bool cancel_pressed;
/* Note: cancel_pressed is defined in print.c, see that file for reasons */
extern void setup_cancel_handler(void);
extern void SetCancelConn(void);
extern void ResetCancelConn(void);
extern PGresult *PSQLexec(const char *query);
extern int PSQLexecWatch(const char *query, const printQueryOpt *opt);
extern bool SendQuery(const char *query);
extern bool is_superuser(void);
extern bool standard_strings(void);
extern const char *session_username(void);
extern void expand_tilde(char **filename);
extern bool recognized_connection_string(const char *connstr);
#endif /* COMMON_H */

595
src/bin/csql/copy.c Normal file

@ -0,0 +1,595 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/copy.c
*/
#include "postgres_fe.h"
#include "copy.h"
#include <signal.h>
#include <sys/stat.h>
#ifndef WIN32
#include <unistd.h> /* for isatty */
#else
#include <io.h> /* I think */
#endif
#include "libpq-fe.h"
#include "pqexpbuffer.h"
#include "dumputils.h"
#include "settings.h"
#include "common.h"
#include "prompt.h"
/*
* Execute a \copy command (frontend copy). We have to open a file (or execute
* a command), then submit a COPY query to the backend and either feed it data
* from the file or route its response into the file.
*/
bool
do_copy(const char *args)
{
copy_options *options = NULL;
PQExpBufferData query = { NULL, 0, 0 };
FILE *copystream = NULL;
bool success = false;
bool fileClosed = false;
/* parse options */
options = parse_slash_copy(args);
if (!options)
return false;
/* open file stream to copy data into or out of */
copystream = OpenCopyStream(options);
if (copystream == NULL)
{
free_copy_options(options);
return false;
}
/* build the command we will send to the backend */
initPQExpBuffer(&query);
printfPQExpBuffer(&query, "COPY ");
appendPQExpBufferStr(&query, options->before_tofrom);
if (options->from)
appendPQExpBufferStr(&query, " FROM STDIN ");
else
appendPQExpBufferStr(&query, " TO STDOUT ");
if (options->after_tofrom)
appendPQExpBufferStr(&query, options->after_tofrom);
/* run it like a user command, but with copystream as data source/sink */
pset.copyStream = copystream;
success = SendQuery(query.data);
pset.copyStream = NULL;
termPQExpBuffer(&query);
/* close file stream */
fileClosed = CloseCopyStream(options, copystream);
if (!fileClosed)
{
success = false;
}
free_copy_options(options);
return success;
}
/*
* HandleCopyData executes client-side copy data protocols by dispatching the
* call to the appropriate copy protocol function. On successful execution of
* the protocol, the function returns true. Otherwise, the function returns
* false.
*
* Please note that we refactored this function from a previous version (v9.1)
* of PostgreSQL so that copy.c and stage.c could share the same code path. Now
* that do_copy uses SendQuery(), we should move or re-refactor this function.
*/
bool
HandleCopyData(PGconn *connection, ExecStatusType copyStatus, bool copyIsBinary,
FILE *copyStream, uint64 copySizeLimit)
{
ExecStatusType drainStatus = 0;
PGresult *drainResult = NULL;
bool copyOK = true;
if (copyStatus == PGRES_COPY_OUT)
{
SetCancelConn();
copyOK = handleCopyOut(connection, copyStream, &drainResult);
ResetCancelConn();
}
else if (copyStatus == PGRES_COPY_IN)
{
SetCancelConn();
copyOK = handleCopyIn(connection, copyStream, copyIsBinary,
&drainResult, copySizeLimit);
ResetCancelConn();
}
else if (copyStatus == PGRES_BAD_RESPONSE ||
copyStatus == PGRES_NONFATAL_ERROR ||
copyStatus == PGRES_FATAL_ERROR)
{
psql_error("\\copy: %s", PQerrorMessage(connection));
copyOK = false;
}
else
{
psql_error("\\copy: unexpected response (%d)\n", copyStatus);
copyOK = false;
}
PQclear(drainResult);
/*
* Make sure we drain all results from libpq. Otherwise, the connection may
* still be in ASYNC_BUSY state, leading to false readings in get_prompt().
*/
drainResult = PQgetResult(connection);
while (drainResult != NULL)
{
copyOK = false;
drainStatus = PQresultStatus(drainResult);
psql_error("\\copy: unexpected response (%d)\n", drainStatus);
/* if we are still in COPY IN state, try to get out of it */
if (drainStatus == PGRES_COPY_IN)
{
PQputCopyEnd(connection, _("trying to exit copy mode"));
}
PQclear(drainResult);
drainResult = PQgetResult(connection);
}
return copyOK;
}
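/*
 * Illustration only (not part of the original source): a hedged sketch of
 * a HandleCopyData caller. Issue COPY over libpq, then hand the resulting
 * status off; HandleCopyData reports errors and drains the connection
 * itself. The table name is made up.
 */
static bool
ExampleClientCopy(PGconn *connection, FILE *dataStream)
{
	PGresult *result = PQexec(connection, "COPY example_table FROM STDIN");
	ExecStatusType copyStatus = PQresultStatus(result);
	bool copyOK = false;

	/* text format, no size limit */
	copyOK = HandleCopyData(connection, copyStatus, false, dataStream, 0);

	PQclear(result);
	return copyOK;
}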
/* Opens input or output stream to be used during copy command. */
FILE *
OpenCopyStream(const copy_options *options)
{
FILE *copyStream = NULL;
/* prepare to read or write the target file */
if (options->file && !options->program)
canonicalize_path(options->file);
if (options->from)
{
if (options->file)
{
if (options->program)
{
fflush(stdout);
fflush(stderr);
errno = 0;
copyStream = popen(options->file, PG_BINARY_R);
}
else
copyStream = fopen(options->file, PG_BINARY_R);
}
else if (!options->psql_inout)
copyStream = pset.cur_cmd_source;
else
copyStream = stdin;
}
else
{
if (options->file)
{
if (options->program)
{
fflush(stdout);
fflush(stderr);
errno = 0;
#ifndef WIN32
pqsignal(SIGPIPE, SIG_IGN);
#endif
copyStream = popen(options->file, PG_BINARY_W);
}
else
copyStream = fopen(options->file, PG_BINARY_W);
}
else if (!options->psql_inout)
copyStream = pset.queryFout;
else
copyStream = stdout;
}
if (!copyStream)
{
if (options->program)
psql_error("could not execute command \"%s\": %s\n",
options->file, strerror(errno));
else
psql_error("%s: %s\n",
options->file, strerror(errno));
return NULL;
}
if (!options->program)
{
struct stat st;
int result;
/* make sure the specified file is not a directory */
if ((result = fstat(fileno(copyStream), &st)) < 0)
psql_error("could not stat file \"%s\": %s\n",
options->file, strerror(errno));
if (result == 0 && S_ISDIR(st.st_mode))
psql_error("%s: cannot copy from/to a directory\n",
options->file);
if (result < 0 || S_ISDIR(st.st_mode))
{
fclose(copyStream);
return NULL;
}
}
return copyStream;
}
/* Closes file stream used during copy command, if any. */
bool
CloseCopyStream(const copy_options *options, FILE *copyStream)
{
bool success = true;
if (options->file != NULL)
{
if (options->program)
{
int pclose_rc = pclose(copyStream);
if (pclose_rc != 0)
{
if (pclose_rc < 0)
psql_error("could not close pipe to external command: %s\n",
strerror(errno));
else
{
char *reason = wait_result_to_str(pclose_rc);
psql_error("%s: %s\n", options->file,
reason ? reason : "");
if (reason)
free(reason);
}
success = false;
}
#ifndef WIN32
pqsignal(SIGPIPE, SIG_DFL);
#endif
}
else
{
if (fclose(copyStream) != 0)
{
psql_error("%s: %s\n", options->file, strerror(errno));
success = false;
}
}
}
return success;
}
/*
* Functions for handling COPY IN/OUT data transfer.
*
* If you want to use COPY TO STDOUT/FROM STDIN in your application,
* this is the code to steal ;)
*/
/*
* handleCopyOut
* receives data as a result of a COPY ... TO STDOUT command
*
* conn should be a database connection that you just issued COPY TO on
* and got back a PGRES_COPY_OUT result.
* copystream is the file stream for the data to go to.
* The final status for the COPY is returned into *res (but note
* we already reported the error, if it's not a success result).
*
* result is true if successful, false if not.
*/
bool
handleCopyOut(PGconn *conn, FILE *copystream, PGresult **res)
{
bool OK = true;
char *buf;
int ret;
for (;;)
{
ret = PQgetCopyData(conn, &buf, 0);
if (ret < 0)
break; /* done or server/connection error */
if (buf)
{
if (OK && fwrite(buf, 1, ret, copystream) != ret)
{
psql_error("could not write COPY data: %s\n",
strerror(errno));
/* complain only once, keep reading data from server */
OK = false;
}
PQfreemem(buf);
}
}
if (OK && fflush(copystream))
{
psql_error("could not write COPY data: %s\n",
strerror(errno));
OK = false;
}
if (ret == -2)
{
psql_error("COPY data transfer failed: %s", PQerrorMessage(conn));
OK = false;
}
/*
* Check command status and return to normal libpq state.
*
* If for some reason libpq is still reporting PGRES_COPY_OUT state, we
* would like to forcibly exit that state, since our caller would be
* unable to distinguish that situation from reaching the next COPY in a
* command string that happened to contain two consecutive COPY TO STDOUT
* commands. However, libpq provides no API for doing that, and in
* principle it's a libpq bug anyway if PQgetCopyData() returns -1 or -2
* but hasn't exited COPY_OUT state internally. So we ignore the
* possibility here.
*/
*res = PQgetResult(conn);
if (PQresultStatus(*res) != PGRES_COMMAND_OK)
{
psql_error("%s", PQerrorMessage(conn));
OK = false;
}
return OK;
}
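/*
 * Illustration only (not part of the original source): following the
 * "code to steal" note above, the minimal COPY OUT calling sequence looks
 * roughly like this. The connection is assumed established and the table
 * name is made up.
 */
static bool
ExampleCopyToStdout(PGconn *conn)
{
	PGresult *res = PQexec(conn, "COPY example_table TO STDOUT");
	bool OK = false;

	if (PQresultStatus(res) == PGRES_COPY_OUT)
	{
		PGresult *finalResult = NULL;

		OK = handleCopyOut(conn, stdout, &finalResult);
		PQclear(finalResult);
	}

	PQclear(res);
	return OK;
}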
/*
* handleCopyIn
* sends data to complete a COPY ... FROM STDIN command
*
* conn should be a database connection that you just issued COPY FROM on
* and got back a PGRES_COPY_IN result.
* copystream is the file stream to read the data from.
* isbinary can be set from PQbinaryTuples().
* The final status for the COPY is returned into *res (but note
* we already reported the error, if it's not a success result).
*
* result is true if successful, false if not.
*/
/* read chunk size for COPY IN - size set to double that of Hadoop's default */
#define COPYBUFSIZ 32768
bool
handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary,
PGresult **res, uint64 copySizeLimit)
{
bool OK;
const char *prompt;
char buf[COPYBUFSIZ];
uint64 bytesCopied = 0;
/*
* Establish longjmp destination for exiting from wait-for-input. (This is
* only effective while sigint_interrupt_enabled is TRUE.)
*/
if (sigsetjmp(sigint_interrupt_jmp, 1) != 0)
{
/* got here with longjmp */
/* Terminate data transfer */
PQputCopyEnd(conn,
(PQprotocolVersion(conn) < 3) ? NULL :
_("canceled by user"));
OK = false;
goto copyin_cleanup;
}
/* Prompt if interactive input */
if (isatty(fileno(copystream)))
{
if (!pset.quiet)
puts(_("Enter data to be copied followed by a newline.\n"
"End with a backslash and a period on a line by itself."));
prompt = get_prompt(PROMPT_COPY);
}
else
prompt = NULL;
OK = true;
if (isbinary)
{
/* interactive input probably silly, but give one prompt anyway */
if (prompt)
{
fputs(prompt, stdout);
fflush(stdout);
}
for (;;)
{
int buflen;
/* enable longjmp while waiting for input */
sigint_interrupt_enabled = true;
buflen = fread(buf, 1, COPYBUFSIZ, copystream);
sigint_interrupt_enabled = false;
if (buflen <= 0)
break;
if (PQputCopyData(conn, buf, buflen) <= 0)
{
OK = false;
break;
}
/* if size limit is set, copy at most that many bytes */
bytesCopied += buflen;
if (copySizeLimit > 0 && bytesCopied >= copySizeLimit)
{
break;
}
}
}
else
{
bool copydone = false;
while (!copydone)
{ /* for each input line ... */
bool firstload;
bool linedone;
if (prompt)
{
fputs(prompt, stdout);
fflush(stdout);
}
firstload = true;
linedone = false;
while (!linedone)
{ /* for each bufferload in line ... */
int linelen = 0;
char *fgresult;
/* enable longjmp while waiting for input */
sigint_interrupt_enabled = true;
fgresult = fgets(buf, sizeof(buf), copystream);
sigint_interrupt_enabled = false;
if (!fgresult)
{
copydone = true;
break;
}
linelen = strlen(buf);
/* current line is done? */
if (linelen > 0 && buf[linelen - 1] == '\n')
linedone = true;
/* check for EOF marker, but not on a partial line */
if (firstload)
{
/*
* This code erroneously assumes '\.' on a line alone
* inside a quoted CSV string terminates the \copy.
* http://www.postgresql.org/message-id/E1TdNVQ-0001ju-GO@wrigleys.postgresql.org
*/
if (strcmp(buf, "\\.\n") == 0 ||
strcmp(buf, "\\.\r\n") == 0)
{
copydone = true;
break;
}
firstload = false;
}
if (PQputCopyData(conn, buf, linelen) <= 0)
{
OK = false;
copydone = true;
break;
}
else
{
bytesCopied += linelen;
}
}
if (copystream == pset.cur_cmd_source)
pset.lineno++;
/* if size limit is set, copy at most that many bytes */
if (copySizeLimit > 0 && bytesCopied >= copySizeLimit)
{
break;
}
}
}
/* Check for read error */
if (ferror(copystream))
OK = false;
/*
* Terminate data transfer. We can't send an error message if we're using
* protocol version 2.
*/
if (PQputCopyEnd(conn,
(OK || PQprotocolVersion(conn) < 3) ? NULL :
_("aborted because of read failure")) <= 0)
OK = false;
copyin_cleanup:
/*
* Check command status and return to normal libpq state.
*
* We do not want to return with the status still PGRES_COPY_IN: our
* caller would be unable to distinguish that situation from reaching the
* next COPY in a command string that happened to contain two consecutive
* COPY FROM STDIN commands. We keep trying PQputCopyEnd() in the hope
* it'll work eventually. (What's actually likely to happen is that in
* attempting to flush the data, libpq will eventually realize that the
* connection is lost. But that's fine; it will get us out of COPY_IN
* state, which is what we need.)
*/
while (*res = PQgetResult(conn), PQresultStatus(*res) == PGRES_COPY_IN)
{
OK = false;
PQclear(*res);
/* We can't send an error message if we're using protocol version 2 */
PQputCopyEnd(conn,
(PQprotocolVersion(conn) < 3) ? NULL :
_("trying to exit copy mode"));
}
if (PQresultStatus(*res) != PGRES_COMMAND_OK)
{
psql_error("%s", PQerrorMessage(conn));
OK = false;
}
return OK;
}
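/*
 * Illustration only (not part of the original source): stripped of
 * prompting, cancellation, and line splitting, the protocol steps that
 * handleCopyIn performs reduce to this sequence for a single in-memory
 * buffer. Real callers should go through handleCopyIn.
 */
static bool
ExampleCopyInBuffer(PGconn *conn, const char *data, int length)
{
	PGresult *res = NULL;
	bool OK = (PQputCopyData(conn, data, length) > 0 &&
			   PQputCopyEnd(conn, NULL) > 0);

	/* drain results so the connection leaves COPY_IN state */
	while ((res = PQgetResult(conn)) != NULL)
	{
		if (PQresultStatus(res) != PGRES_COMMAND_OK)
			OK = false;
		PQclear(res);
	}

	return OK;
}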

33
src/bin/csql/copy.h Normal file
View File

@ -0,0 +1,33 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/copy.h
*/
#ifndef COPY_H
#define COPY_H
#include "libpq-fe.h"
#include "copy_options.h"
#include "pqexpbuffer.h"
/* handler for \copy */
extern bool do_copy(const char *args);
/* lower level processors for copy in/out streams */
extern bool handleCopyOut(PGconn *conn, FILE *copystream,
PGresult **res);
extern bool handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary,
PGresult **res, uint64 copySizeLimit);
/* Function declarations shared between copy and stage commands */
bool HandleCopyData(PGconn *connection, ExecStatusType copyStatus,
bool copyIsBinary, FILE *copyStream, uint64 copySizeLimit);
FILE * OpenCopyStream(const copy_options *options);
bool CloseCopyStream(const copy_options *options, FILE *copyStream);
#endif

312
src/bin/csql/copy_options.c Normal file

@ -0,0 +1,312 @@
/*
* csql - the CitusDB interactive terminal
* copy_options.c
* Routines for parsing copy and stage meta commands.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*/
#include "postgres_fe.h"
#include "copy_options.h"
#include "common.h"
#include "settings.h"
#include "stringutils.h"
/* Concatenates "more" onto "var", and frees the original value of *var. */
static void
xstrcat(char **var, const char *more)
{
char *newvar;
newvar = psprintf("%s%s", *var, more);
free(*var);
*var = newvar;
}
/*
* parse_slash_copy parses copy options from the given meta-command line. The
* function then returns a dynamically allocated structure with the options, or
* NULL on parsing error.
*/
copy_options *
parse_slash_copy(const char *args)
{
struct copy_options *result;
char *token;
const char *whitespace = " \t\n\r";
char nonstd_backslash = standard_strings() ? 0 : '\\';
if (!args)
{
psql_error("\\copy: arguments required\n");
return NULL;
}
result = pg_malloc0(sizeof(struct copy_options));
result->before_tofrom = pg_strdup(""); /* initialize for appending */
token = strtokx(args, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
/* The following can be removed when we drop 7.3 syntax support */
if (pg_strcasecmp(token, "binary") == 0)
{
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
}
/* Handle COPY (SELECT) case */
if (token[0] == '(')
{
int parens = 1;
while (parens > 0)
{
xstrcat(&result->before_tofrom, " ");
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, "()", "\"'",
nonstd_backslash, true, false, pset.encoding);
if (!token)
goto error;
if (token[0] == '(')
parens++;
else if (token[0] == ')')
parens--;
}
}
xstrcat(&result->before_tofrom, " ");
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
/*
* strtokx() will not have returned a multi-character token starting with
* '.', so we don't need strcmp() here. Likewise for '(', etc, below.
*/
if (token[0] == '.')
{
/* handle schema . table */
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
}
if (token[0] == '(')
{
/* handle parenthesized column list */
for (;;)
{
xstrcat(&result->before_tofrom, " ");
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, "()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
if (token[0] == ')')
break;
}
xstrcat(&result->before_tofrom, " ");
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
}
if (pg_strcasecmp(token, "from") == 0)
result->from = true;
else if (pg_strcasecmp(token, "to") == 0)
result->from = false;
else
goto error;
/* { 'filename' | PROGRAM 'command' | STDIN | STDOUT | PSTDIN | PSTDOUT } */
token = strtokx(NULL, whitespace, ";", "'",
0, false, false, pset.encoding);
if (!token)
goto error;
if (pg_strcasecmp(token, "program") == 0)
{
int toklen;
token = strtokx(NULL, whitespace, ";", "'",
0, false, false, pset.encoding);
if (!token)
goto error;
/*
* The shell command must be quoted. This isn't fool-proof, but
* catches most quoting errors.
*/
toklen = strlen(token);
if (token[0] != '\'' || toklen < 2 || token[toklen - 1] != '\'')
goto error;
strip_quotes(token, '\'', 0, pset.encoding);
result->program = true;
result->file = pg_strdup(token);
}
else if (pg_strcasecmp(token, "stdin") == 0 ||
pg_strcasecmp(token, "stdout") == 0)
{
result->file = NULL;
}
else if (pg_strcasecmp(token, "pstdin") == 0 ||
pg_strcasecmp(token, "pstdout") == 0)
{
result->psql_inout = true;
result->file = NULL;
}
else
{
/* filename can be optionally quoted */
strip_quotes(token, '\'', 0, pset.encoding);
result->file = pg_strdup(token);
expand_tilde(&result->file);
}
/* Collect the rest of the line (COPY options) */
token = strtokx(NULL, "", NULL, NULL,
0, false, false, pset.encoding);
if (token)
result->after_tofrom = pg_strdup(token);
/* set data staging options to null */
result->tableName = NULL;
result->columnList = NULL;
return result;
error:
if (token)
psql_error("\\copy: parse error at \"%s\"\n", token);
else
psql_error("\\copy: parse error at end of line\n");
free_copy_options(result);
return NULL;
}
/* Frees copy options. */
void
free_copy_options(copy_options * ptr)
{
if (!ptr)
return;
free(ptr->before_tofrom);
free(ptr->after_tofrom);
free(ptr->file);
free(ptr->tableName);
free(ptr->columnList);
free(ptr);
}
/*
* ParseStageOptions takes the given copy options, parses the additional options
* needed for the \stage command, and sets them in the copy options structure.
* The additional parsed options are the table name and the column list.
*/
copy_options *
ParseStageOptions(copy_options *copyOptions)
{
copy_options *stageOptions = NULL;
const char *whitespace = " \t\n\r";
char *tableName = NULL;
char *columnList = NULL;
char *token = NULL;
const char *beforeToFrom = copyOptions->before_tofrom;
Assert(beforeToFrom != NULL);
token = strtokx(beforeToFrom, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
/*
* We should have errored out earlier if the token were null. Similarly, we
* should have errored out on the "\stage (select) to" case.
*/
Assert(token != NULL);
Assert(token[0] != '(');
/* we do not support PostgreSQL's 7.3 syntax */
if (pg_strcasecmp(token, "binary") == 0)
{
psql_error("\\stage: binary keyword before to/from is not supported\n");
Assert(false);
}
/* init table name and append either the table name or schema name */
tableName = pg_strdup("");
xstrcat(&tableName, token);
/* check for the schema.table use case */
token = strtokx(NULL, whitespace, ".,()", "\"", 0, false, false, pset.encoding);
if (token != NULL && token[0] == '.')
{
/* append the dot token */
xstrcat(&tableName, token);
token = strtokx(NULL, whitespace, ".,()", "\"", 0, false, false, pset.encoding);
Assert(token != NULL);
/* append the table name token */
xstrcat(&tableName, token);
token = strtokx(NULL, whitespace, ".,()", "\"", 0, false, false, pset.encoding);
}
/* check for the column list use case */
if (token != NULL && token[0] == '(')
{
/* init column list, and add columns */
columnList = pg_strdup("");
for (;;)
{
xstrcat(&columnList, " ");
xstrcat(&columnList, token);
token = strtokx(NULL, whitespace, "()", "\"", 0, false, false, pset.encoding);
Assert(token != NULL);
if (token[0] == ')')
{
break;
}
}
xstrcat(&columnList, " ");
xstrcat(&columnList, token);
}
/* finally set additional stage options */
stageOptions = copyOptions;
stageOptions->tableName = tableName;
stageOptions->columnList = columnList;
return stageOptions;
}
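/*
 * Illustration only (not part of the original source): a usage sketch for
 * the \stage parsing path. The command text and the printf reporting are
 * made up; a caller parses the copy-style portion first, then layers the
 * stage-specific fields on top.
 */
static void
ExampleParseStage(void)
{
	copy_options *options =
		parse_slash_copy("customers (id, name) from 'data.csv'");

	if (options != NULL)
	{
		options = ParseStageOptions(options);
		printf("staging into %s, columns:%s\n", options->tableName,
			   options->columnList ? options->columnList : " all");
		free_copy_options(options);
	}
}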

60
src/bin/csql/copy_options.h Normal file

@ -0,0 +1,60 @@
/*
* csql - the CitusDB interactive terminal
* copy_options.h
* Shared declarations for parsing copy and stage meta-commands. The stage
* meta-command borrows from copy's syntax, but does not yet support
* outputting table data to a file. Further, the stage command reuses copy's
* declarations to maintain compatibility with the copy command.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*/
#ifndef COPY_OPTIONS_H
#define COPY_OPTIONS_H
#include "libpq-fe.h"
/*
* The documented syntax is:
* \copy tablename [(columnlist)] from|to filename [options]
* \copy ( select stmt ) to filename [options]
*
* where 'filename' can be one of the following:
* '<file path>' | PROGRAM '<command>' | stdin | stdout | pstdin | pstdout
*
* An undocumented fact is that you can still write BINARY before the
* tablename; this is a hangover from the pre-7.3 syntax. The options
* syntax varies across backend versions, but we avoid all that mess
* by just transmitting the stuff after the filename literally.
*
* table name can be double-quoted and can have a schema part.
* column names can be double-quoted.
* filename can be single-quoted like SQL literals.
* command must be single-quoted like SQL literals.
*
* returns a malloc'ed structure with the options, or NULL on parsing error
*/
typedef struct copy_options
{
char *before_tofrom; /* COPY string before TO/FROM */
char *after_tofrom; /* COPY string after TO/FROM filename */
char *file; /* NULL = stdin/stdout */
bool program; /* is 'file' a program to popen? */
bool psql_inout; /* true = use psql stdin/stdout */
bool from; /* true = FROM, false = TO */
char *tableName; /* table name to stage data to */
char *columnList; /* optional column list used in staging */
} copy_options;
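/*
 * Example only (added for illustration): for the meta-command
 *     \copy customers (id, name) from '/tmp/data.csv' with csv
 * parse_slash_copy() fills this structure roughly as follows (token
 * spacing comes from the parser's re-concatenation):
 *     before_tofrom = " customers ( id , name )"
 *     file          = "/tmp/data.csv"
 *     after_tofrom  = "with csv"
 *     from          = true, program = false, psql_inout = false
 * tableName and columnList stay NULL until ParseStageOptions() runs.
 */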
/* Function declarations for parsing and freeing copy options */
copy_options * parse_slash_copy(const char *args);
void free_copy_options(copy_options * ptr);
copy_options * ParseStageOptions(copy_options *copyOptions);
#endif /* COPY_OPTIONS_H */

214
src/bin/csql/create_help.pl Normal file

@ -0,0 +1,214 @@
#! /usr/bin/perl -w
#################################################################
# create_help.pl -- converts SGML docs to internal psql help
#
# Copyright (c) 2000-2015, PostgreSQL Global Development Group
#
# src/bin/psql/create_help.pl
#################################################################
#
# This script automatically generates the help on SQL in psql from
# the SGML docs. So far the format of the docs was consistent
# enough that this worked, but this here is by no means an SGML
# parser.
#
# Call: perl create_help.pl docdir sql_help
# The name of the header file doesn't matter to this script, but it
# sure does matter to the rest of the source.
#
use strict;
my $docdir = $ARGV[0] or die "$0: missing required argument: docdir\n";
my $hfile = $ARGV[1] . '.h'
or die "$0: missing required argument: output file\n";
my $cfile = $ARGV[1] . '.c';
my $hfilebasename;
if ($hfile =~ m!.*/([^/]+)$!)
{
$hfilebasename = $1;
}
else
{
$hfilebasename = $hfile;
}
my $define = $hfilebasename;
$define =~ tr/a-z/A-Z/;
$define =~ s/\W/_/g;
opendir(DIR, $docdir)
or die "$0: could not open documentation source dir '$docdir': $!\n";
open(HFILE, ">$hfile")
or die "$0: could not open output file '$hfile': $!\n";
open(CFILE, ">$cfile")
or die "$0: could not open output file '$cfile': $!\n";
print HFILE "/*
* *** Do not change this file by hand. It is automatically
* *** generated from the DocBook documentation.
*
* generated by
* $^X $0 @ARGV
*
*/
#ifndef $define
#define $define
#define N_(x) (x) /* gettext noop */
#include \"postgres_fe.h\"
#include \"pqexpbuffer.h\"
struct _helpStruct
{
const char *cmd; /* the command name */
const char *help; /* the help associated with it */
void (*syntaxfunc)(PQExpBuffer); /* function that prints the syntax associated with it */
int nl_count; /* number of newlines in syntax (for pager) */
};
";
print CFILE "/*
* *** Do not change this file by hand. It is automatically
* *** generated from the DocBook documentation.
*
* generated by
* $^X $0 @ARGV
*
*/
#include \"$hfile\"
";
my $maxlen = 0;
my %entries;
foreach my $file (sort readdir DIR)
{
my (@cmdnames, $cmddesc, $cmdsynopsis);
$file =~ /\.sgml$/ or next;
open(FILE, "$docdir/$file") or next;
my $filecontent = join('', <FILE>);
close FILE;
# Ignore files that are not for SQL language statements
$filecontent =~
m!<refmiscinfo>\s*SQL - Language Statements\s*</refmiscinfo>!i
or next;
# Collect multiple refnames
LOOP:
{
$filecontent =~ m!\G.*?<refname>\s*([a-z ]+?)\s*</refname>!cgis
and push @cmdnames, $1
and redo LOOP;
}
$filecontent =~ m!<refpurpose>\s*(.+?)\s*</refpurpose>!is
and $cmddesc = $1;
$filecontent =~ m!<synopsis>\s*(.+?)\s*</synopsis>!is
and $cmdsynopsis = $1;
if (@cmdnames && $cmddesc && $cmdsynopsis)
{
s/\"/\\"/g foreach @cmdnames;
$cmddesc =~ s/<[^>]+>//g;
$cmddesc =~ s/\s+/ /g;
$cmddesc =~ s/\"/\\"/g;
my @params = ();
my $nl_count = () = $cmdsynopsis =~ /\n/g;
$cmdsynopsis =~ m!</>!
and die "$0:$file: null end tag not supported in synopsis\n";
$cmdsynopsis =~ s/%/%%/g;
while ($cmdsynopsis =~ m!<(\w+)[^>]*>(.+?)</\1[^>]*>!)
{
my $match = $2;
$match =~ s/<[^>]+>//g;
$match =~ s/%%/%/g;
push @params, $match;
$cmdsynopsis =~ s!<(\w+)[^>]*>.+?</\1[^>]*>!%s!;
}
$cmdsynopsis =~ s/\r?\n/\\n/g;
$cmdsynopsis =~ s/\"/\\"/g;
foreach my $cmdname (@cmdnames)
{
$entries{$cmdname} = {
cmddesc => $cmddesc,
cmdsynopsis => $cmdsynopsis,
params => \@params,
nl_count => $nl_count };
$maxlen =
($maxlen >= length $cmdname) ? $maxlen : length $cmdname;
}
}
else
{
die "$0: parsing file '$file' failed (N='@cmdnames' D='$cmddesc')\n";
}
}
foreach (sort keys %entries)
{
my $prefix = "\t" x 5 . ' ';
my $id = $_;
$id =~ s/ /_/g;
my $synopsis = "\"$entries{$_}{cmdsynopsis}\"";
$synopsis =~ s/\\n/\\n"\n$prefix"/g;
my @args =
("buf", $synopsis, map("_(\"$_\")", @{ $entries{$_}{params} }));
print HFILE "extern void sql_help_$id(PQExpBuffer buf);\n";
print CFILE "void
sql_help_$id(PQExpBuffer buf)
{
\tappendPQExpBuffer(" . join(",\n$prefix", @args) . ");
}
";
}
print HFILE "
static const struct _helpStruct QL_HELP[] = {
";
foreach (sort keys %entries)
{
my $id = $_;
$id =~ s/ /_/g;
print HFILE " { \"$_\",
N_(\"$entries{$_}{cmddesc}\"),
sql_help_$id,
$entries{$_}{nl_count} },
";
}
print HFILE "
{ NULL, NULL, NULL } /* End of list marker */
};
#define QL_HELP_COUNT "
. scalar(keys %entries) . " /* number of help items */
#define QL_MAX_CMD_LEN $maxlen /* largest strlen(cmd) */
#endif /* $define */
";
close CFILE;
close HFILE;
closedir DIR;

4613
src/bin/csql/describe.c Normal file

File diff suppressed because it is too large

102
src/bin/csql/describe.h Normal file

@ -0,0 +1,102 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/describe.h
*/
#ifndef DESCRIBE_H
#define DESCRIBE_H
/* \da */
extern bool describeAggregates(const char *pattern, bool verbose, bool showSystem);
/* \db */
extern bool describeTablespaces(const char *pattern, bool verbose);
/* \df, \dfa, \dfn, \dft, \dfw, etc. */
extern bool describeFunctions(const char *functypes, const char *pattern, bool verbose, bool showSystem);
/* \dT */
extern bool describeTypes(const char *pattern, bool verbose, bool showSystem);
/* \do */
extern bool describeOperators(const char *pattern, bool verbose, bool showSystem);
/* \du, \dg */
extern bool describeRoles(const char *pattern, bool verbose);
/* \drds */
extern bool listDbRoleSettings(const char *pattern1, const char *pattern2);
/* \z (or \dp) */
extern bool permissionsList(const char *pattern);
/* \ddp */
extern bool listDefaultACLs(const char *pattern);
/* \dd */
extern bool objectDescription(const char *pattern, bool showSystem);
/* \d foo */
extern bool describeTableDetails(const char *pattern, bool verbose, bool showSystem);
/* \dF */
extern bool listTSConfigs(const char *pattern, bool verbose);
/* \dFp */
extern bool listTSParsers(const char *pattern, bool verbose);
/* \dFd */
extern bool listTSDictionaries(const char *pattern, bool verbose);
/* \dFt */
extern bool listTSTemplates(const char *pattern, bool verbose);
/* \l */
extern bool listAllDbs(const char *pattern, bool verbose);
/* \dt, \di, \ds, \dS, etc. */
extern bool listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSystem);
/* \dD */
extern bool listDomains(const char *pattern, bool verbose, bool showSystem);
/* \dc */
extern bool listConversions(const char *pattern, bool verbose, bool showSystem);
/* \dC */
extern bool listCasts(const char *pattern, bool verbose);
/* \dO */
extern bool listCollations(const char *pattern, bool verbose, bool showSystem);
/* \dn */
extern bool listSchemas(const char *pattern, bool verbose, bool showSystem);
/* \dew */
extern bool listForeignDataWrappers(const char *pattern, bool verbose);
/* \des */
extern bool listForeignServers(const char *pattern, bool verbose);
/* \deu */
extern bool listUserMappings(const char *pattern, bool verbose);
/* \det */
extern bool listForeignTables(const char *pattern, bool verbose);
/* \dL */
extern bool listLanguages(const char *pattern, bool verbose, bool showSystem);
/* \dx */
extern bool listExtensions(const char *pattern);
/* \dx+ */
extern bool listExtensionContents(const char *pattern);
/* \dy */
extern bool listEventTriggers(const char *pattern, bool verbose);
#endif /* DESCRIBE_H */

1243
src/bin/csql/dumputils.c Normal file

File diff suppressed because it is too large

572
src/bin/csql/help.c Normal file

@ -0,0 +1,572 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/help.c
*/
#include "postgres_fe.h"
#ifndef WIN32
#include <sys/types.h> /* (ditto) */
#include <unistd.h> /* for geteuid() */
#else
#include <win32.h>
#endif
#ifndef WIN32
#include <sys/ioctl.h> /* for ioctl() */
#endif
#ifdef HAVE_TERMIOS_H
#include <termios.h>
#endif
#include "common.h"
#include "common/username.h"
#include "help.h"
#include "input.h"
#include "settings.h"
#include "sql_help.h"
/*
* PLEASE:
* If you change something in this file, also make the same changes
* in the DocBook documentation, file ref/psql-ref.sgml. If you don't
* know how to do it, please find someone who can help you.
*/
/*
* usage
*
* print out command line arguments
*/
#define ON(var) (var ? _("on") : _("off"))
void
usage(unsigned short int pager)
{
const char *env;
const char *user;
char *errstr;
FILE *output;
/* Find default user, in case we need it. */
user = getenv("PGUSER");
if (!user)
{
user = get_user_name(&errstr);
if (!user)
{
psql_error("%s\n", errstr);
exit(EXIT_FAILURE);
}
}
output = PageOutput(59, pager ? &(pset.popt.topt) : NULL);
printf(_("csql is the CitusDB interactive terminal.\n\n"));
fprintf(output, _("Usage:\n"));
printf(_(" csql [OPTION]... [DBNAME [USERNAME]]\n\n"));
fprintf(output, _("General options:\n"));
/* Display default database */
env = getenv("PGDATABASE");
if (!env)
env = user;
fprintf(output, _(" -c, --command=COMMAND run only single command (SQL or internal) and exit\n"));
fprintf(output, _(" -d, --dbname=DBNAME database name to connect to (default: \"%s\")\n"), env);
fprintf(output, _(" -f, --file=FILENAME execute commands from file, then exit\n"));
fprintf(output, _(" -l, --list list available databases, then exit\n"));
fprintf(output, _(" -v, --set=, --variable=NAME=VALUE\n"
" set psql variable NAME to VALUE\n"
" (e.g., -v ON_ERROR_STOP=1)\n"));
fprintf(output, _(" -V, --version output version information, then exit\n"));
fprintf(output, _(" -X, --no-psqlrc do not read startup file (~/.psqlrc)\n"));
fprintf(output, _(" -1 (\"one\"), --single-transaction\n"
" execute as a single transaction (if non-interactive)\n"));
fprintf(output, _(" -?, --help[=options] show this help, then exit\n"));
fprintf(output, _(" --help=commands list backslash commands, then exit\n"));
fprintf(output, _(" --help=variables list special variables, then exit\n"));
fprintf(output, _("\nInput and output options:\n"));
fprintf(output, _(" -a, --echo-all echo all input from script\n"));
fprintf(output, _(" -b, --echo-errors echo failed commands\n"));
fprintf(output, _(" -e, --echo-queries echo commands sent to server\n"));
fprintf(output, _(" -E, --echo-hidden display queries that internal commands generate\n"));
fprintf(output, _(" -L, --log-file=FILENAME send session log to file\n"));
fprintf(output, _(" -n, --no-readline disable enhanced command line editing (readline)\n"));
fprintf(output, _(" -o, --output=FILENAME send query results to file (or |pipe)\n"));
fprintf(output, _(" -q, --quiet run quietly (no messages, only query output)\n"));
fprintf(output, _(" -s, --single-step single-step mode (confirm each query)\n"));
fprintf(output, _(" -S, --single-line single-line mode (end of line terminates SQL command)\n"));
fprintf(output, _("\nOutput format options:\n"));
fprintf(output, _(" -A, --no-align unaligned table output mode\n"));
fprintf(output, _(" -F, --field-separator=STRING\n"
" field separator for unaligned output (default: \"%s\")\n"),
DEFAULT_FIELD_SEP);
fprintf(output, _(" -H, --html HTML table output mode\n"));
fprintf(output, _(" -P, --pset=VAR[=ARG] set printing option VAR to ARG (see \\pset command)\n"));
fprintf(output, _(" -R, --record-separator=STRING\n"
" record separator for unaligned output (default: newline)\n"));
fprintf(output, _(" -t, --tuples-only print rows only\n"));
fprintf(output, _(" -T, --table-attr=TEXT set HTML table tag attributes (e.g., width, border)\n"));
fprintf(output, _(" -x, --expanded turn on expanded table output\n"));
fprintf(output, _(" -z, --field-separator-zero\n"
" set field separator for unaligned output to zero byte\n"));
fprintf(output, _(" -0, --record-separator-zero\n"
" set record separator for unaligned output to zero byte\n"));
fprintf(output, _("\nConnection options:\n"));
/* Display default host */
env = getenv("PGHOST");
fprintf(output, _(" -h, --host=HOSTNAME database server host or socket directory (default: \"%s\")\n"),
env ? env : _("local socket"));
/* Display default port */
env = getenv("PGPORT");
fprintf(output, _(" -p, --port=PORT database server port (default: \"%s\")\n"),
env ? env : DEF_PGPORT_STR);
/* Display default user */
env = getenv("PGUSER");
if (!env)
env = user;
fprintf(output, _(" -U, --username=USERNAME database user name (default: \"%s\")\n"), env);
fprintf(output, _(" -w, --no-password never prompt for password\n"));
fprintf(output, _(" -W, --password force password prompt (should happen automatically)\n"));
fprintf(output, _("\nFor more information, type \"\\?\" (for internal commands) or \"\\help\" (for SQL\n"
"commands) from within psql, or consult the psql section in the PostgreSQL\n"
"documentation.\n\n"));
fprintf(output, _("Report bugs to <pgsql-bugs@postgresql.org>.\n"));
ClosePager(output);
}
/*
* slashUsage
*
* print out help for the backslash commands
*/
void
slashUsage(unsigned short int pager)
{
FILE *output;
char *currdb;
currdb = PQdb(pset.db);
output = PageOutput(103, pager ? &(pset.popt.topt) : NULL);
/* if you add/remove a line here, change the row count above */
fprintf(output, _("General\n"));
fprintf(output, _(" \\copyright show PostgreSQL usage and distribution terms\n"));
fprintf(output, _(" \\g [FILE] or ; execute query (and send results to file or |pipe)\n"));
fprintf(output, _(" \\gset [PREFIX] execute query and store results in psql variables\n"));
fprintf(output, _(" \\q quit psql\n"));
fprintf(output, _(" \\watch [SEC] execute query every SEC seconds\n"));
fprintf(output, "\n");
fprintf(output, _("Help\n"));
fprintf(output, _(" \\? [commands] show help on backslash commands\n"));
fprintf(output, _(" \\? options show help on psql command-line options\n"));
fprintf(output, _(" \\? variables show help on special variables\n"));
fprintf(output, _(" \\h [NAME] help on syntax of SQL commands, * for all commands\n"));
fprintf(output, "\n");
fprintf(output, _("Query Buffer\n"));
fprintf(output, _(" \\e [FILE] [LINE] edit the query buffer (or file) with external editor\n"));
fprintf(output, _(" \\ef [FUNCNAME [LINE]] edit function definition with external editor\n"));
fprintf(output, _(" \\p show the contents of the query buffer\n"));
fprintf(output, _(" \\r reset (clear) the query buffer\n"));
#ifdef USE_READLINE
fprintf(output, _(" \\s [FILE] display history or save it to file\n"));
#endif
fprintf(output, _(" \\w FILE write query buffer to file\n"));
fprintf(output, "\n");
fprintf(output, _("Input/Output\n"));
fprintf(output, _(" \\copy ... perform SQL COPY with data stream to the client host\n"));
fprintf(output, _(" \\echo [STRING] write string to standard output\n"));
fprintf(output, _(" \\i FILE execute commands from file\n"));
fprintf(output, _(" \\ir FILE as \\i, but relative to location of current script\n"));
fprintf(output, _(" \\o [FILE] send all query results to file or |pipe\n"));
fprintf(output, _(" \\qecho [STRING] write string to query output stream (see \\o)\n"));
fprintf(output, "\n");
fprintf(output, _("Informational\n"));
fprintf(output, _(" (options: S = show system objects, + = additional detail)\n"));
fprintf(output, _(" \\d[S+] list tables, views, and sequences\n"));
fprintf(output, _(" \\d[S+] NAME describe table, view, sequence, or index\n"));
fprintf(output, _(" \\da[S] [PATTERN] list aggregates\n"));
fprintf(output, _(" \\db[+] [PATTERN] list tablespaces\n"));
fprintf(output, _(" \\dc[S+] [PATTERN] list conversions\n"));
fprintf(output, _(" \\dC[+] [PATTERN] list casts\n"));
fprintf(output, _(" \\dd[S] [PATTERN] show object descriptions not displayed elsewhere\n"));
fprintf(output, _(" \\ddp [PATTERN] list default privileges\n"));
fprintf(output, _(" \\dD[S+] [PATTERN] list domains\n"));
fprintf(output, _(" \\det[+] [PATTERN] list foreign tables\n"));
fprintf(output, _(" \\des[+] [PATTERN] list foreign servers\n"));
fprintf(output, _(" \\deu[+] [PATTERN] list user mappings\n"));
fprintf(output, _(" \\dew[+] [PATTERN] list foreign-data wrappers\n"));
fprintf(output, _(" \\df[antw][S+] [PATRN] list [only agg/normal/trigger/window] functions\n"));
fprintf(output, _(" \\dF[+] [PATTERN] list text search configurations\n"));
fprintf(output, _(" \\dFd[+] [PATTERN] list text search dictionaries\n"));
fprintf(output, _(" \\dFp[+] [PATTERN] list text search parsers\n"));
fprintf(output, _(" \\dFt[+] [PATTERN] list text search templates\n"));
fprintf(output, _(" \\dg[+] [PATTERN] list roles\n"));
fprintf(output, _(" \\di[S+] [PATTERN] list indexes\n"));
fprintf(output, _(" \\dl list large objects, same as \\lo_list\n"));
fprintf(output, _(" \\dL[S+] [PATTERN] list procedural languages\n"));
fprintf(output, _(" \\dm[S+] [PATTERN] list materialized views\n"));
fprintf(output, _(" \\dn[S+] [PATTERN] list schemas\n"));
fprintf(output, _(" \\do[S] [PATTERN] list operators\n"));
fprintf(output, _(" \\dO[S+] [PATTERN] list collations\n"));
fprintf(output, _(" \\dp [PATTERN] list table, view, and sequence access privileges\n"));
fprintf(output, _(" \\drds [PATRN1 [PATRN2]] list per-database role settings\n"));
fprintf(output, _(" \\ds[S+] [PATTERN] list sequences\n"));
fprintf(output, _(" \\dt[S+] [PATTERN] list tables\n"));
fprintf(output, _(" \\dT[S+] [PATTERN] list data types\n"));
fprintf(output, _(" \\du[+] [PATTERN] list roles\n"));
fprintf(output, _(" \\dv[S+] [PATTERN] list views\n"));
fprintf(output, _(" \\dE[S+] [PATTERN] list foreign tables\n"));
fprintf(output, _(" \\dx[+] [PATTERN] list extensions\n"));
fprintf(output, _(" \\dy [PATTERN] list event triggers\n"));
fprintf(output, _(" \\l[+] [PATTERN] list databases\n"));
fprintf(output, _(" \\sf[+] FUNCNAME show a function's definition\n"));
fprintf(output, _(" \\z [PATTERN] same as \\dp\n"));
fprintf(output, "\n");
fprintf(output, _("Formatting\n"));
fprintf(output, _(" \\a toggle between unaligned and aligned output mode\n"));
fprintf(output, _(" \\C [STRING] set table title, or unset if none\n"));
fprintf(output, _(" \\f [STRING] show or set field separator for unaligned query output\n"));
fprintf(output, _(" \\H toggle HTML output mode (currently %s)\n"),
ON(pset.popt.topt.format == PRINT_HTML));
fprintf(output, _(" \\pset [NAME [VALUE]] set table output option\n"
" (NAME := {format|border|expanded|fieldsep|fieldsep_zero|footer|null|\n"
" numericlocale|recordsep|recordsep_zero|tuples_only|title|tableattr|pager|\n"
" unicode_border_linestyle|unicode_column_linestyle|unicode_header_linestyle})\n"));
fprintf(output, _(" \\t [on|off] show only rows (currently %s)\n"),
ON(pset.popt.topt.tuples_only));
fprintf(output, _(" \\T [STRING] set HTML <table> tag attributes, or unset if none\n"));
fprintf(output, _(" \\x [on|off|auto] toggle expanded output (currently %s)\n"),
pset.popt.topt.expanded == 2 ? "auto" : ON(pset.popt.topt.expanded));
fprintf(output, "\n");
fprintf(output, _("Connection\n"));
if (currdb)
fprintf(output, _(" \\c[onnect] {[DBNAME|- USER|- HOST|- PORT|-] | conninfo}\n"
" connect to new database (currently \"%s\")\n"),
currdb);
else
fprintf(output, _(" \\c[onnect] {[DBNAME|- USER|- HOST|- PORT|-] | conninfo}\n"
" connect to new database (currently no connection)\n"));
fprintf(output, _(" \\encoding [ENCODING] show or set client encoding\n"));
fprintf(output, _(" \\password [USERNAME] securely change the password for a user\n"));
fprintf(output, _(" \\conninfo display information about current connection\n"));
fprintf(output, "\n");
fprintf(output, _("Operating System\n"));
fprintf(output, _(" \\cd [DIR] change the current working directory\n"));
fprintf(output, _(" \\setenv NAME [VALUE] set or unset environment variable\n"));
fprintf(output, _(" \\timing [on|off] toggle timing of commands (currently %s)\n"),
ON(pset.timing));
fprintf(output, _(" \\! [COMMAND] execute command in shell or start interactive shell\n"));
fprintf(output, "\n");
fprintf(output, _("Variables\n"));
fprintf(output, _(" \\prompt [TEXT] NAME prompt user to set internal variable\n"));
fprintf(output, _(" \\set [NAME [VALUE]] set internal variable, or list all if no parameters\n"));
fprintf(output, _(" \\unset NAME unset (delete) internal variable\n"));
fprintf(output, "\n");
fprintf(output, _("Large Objects\n"));
fprintf(output, _(" \\lo_export LOBOID FILE\n"
" \\lo_import FILE [COMMENT]\n"
" \\lo_list\n"
" \\lo_unlink LOBOID large object operations\n"));
ClosePager(output);
}
/*
* helpVariables
*
* show list of available variables (options) from command line
*/
void
helpVariables(unsigned short int pager)
{
FILE *output;
output = PageOutput(85, pager ? &(pset.popt.topt) : NULL);
fprintf(output, _("List of specially treated variables\n\n"));
fprintf(output, _("psql variables:\n"));
fprintf(output, _("Usage:\n"));
fprintf(output, _(" psql --set=NAME=VALUE\n or \\set NAME VALUE inside psql\n\n"));
fprintf(output, _(" AUTOCOMMIT if set, successful SQL commands are automatically committed\n"));
fprintf(output, _(" COMP_KEYWORD_CASE determines the case used to complete SQL key words\n"
" [lower, upper, preserve-lower, preserve-upper]\n"));
fprintf(output, _(" DBNAME the currently connected database name\n"));
fprintf(output, _(" ECHO controls what input is written to standard output\n"
" [all, errors, none, queries]\n"));
fprintf(output, _(" ECHO_HIDDEN if set, display internal queries executed by backslash commands;\n"
" if set to \"noexec\", just show without execution\n"));
fprintf(output, _(" ENCODING current client character set encoding\n"));
fprintf(output, _(" FETCH_COUNT the number of result rows to fetch and display at a time\n"
" (default: 0=unlimited)\n"));
fprintf(output, _(" HISTCONTROL controls command history [ignorespace, ignoredups, ignoreboth]\n"));
fprintf(output, _(" HISTFILE file name used to store the command history\n"));
fprintf(output, _(" HISTSIZE the number of commands to store in the command history\n"));
fprintf(output, _(" HOST the currently connected database server host\n"));
fprintf(output, _(" IGNOREEOF if unset, sending an EOF to interactive session terminates application\n"));
fprintf(output, _(" LASTOID value of the last affected OID\n"));
fprintf(output, _(" ON_ERROR_ROLLBACK if set, an error doesn't stop a transaction (uses implicit savepoints)\n"));
fprintf(output, _(" ON_ERROR_STOP stop batch execution after error\n"));
fprintf(output, _(" PORT server port of the current connection\n"));
fprintf(output, _(" PROMPT1 specifies the standard psql prompt\n"));
fprintf(output, _(" PROMPT2 specifies the prompt used when a statement continues from a previous line\n"));
fprintf(output, _(" PROMPT3 specifies the prompt used during COPY ... FROM STDIN\n"));
fprintf(output, _(" QUIET run quietly (same as -q option)\n"));
fprintf(output, _(" SINGLELINE end of line terminates SQL command mode (same as -S option)\n"));
fprintf(output, _(" SINGLESTEP single-step mode (same as -s option)\n"));
fprintf(output, _(" USER the currently connected database user\n"));
fprintf(output, _(" VERBOSITY controls verbosity of error reports [default, verbose, terse]\n"));
fprintf(output, _("\nDisplay settings:\n"));
fprintf(output, _("Usage:\n"));
fprintf(output, _(" psql --pset=NAME[=VALUE]\n or \\pset NAME [VALUE] inside psql\n\n"));
fprintf(output, _(" border border style (number)\n"));
fprintf(output, _(" columns target width for the wrapped format\n"));
fprintf(output, _(" expanded (or x) expanded output [on, off, auto]\n"));
fprintf(output, _(" fieldsep field separator for unaligned output (default \"%s\")\n"), DEFAULT_FIELD_SEP);
fprintf(output, _(" fieldsep_zero set field separator for unaligned output to zero byte\n"));
fprintf(output, _(" format set output format [unaligned, aligned, wrapped, html, asciidoc, ...]\n"));
fprintf(output, _(" footer enable or disable display of the table footer [on, off]\n"));
fprintf(output, _(" linestyle set the border line drawing style [ascii, old-ascii, unicode]\n"));
fprintf(output, _(" null set the string to be printed in place of a null value\n"));
fprintf(output, _(" numericlocale enable or disable display of a locale-specific character to separate\n"
" groups of digits [on, off]\n"));
fprintf(output, _(" pager control when an external pager is used [yes, no, always]\n"));
fprintf(output, _(" recordsep record (line) separator for unaligned output\n"));
fprintf(output, _(" recordsep_zero set record separator for unaligned output to zero byte\n"));
fprintf(output, _(" tableattr (or T) specify attributes for table tag in html format or proportional\n"
" column widths for left-aligned data types in latex-longtable format\n"));
fprintf(output, _(" title set the table title for any subsequently printed tables\n"));
fprintf(output, _(" tuples_only if set, only actual table data is shown\n"));
fprintf(output, _(" unicode_border_linestyle\n"
" unicode_column_linestyle\n"
" unicode_header_linestyle\n"
" set the style of Unicode line drawing [single, double]\n"));
fprintf(output, _("\nEnvironment variables:\n"));
fprintf(output, _("Usage:\n"));
#ifndef WIN32
fprintf(output, _(" NAME=VALUE [NAME=VALUE] psql ...\n or \\setenv NAME [VALUE] inside psql\n\n"));
#else
fprintf(output, _(" set NAME=VALUE\n psql ...\n or \\setenv NAME [VALUE] inside psql\n\n"));
#endif
fprintf(output, _(" COLUMNS number of columns for wrapped format\n"));
fprintf(output, _(" PAGER name of external pager program\n"));
fprintf(output, _(" PGAPPNAME same as the application_name connection parameter\n"));
fprintf(output, _(" PGDATABASE same as the dbname connection parameter\n"));
fprintf(output, _(" PGHOST same as the host connection parameter\n"));
fprintf(output, _(" PGPORT same as the port connection parameter\n"));
fprintf(output, _(" PGUSER same as the user connection parameter\n"));
fprintf(output, _(" PGPASSWORD connection password (not recommended)\n"));
fprintf(output, _(" PGPASSFILE password file name\n"));
fprintf(output, _(" PSQL_EDITOR, EDITOR, VISUAL\n"
" editor used by the \\e and \\ef commands\n"));
fprintf(output, _(" PSQL_EDITOR_LINENUMBER_ARG\n"
" how to specify a line number when invoking the editor\n"));
fprintf(output, _(" PSQL_HISTORY alternative location for the command history file\n"));
fprintf(output, _(" PSQLRC alternative location for the user's .psqlrc file\n"));
fprintf(output, _(" SHELL shell used by the \\! command\n"));
fprintf(output, _(" TMPDIR directory for temporary files\n"));
ClosePager(output);
}
/*
* helpSQL -- help with SQL commands
*
* Note: we assume caller removed any trailing spaces in "topic".
*/
void
helpSQL(const char *topic, unsigned short int pager)
{
#define VALUE_OR_NULL(a) ((a) ? (a) : "")
if (!topic || strlen(topic) == 0)
{
/* Print all the available command names */
int screen_width;
int ncolumns;
int nrows;
FILE *output;
int i;
int j;
#ifdef TIOCGWINSZ
struct winsize screen_size;
if (ioctl(fileno(stdout), TIOCGWINSZ, &screen_size) == -1)
screen_width = 80; /* ioctl failed, assume 80 */
else
screen_width = screen_size.ws_col;
#else
screen_width = 80; /* default assumption */
#endif
ncolumns = (screen_width - 3) / (QL_MAX_CMD_LEN + 1);
ncolumns = Max(ncolumns, 1);
nrows = (QL_HELP_COUNT + (ncolumns - 1)) / ncolumns;
output = PageOutput(nrows + 1, pager ? &(pset.popt.topt) : NULL);
fputs(_("Available help:\n"), output);
for (i = 0; i < nrows; i++)
{
fprintf(output, " ");
for (j = 0; j < ncolumns - 1; j++)
fprintf(output, "%-*s",
QL_MAX_CMD_LEN + 1,
VALUE_OR_NULL(QL_HELP[i + j * nrows].cmd));
if (i + j * nrows < QL_HELP_COUNT)
fprintf(output, "%s",
VALUE_OR_NULL(QL_HELP[i + j * nrows].cmd));
fputc('\n', output);
}
ClosePager(output);
}
else
{
int i,
j,
x = 0;
bool help_found = false;
FILE *output = NULL;
size_t len,
wordlen;
int nl_count = 0;
/*
* We first try exact match, then first + second words, then first
* word only.
*/
len = strlen(topic);
for (x = 1; x <= 3; x++)
{
if (x > 1) /* Nothing on first pass - try the opening
* word(s) */
{
wordlen = j = 1;
while (topic[j] != ' ' && j++ < len)
wordlen++;
if (x == 2)
{
j++;
while (topic[j] != ' ' && j++ <= len)
wordlen++;
}
if (wordlen >= len) /* Don't try again if the same word */
{
if (!output)
output = PageOutput(nl_count, pager ? &(pset.popt.topt) : NULL);
break;
}
len = wordlen;
}
/* Count newlines for pager */
for (i = 0; QL_HELP[i].cmd; i++)
{
if (pg_strncasecmp(topic, QL_HELP[i].cmd, len) == 0 ||
strcmp(topic, "*") == 0)
{
nl_count += 5 + QL_HELP[i].nl_count;
/* If we have an exact match, exit. Fixes \h SELECT */
if (pg_strcasecmp(topic, QL_HELP[i].cmd) == 0)
break;
}
}
if (!output)
output = PageOutput(nl_count, pager ? &(pset.popt.topt) : NULL);
for (i = 0; QL_HELP[i].cmd; i++)
{
if (pg_strncasecmp(topic, QL_HELP[i].cmd, len) == 0 ||
strcmp(topic, "*") == 0)
{
PQExpBufferData buffer;
initPQExpBuffer(&buffer);
QL_HELP[i].syntaxfunc(&buffer);
help_found = true;
fprintf(output, _("Command: %s\n"
"Description: %s\n"
"Syntax:\n%s\n\n"),
QL_HELP[i].cmd,
_(QL_HELP[i].help),
buffer.data);
/* If we have an exact match, exit. Fixes \h SELECT */
if (pg_strcasecmp(topic, QL_HELP[i].cmd) == 0)
break;
}
}
if (help_found) /* Don't keep trying if we got a match */
break;
}
if (!help_found)
fprintf(output, _("No help available for \"%s\".\nTry \\h with no arguments to see available help.\n"), topic);
ClosePager(output);
}
}
void
print_copyright(void)
{
puts(
"PostgreSQL Database Management System\n"
"(formerly known as Postgres, then as Postgres95)\n\n"
"Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group\n\n"
"Portions Copyright (c) 1994, The Regents of the University of California\n\n"
"Permission to use, copy, modify, and distribute this software and its\n"
"documentation for any purpose, without fee, and without a written agreement\n"
"is hereby granted, provided that the above copyright notice and this\n"
"paragraph and the following two paragraphs appear in all copies.\n\n"
"IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR\n"
"DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING\n"
"LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS\n"
"DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE\n"
"POSSIBILITY OF SUCH DAMAGE.\n\n"
"THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,\n"
"INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY\n"
"AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS\n"
"ON AN \"AS IS\" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO\n"
"PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.\n"
);
}

21
src/bin/csql/help.h Normal file

@ -0,0 +1,21 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/help.h
*/
#ifndef HELP_H
#define HELP_H
void usage(unsigned short int pager);
void slashUsage(unsigned short int pager);
void helpVariables(unsigned short int pager);
void helpSQL(const char *topic, unsigned short int pager);
void print_copyright(void);
#endif

539
src/bin/csql/input.c Normal file

@ -0,0 +1,539 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/input.c
*/
#include "postgres_fe.h"
#ifndef WIN32
#include <unistd.h>
#endif
#include <fcntl.h>
#include <limits.h>
#include "input.h"
#include "settings.h"
#include "tab-complete.h"
#include "common.h"
#ifndef WIN32
#define PSQLHISTORY ".psql_history"
#else
#define PSQLHISTORY "psql_history"
#endif
/* Runtime options for turning off readline and history */
/* (of course there is no runtime command for doing that :) */
#ifdef USE_READLINE
static bool useReadline;
static bool useHistory;
static char *psql_history;
static int history_lines_added;
/*
* Preserve newlines in saved queries by mapping '\n' to NL_IN_HISTORY
*
* It is assumed NL_IN_HISTORY will never be entered by the user
* nor appear inside a multi-byte string. 0x00 is not properly
* handled by the readline routines, so it cannot be used
* for this purpose.
*/
#define NL_IN_HISTORY 0x01
#endif
static void finishInput(void);
/*
* gets_interactive()
*
* Gets a line of interactive input, using readline if desired.
* The result is a malloc'd string.
*
* Caller *must* have set up sigint_interrupt_jmp before calling.
*/
char *
gets_interactive(const char *prompt)
{
#ifdef USE_READLINE
if (useReadline)
{
char *result;
/*
* Some versions of readline don't notice SIGWINCH signals that arrive
* when not actively reading input. The simplest fix is to always
* re-read the terminal size. This leaves a window for SIGWINCH to be
* missed between here and where readline() enables libreadline's
* signal handler, but that's probably short enough to be ignored.
*/
#ifdef HAVE_RL_RESET_SCREEN_SIZE
rl_reset_screen_size();
#endif
/* Enable SIGINT to longjmp to sigint_interrupt_jmp */
sigint_interrupt_enabled = true;
/* On some platforms, readline is declared as readline(char *) */
result = readline((char *) prompt);
/* Disable SIGINT again */
sigint_interrupt_enabled = false;
return result;
}
#endif
fputs(prompt, stdout);
fflush(stdout);
return gets_fromFile(stdin);
}
/*
* Append the line to the history buffer, making sure there is a trailing '\n'
*/
void
pg_append_history(const char *s, PQExpBuffer history_buf)
{
#ifdef USE_READLINE
if (useHistory && s)
{
appendPQExpBufferStr(history_buf, s);
if (!s[0] || s[strlen(s) - 1] != '\n')
appendPQExpBufferChar(history_buf, '\n');
}
#endif
}
/*
* Emit accumulated history entry to readline's history mechanism,
* then reset the buffer to empty.
*
* Note: we write nothing if history_buf is empty, so extra calls to this
* function don't hurt. There must have been at least one line added by
* pg_append_history before we'll do anything.
*/
void
pg_send_history(PQExpBuffer history_buf)
{
#ifdef USE_READLINE
static char *prev_hist = NULL;
char *s = history_buf->data;
int i;
/* Trim any trailing \n's (OK to scribble on history_buf) */
for (i = strlen(s) - 1; i >= 0 && s[i] == '\n'; i--)
;
s[i + 1] = '\0';
if (useHistory && s[0])
{
if (((pset.histcontrol & hctl_ignorespace) &&
s[0] == ' ') ||
((pset.histcontrol & hctl_ignoredups) &&
prev_hist && strcmp(s, prev_hist) == 0))
{
/* Ignore this line as far as history is concerned */
}
else
{
/* Save each previous line for ignoredups processing */
if (prev_hist)
free(prev_hist);
prev_hist = pg_strdup(s);
/* And send it to readline */
add_history(s);
/* Count lines added to history for use later */
history_lines_added++;
}
}
resetPQExpBuffer(history_buf);
#endif
}
/*
* gets_fromFile
*
* Gets a line of noninteractive input from a file (which could be stdin).
* The result is a malloc'd string, or NULL on EOF or input error.
*
* Caller *must* have set up sigint_interrupt_jmp before calling.
*
* Note: we re-use a static PQExpBuffer for each call. This is to avoid
* leaking memory if interrupted by SIGINT.
*/
char *
gets_fromFile(FILE *source)
{
static PQExpBuffer buffer = NULL;
char line[1024];
if (buffer == NULL) /* first time through? */
buffer = createPQExpBuffer();
else
resetPQExpBuffer(buffer);
for (;;)
{
char *result;
/* Enable SIGINT to longjmp to sigint_interrupt_jmp */
sigint_interrupt_enabled = true;
/* Get some data */
result = fgets(line, sizeof(line), source);
/* Disable SIGINT again */
sigint_interrupt_enabled = false;
/* EOF or error? */
if (result == NULL)
{
if (ferror(source))
{
psql_error("could not read from input file: %s\n",
strerror(errno));
return NULL;
}
break;
}
appendPQExpBufferStr(buffer, line);
if (PQExpBufferBroken(buffer))
{
psql_error("out of memory\n");
return NULL;
}
/* EOL? */
if (buffer->data[buffer->len - 1] == '\n')
{
buffer->data[buffer->len - 1] = '\0';
return pg_strdup(buffer->data);
}
}
if (buffer->len > 0) /* EOF after reading some bufferload(s) */
return pg_strdup(buffer->data);
/* EOF, so return null */
return NULL;
}
#ifdef USE_READLINE
/*
* Macros to iterate over each element of the history list in order
*
* You would think this would be simple enough, but in its inimitable fashion
* libedit has managed to break it: in libreadline we must use next_history()
* to go from oldest to newest, but in libedit we must use previous_history().
* To detect what to do, we make a trial call of previous_history(): if it
* fails, then either next_history() is what to use, or there's zero or one
* history entry so that it doesn't matter which direction we go.
*
* In case that wasn't disgusting enough: the code below is not as obvious as
* it might appear. In some libedit releases history_set_pos(0) fails until
* at least one add_history() call has been done. This is not an issue for
* printHistory() or encode_history(), which cannot be invoked before that has
* happened. In decode_history(), that's not so, and what actually happens is
* that we are sitting on the newest entry to start with, previous_history()
* fails, and we iterate over all the entries using next_history(). So the
* decode_history() loop iterates over the entries in the wrong order when
* using such a libedit release, and if there were another attempt to use
* BEGIN_ITERATE_HISTORY() before some add_history() call had happened, it
* wouldn't work. Fortunately we don't care about either of those things.
*
* Usage pattern is:
*
* BEGIN_ITERATE_HISTORY(varname);
* {
* loop body referencing varname->line;
* }
* END_ITERATE_HISTORY();
*/
#define BEGIN_ITERATE_HISTORY(VARNAME) \
do { \
HIST_ENTRY *VARNAME; \
bool use_prev_; \
\
history_set_pos(0); \
use_prev_ = (previous_history() != NULL); \
history_set_pos(0); \
for (VARNAME = current_history(); VARNAME != NULL; \
VARNAME = use_prev_ ? previous_history() : next_history()) \
{ \
(void) 0
#define END_ITERATE_HISTORY() \
} \
} while(0)
/*
* Convert newlines to NL_IN_HISTORY for safe saving in readline history file
*/
static void
encode_history(void)
{
BEGIN_ITERATE_HISTORY(cur_hist);
{
char *cur_ptr;
/* some platforms declare HIST_ENTRY.line as const char * */
for (cur_ptr = (char *) cur_hist->line; *cur_ptr; cur_ptr++)
{
if (*cur_ptr == '\n')
*cur_ptr = NL_IN_HISTORY;
}
}
END_ITERATE_HISTORY();
}
/*
* Reverse the above encoding
*/
static void
decode_history(void)
{
BEGIN_ITERATE_HISTORY(cur_hist);
{
char *cur_ptr;
/* some platforms declare HIST_ENTRY.line as const char * */
for (cur_ptr = (char *) cur_hist->line; *cur_ptr; cur_ptr++)
{
if (*cur_ptr == NL_IN_HISTORY)
*cur_ptr = '\n';
}
}
END_ITERATE_HISTORY();
}
#endif /* USE_READLINE */
/*
* Put any startup stuff related to input in here. It's good to maintain
* abstraction this way.
*
* The only "flag" right now is 1 for "use readline & history".
*/
void
initializeInput(int flags)
{
#ifdef USE_READLINE
if (flags & 1)
{
const char *histfile;
char home[MAXPGPATH];
useReadline = true;
/* these two things must be done in this order: */
initialize_readline();
rl_initialize();
useHistory = true;
using_history();
history_lines_added = 0;
histfile = GetVariable(pset.vars, "HISTFILE");
if (histfile == NULL)
{
char *envhist;
envhist = getenv("PSQL_HISTORY");
if (envhist != NULL && strlen(envhist) > 0)
histfile = envhist;
}
if (histfile == NULL)
{
if (get_home_path(home))
psql_history = psprintf("%s/%s", home, PSQLHISTORY);
}
else
{
psql_history = pg_strdup(histfile);
expand_tilde(&psql_history);
}
if (psql_history)
{
read_history(psql_history);
decode_history();
}
}
#endif
atexit(finishInput);
}
/*
* This function saves the readline history when psql exits.
*
* fname: pathname of history file. (Should really be "const char *",
* but some ancient versions of readline omit the const-decoration.)
*
* max_lines: if >= 0, limit history file to that many entries.
*/
#ifdef USE_READLINE
static bool
saveHistory(char *fname, int max_lines)
{
int errnum;
/*
* Suppressing the write attempt when HISTFILE is set to /dev/null may
* look like a negligible optimization, but it's necessary on e.g. Darwin,
* where write_history will fail because it tries to chmod the target
* file.
*/
if (strcmp(fname, DEVNULL) != 0)
{
/*
* Encode \n, since otherwise readline will reload multiline history
* entries as separate lines. (libedit doesn't really need this, but
* we do it anyway since it's too hard to tell which implementation we
* are using.)
*/
encode_history();
/*
* On newer versions of libreadline, truncate the history file as
* needed and then append what we've added. This avoids overwriting
* history from other concurrent sessions (although there are still
* race conditions when two sessions exit at about the same time). If
* we don't have those functions, fall back to write_history().
*/
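/*
 * Worked example (hypothetical numbers): with max_lines = 500 and
 * history_lines_added = 10, the existing file is first truncated to
 * 490 entries and then the 10 new lines are appended, keeping the
 * file at 500 entries.
 */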
#if defined(HAVE_HISTORY_TRUNCATE_FILE) && defined(HAVE_APPEND_HISTORY)
{
int nlines;
int fd;
/* truncate previous entries if needed */
if (max_lines >= 0)
{
nlines = Max(max_lines - history_lines_added, 0);
(void) history_truncate_file(fname, nlines);
}
/* append_history fails if file doesn't already exist :-( */
fd = open(fname, O_CREAT | O_WRONLY | PG_BINARY, 0600);
if (fd >= 0)
close(fd);
/* append the appropriate number of lines */
if (max_lines >= 0)
nlines = Min(max_lines, history_lines_added);
else
nlines = history_lines_added;
errnum = append_history(nlines, fname);
if (errnum == 0)
return true;
}
#else /* don't have append support */
{
/* truncate what we have ... */
if (max_lines >= 0)
stifle_history(max_lines);
/* ... and overwrite file. Tough luck for concurrent sessions. */
errnum = write_history(fname);
if (errnum == 0)
return true;
}
#endif
psql_error("could not save history to file \"%s\": %s\n",
fname, strerror(errnum));
}
return false;
}
#endif
/*
* Print history to the specified file, or to the console if fname is NULL
* (psql \s command)
*
* We used to use saveHistory() for this purpose, but that doesn't permit
* use of a pager; moreover libedit's implementation behaves incompatibly
* (preferring to encode its output) and may fail outright when the target
* file is specified as /dev/tty.
*/
bool
printHistory(const char *fname, unsigned short int pager)
{
#ifdef USE_READLINE
FILE *output;
bool is_pager;
if (!useHistory)
return false;
if (fname == NULL)
{
/* use pager, if enabled, when printing to console */
output = PageOutput(INT_MAX, pager ? &(pset.popt.topt) : NULL);
is_pager = true;
}
else
{
output = fopen(fname, "w");
if (output == NULL)
{
psql_error("could not save history to file \"%s\": %s\n",
fname, strerror(errno));
return false;
}
is_pager = false;
}
BEGIN_ITERATE_HISTORY(cur_hist);
{
fprintf(output, "%s\n", cur_hist->line);
}
END_ITERATE_HISTORY();
if (is_pager)
ClosePager(output);
else
fclose(output);
return true;
#else
psql_error("history is not supported by this installation\n");
return false;
#endif
}
static void
finishInput(void)
{
#ifdef USE_READLINE
if (useHistory && psql_history)
{
int hist_size;
hist_size = GetVariableNum(pset.vars, "HISTSIZE", 500, -1, true);
(void) saveHistory(psql_history, hist_size);
free(psql_history);
psql_history = NULL;
}
#endif
}

51
src/bin/csql/input.h Normal file

@ -0,0 +1,51 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/input.h
*/
#ifndef INPUT_H
#define INPUT_H
/*
* If some other file needs to have access to readline/history, include this
* file and save yourself all this work.
*
* USE_READLINE is the definitive indicator of whether readline support
* is available.
*/
#ifdef HAVE_LIBREADLINE
#define USE_READLINE 1
#if defined(HAVE_READLINE_READLINE_H)
#include <readline/readline.h>
#if defined(HAVE_READLINE_HISTORY_H)
#include <readline/history.h>
#endif
#elif defined(HAVE_EDITLINE_READLINE_H)
#include <editline/readline.h>
#if defined(HAVE_EDITLINE_HISTORY_H)
#include <editline/history.h>
#endif
#elif defined(HAVE_READLINE_H)
#include <readline.h>
#if defined(HAVE_HISTORY_H)
#include <history.h>
#endif
#endif /* HAVE_READLINE_READLINE_H, etc */
#endif /* HAVE_LIBREADLINE */
#include "pqexpbuffer.h"
char *gets_interactive(const char *prompt);
char *gets_fromFile(FILE *source);
void initializeInput(int flags);
bool printHistory(const char *fname, unsigned short int pager);
void pg_append_history(const char *s, PQExpBuffer history_buf);
void pg_send_history(PQExpBuffer history_buf);
#endif /* INPUT_H */

30
src/bin/csql/keywords.c Normal file

@ -0,0 +1,30 @@
/*-------------------------------------------------------------------------
*
* keywords.c
* lexical token lookup for key words in PostgreSQL
*
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/bin/pg_dump/keywords.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include "parser/keywords.h"
/*
* We don't need the token number, so leave it out to avoid requiring other
* backend headers.
*/
#define PG_KEYWORD(a,b,c) {a,0,c},
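/*
 * Illustrative expansion: a kwlist.h entry such as
 * PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD) becomes
 * {"abort", 0, UNRESERVED_KEYWORD} in the array below.
 */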
const ScanKeyword FEScanKeywords[] = {
#include "parser/kwlist.h"
};
const int NumFEScanKeywords = lengthof(FEScanKeywords);

89
src/bin/csql/kwlookup.c Normal file

@ -0,0 +1,89 @@
/*-------------------------------------------------------------------------
*
* kwlookup.c
* lexical token lookup for key words in PostgreSQL
*
* NB - this file is also used by ECPG and several frontend programs in
* src/bin/ including pg_dump and psql
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/parser/kwlookup.c
*
*-------------------------------------------------------------------------
*/
/* use c.h so this can be built as either frontend or backend */
#include "c.h"
#include <ctype.h>
#include "parser/keywords.h"
/*
* ScanKeywordLookup - see if a given word is a keyword
*
* Returns a pointer to the ScanKeyword table entry, or NULL if no match.
*
* The match is done case-insensitively. Note that we deliberately use a
* dumbed-down case conversion that will only translate 'A'-'Z' into 'a'-'z',
* even if we are in a locale where tolower() would produce more or different
* translations. This is to conform to the SQL99 spec, which says that
* keywords are to be matched in this way even though non-keyword identifiers
* receive a different case-normalization mapping.
*/
const ScanKeyword *
ScanKeywordLookup(const char *text,
const ScanKeyword *keywords,
int num_keywords)
{
int len,
i;
char word[NAMEDATALEN];
const ScanKeyword *low;
const ScanKeyword *high;
len = strlen(text);
/* We assume all keywords are shorter than NAMEDATALEN. */
if (len >= NAMEDATALEN)
return NULL;
/*
* Apply an ASCII-only downcasing. We must not use tolower() since it may
* produce the wrong translation in some locales (eg, Turkish).
*/
for (i = 0; i < len; i++)
{
char ch = text[i];
if (ch >= 'A' && ch <= 'Z')
ch += 'a' - 'A';
word[i] = ch;
}
word[len] = '\0';
/*
* Now do a binary search using plain strcmp() comparison.
*/
low = keywords;
high = keywords + (num_keywords - 1);
while (low <= high)
{
const ScanKeyword *middle;
int difference;
middle = low + (high - low) / 2;
difference = strcmp(middle->name, word);
if (difference == 0)
return middle;
else if (difference < 0)
low = middle + 1;
else
high = middle - 1;
}
return NULL;
}
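/*
 * Illustrative usage with the frontend keyword list from keywords.c:
 *
 *     if (ScanKeywordLookup("select", FEScanKeywords, NumFEScanKeywords))
 *         ... treat the word as a keyword, e.g. quote it ...
 *
 * The comparison is case-insensitive, so "SELECT" matches as well.
 */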

321
src/bin/csql/large_obj.c Normal file

@ -0,0 +1,321 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/large_obj.c
*/
#include "postgres_fe.h"
#include "large_obj.h"
#include "settings.h"
#include "common.h"
#if (PG_VERSION_NUM >= 90500)
static void print_lo_result(const char *fmt,...) pg_attribute_printf(1, 2);
#else
static void
print_lo_result(const char *fmt,...)
__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
#endif
static void
print_lo_result(const char *fmt,...)
{
va_list ap;
if (!pset.quiet)
{
if (pset.popt.topt.format == PRINT_HTML)
fputs("<p>", pset.queryFout);
va_start(ap, fmt);
vfprintf(pset.queryFout, fmt, ap);
va_end(ap);
if (pset.popt.topt.format == PRINT_HTML)
fputs("</p>\n", pset.queryFout);
else
fputs("\n", pset.queryFout);
}
if (pset.logfile)
{
va_start(ap, fmt);
vfprintf(pset.logfile, fmt, ap);
va_end(ap);
fputs("\n", pset.logfile);
}
}
/*
* Prepare to do a large-object operation. We *must* be inside a transaction
* block for all these operations, so start one if needed.
*
* Returns TRUE if okay, FALSE if failed. *own_transaction is set to indicate
* if we started our own transaction or not.
*/
static bool
start_lo_xact(const char *operation, bool *own_transaction)
{
PGTransactionStatusType tstatus;
PGresult *res;
*own_transaction = false;
if (!pset.db)
{
psql_error("%s: not connected to a database\n", operation);
return false;
}
tstatus = PQtransactionStatus(pset.db);
switch (tstatus)
{
case PQTRANS_IDLE:
/* need to start our own xact */
if (!(res = PSQLexec("BEGIN")))
return false;
PQclear(res);
*own_transaction = true;
break;
case PQTRANS_INTRANS:
/* use the existing xact */
break;
case PQTRANS_INERROR:
psql_error("%s: current transaction is aborted\n", operation);
return false;
default:
psql_error("%s: unknown transaction status\n", operation);
return false;
}
return true;
}
/*
* Clean up after a successful LO operation
*/
static bool
finish_lo_xact(const char *operation, bool own_transaction)
{
PGresult *res;
if (own_transaction && pset.autocommit)
{
/* close out our own xact */
if (!(res = PSQLexec("COMMIT")))
{
res = PSQLexec("ROLLBACK");
PQclear(res);
return false;
}
PQclear(res);
}
return true;
}
/*
* Clean up after a failed LO operation
*/
static bool
fail_lo_xact(const char *operation, bool own_transaction)
{
PGresult *res;
if (own_transaction && pset.autocommit)
{
/* close out our own xact */
res = PSQLexec("ROLLBACK");
PQclear(res);
}
return false; /* always */
}
/*
* do_lo_export()
*
* Write a large object to a file
*/
bool
do_lo_export(const char *loid_arg, const char *filename_arg)
{
int status;
bool own_transaction;
if (!start_lo_xact("\\lo_export", &own_transaction))
return false;
SetCancelConn();
status = lo_export(pset.db, atooid(loid_arg), filename_arg);
ResetCancelConn();
/* of course this status is documented nowhere :( */
if (status != 1)
{
psql_error("%s", PQerrorMessage(pset.db));
return fail_lo_xact("\\lo_export", own_transaction);
}
if (!finish_lo_xact("\\lo_export", own_transaction))
return false;
print_lo_result("lo_export");
return true;
}
/*
* do_lo_import()
*
* Copy large object from file to database
*/
bool
do_lo_import(const char *filename_arg, const char *comment_arg)
{
PGresult *res;
Oid loid;
char oidbuf[32];
bool own_transaction;
if (!start_lo_xact("\\lo_import", &own_transaction))
return false;
SetCancelConn();
loid = lo_import(pset.db, filename_arg);
ResetCancelConn();
if (loid == InvalidOid)
{
psql_error("%s", PQerrorMessage(pset.db));
return fail_lo_xact("\\lo_import", own_transaction);
}
/* insert description if given */
if (comment_arg)
{
char *cmdbuf;
char *bufptr;
size_t slen = strlen(comment_arg);
cmdbuf = malloc(slen * 2 + 256);
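/* worst case: PQescapeStringConn doubles every byte, plus room for the SQL wrapper */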
if (!cmdbuf)
return fail_lo_xact("\\lo_import", own_transaction);
sprintf(cmdbuf, "COMMENT ON LARGE OBJECT %u IS '", loid);
bufptr = cmdbuf + strlen(cmdbuf);
bufptr += PQescapeStringConn(pset.db, bufptr, comment_arg, slen, NULL);
strcpy(bufptr, "'");
if (!(res = PSQLexec(cmdbuf)))
{
free(cmdbuf);
return fail_lo_xact("\\lo_import", own_transaction);
}
PQclear(res);
free(cmdbuf);
}
if (!finish_lo_xact("\\lo_import", own_transaction))
return false;
print_lo_result("lo_import %u", loid);
sprintf(oidbuf, "%u", loid);
SetVariable(pset.vars, "LASTOID", oidbuf);
return true;
}
/*
* do_lo_unlink()
*
* removes a large object out of the database
*/
bool
do_lo_unlink(const char *loid_arg)
{
int status;
Oid loid = atooid(loid_arg);
bool own_transaction;
if (!start_lo_xact("\\lo_unlink", &own_transaction))
return false;
SetCancelConn();
status = lo_unlink(pset.db, loid);
ResetCancelConn();
if (status == -1)
{
psql_error("%s", PQerrorMessage(pset.db));
return fail_lo_xact("\\lo_unlink", own_transaction);
}
if (!finish_lo_xact("\\lo_unlink", own_transaction))
return false;
print_lo_result("lo_unlink %u", loid);
return true;
}
/*
* do_lo_list()
*
* Show all large objects in database with comments
*/
bool
do_lo_list(void)
{
PGresult *res;
char buf[1024];
printQueryOpt myopt = pset.popt;
if (pset.sversion >= 90000)
{
snprintf(buf, sizeof(buf),
"SELECT oid as \"%s\",\n"
" pg_catalog.pg_get_userbyid(lomowner) as \"%s\",\n"
" pg_catalog.obj_description(oid, 'pg_largeobject') as \"%s\"\n"
" FROM pg_catalog.pg_largeobject_metadata "
" ORDER BY oid",
gettext_noop("ID"),
gettext_noop("Owner"),
gettext_noop("Description"));
}
else
{
snprintf(buf, sizeof(buf),
"SELECT loid as \"%s\",\n"
" pg_catalog.obj_description(loid, 'pg_largeobject') as \"%s\"\n"
"FROM (SELECT DISTINCT loid FROM pg_catalog.pg_largeobject) x\n"
"ORDER BY 1",
gettext_noop("ID"),
gettext_noop("Description"));
}
res = PSQLexec(buf);
if (!res)
return false;
myopt.topt.tuples_only = false;
myopt.nullPrint = NULL;
myopt.title = _("Large objects");
myopt.translate_header = true;
printQuery(res, &myopt, pset.queryFout, false, pset.logfile);
PQclear(res);
return true;
}

16
src/bin/csql/large_obj.h Normal file

@ -0,0 +1,16 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/large_obj.h
*/
#ifndef LARGE_OBJ_H
#define LARGE_OBJ_H
bool do_lo_export(const char *loid_arg, const char *filename_arg);
bool do_lo_import(const char *filename_arg, const char *comment_arg);
bool do_lo_unlink(const char *loid_arg);
bool do_lo_list(void);
#endif /* LARGE_OBJ_H */

462
src/bin/csql/mainloop.c Normal file

@ -0,0 +1,462 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/mainloop.c
*/
#include "postgres_fe.h"
#include "mainloop.h"
#include "command.h"
#include "common.h"
#include "input.h"
#include "settings.h"
#include "mb/pg_wchar.h"
/*
* Main processing loop for reading lines of input
* and sending them to the backend.
*
* This loop is re-entrant. May be called by \i command
* which reads input from a file.
*/
int
MainLoop(FILE *source)
{
PsqlScanState scan_state; /* lexer working state */
volatile PQExpBuffer query_buf; /* buffer for query being accumulated */
volatile PQExpBuffer previous_buf; /* if there isn't anything in the new
* buffer yet, use this one for \e,
* etc. */
PQExpBuffer history_buf; /* earlier lines of a multi-line command, not
* yet saved to readline history */
char *line; /* current line of input */
int added_nl_pos;
bool success;
bool line_saved_in_history;
volatile int successResult = EXIT_SUCCESS;
volatile backslashResult slashCmdStatus = PSQL_CMD_UNKNOWN;
volatile promptStatus_t prompt_status = PROMPT_READY;
volatile int count_eof = 0;
volatile bool die_on_error = false;
/* Save the prior command source */
FILE *prev_cmd_source;
bool prev_cmd_interactive;
uint64 prev_lineno;
/* Save old settings */
prev_cmd_source = pset.cur_cmd_source;
prev_cmd_interactive = pset.cur_cmd_interactive;
prev_lineno = pset.lineno;
/* Establish new source */
pset.cur_cmd_source = source;
pset.cur_cmd_interactive = ((source == stdin) && !pset.notty);
pset.lineno = 0;
pset.stmt_lineno = 1;
/* Create working state */
scan_state = psql_scan_create();
query_buf = createPQExpBuffer();
previous_buf = createPQExpBuffer();
history_buf = createPQExpBuffer();
if (PQExpBufferBroken(query_buf) ||
PQExpBufferBroken(previous_buf) ||
PQExpBufferBroken(history_buf))
{
psql_error("out of memory\n");
exit(EXIT_FAILURE);
}
/* main loop to get queries and execute them */
while (successResult == EXIT_SUCCESS)
{
/*
* Clean up after a previous Control-C
*/
if (cancel_pressed)
{
if (!pset.cur_cmd_interactive)
{
/*
* You get here if you stopped a script with Ctrl-C.
*/
successResult = EXIT_USER;
break;
}
cancel_pressed = false;
}
/*
* Establish longjmp destination for exiting from wait-for-input. We
* must re-do this each time through the loop for safety, since the
* jmpbuf might get changed during command execution.
*/
if (sigsetjmp(sigint_interrupt_jmp, 1) != 0)
{
/* got here with longjmp */
/* reset parsing state */
psql_scan_finish(scan_state);
psql_scan_reset(scan_state);
resetPQExpBuffer(query_buf);
resetPQExpBuffer(history_buf);
count_eof = 0;
slashCmdStatus = PSQL_CMD_UNKNOWN;
prompt_status = PROMPT_READY;
pset.stmt_lineno = 1;
cancel_pressed = false;
if (pset.cur_cmd_interactive)
putc('\n', stdout);
else
{
successResult = EXIT_USER;
break;
}
}
fflush(stdout);
/*
* get another line
*/
if (pset.cur_cmd_interactive)
{
/* May need to reset prompt, eg after \r command */
if (query_buf->len == 0)
prompt_status = PROMPT_READY;
line = gets_interactive(get_prompt(prompt_status));
}
else
{
line = gets_fromFile(source);
if (!line && ferror(source))
successResult = EXIT_FAILURE;
}
/*
* query_buf holds query already accumulated. line is the malloc'd
* new line of input (note it must be freed before looping around!)
*/
/* No more input. Time to quit, or \i done */
if (line == NULL)
{
if (pset.cur_cmd_interactive)
{
/* This tries to mimic bash's IGNOREEOF feature. */
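/*
 * Illustrative: with IGNOREEOF set to 3, the first two Ctrl-D
 * presses print the "\q" hint below and only the third quits.
 */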
count_eof++;
if (count_eof < GetVariableNum(pset.vars, "IGNOREEOF", 0, 10, false))
{
if (!pset.quiet)
printf(_("Use \"\\q\" to leave %s.\n"), pset.progname);
continue;
}
puts(pset.quiet ? "" : "\\q");
}
break;
}
count_eof = 0;
pset.lineno++;
/* ignore UTF-8 Unicode byte-order mark */
if (pset.lineno == 1 && pset.encoding == PG_UTF8 && strncmp(line, "\xef\xbb\xbf", 3) == 0)
memmove(line, line + 3, strlen(line + 3) + 1);
/* Detect attempts to run custom-format dumps as SQL scripts */
if (pset.lineno == 1 && !pset.cur_cmd_interactive &&
strncmp(line, "PGDMP", 5) == 0)
{
free(line);
puts(_("The input is a PostgreSQL custom-format dump.\n"
"Use the pg_restore command-line client to restore this dump to a database.\n"));
fflush(stdout);
successResult = EXIT_FAILURE;
break;
}
/* no further processing of empty lines, unless within a literal */
if (line[0] == '\0' && !psql_scan_in_quote(scan_state))
{
free(line);
continue;
}
/* A request for help? Be friendly and give them some guidance */
if (pset.cur_cmd_interactive && query_buf->len == 0 &&
pg_strncasecmp(line, "help", 4) == 0 &&
(line[4] == '\0' || line[4] == ';' || isspace((unsigned char) line[4])))
{
free(line);
puts(_("You are using csql, the command-line interface to CitusDB."));
printf(_("Type: \\copyright for distribution terms\n"
" \\h for help with SQL commands\n"
" \\? for help with csql commands\n"
" \\g or terminate with semicolon to execute query\n"
" \\q to quit\n"));
fflush(stdout);
continue;
}
/* echo back if flag is set, unless interactive */
if (pset.echo == PSQL_ECHO_ALL && !pset.cur_cmd_interactive)
{
puts(line);
fflush(stdout);
}
/* insert newlines into query buffer between source lines */
if (query_buf->len > 0)
{
appendPQExpBufferChar(query_buf, '\n');
added_nl_pos = query_buf->len;
}
else
added_nl_pos = -1; /* flag we didn't add one */
/* Setting this will not have effect until next line. */
die_on_error = pset.on_error_stop;
/*
* Parse line, looking for command separators.
*/
psql_scan_setup(scan_state, line, strlen(line));
success = true;
line_saved_in_history = false;
while (success || !die_on_error)
{
PsqlScanResult scan_result;
promptStatus_t prompt_tmp = prompt_status;
size_t pos_in_query;
char *tmp_line;
pos_in_query = query_buf->len;
scan_result = psql_scan(scan_state, query_buf, &prompt_tmp);
prompt_status = prompt_tmp;
if (PQExpBufferBroken(query_buf))
{
psql_error("out of memory\n");
exit(EXIT_FAILURE);
}
/*
* Increase statement line number counter for each linebreak added
* to the query buffer by the last psql_scan() call. Linebreaks only
* need to be added when navigating to a statement in readline's
* history that contains newlines.
*/
tmp_line = query_buf->data + pos_in_query;
while (*tmp_line != '\0')
{
if (*(tmp_line++) == '\n')
pset.stmt_lineno++;
}
if (scan_result == PSCAN_EOL)
pset.stmt_lineno++;
/*
* Send command if semicolon found, or if end of line and we're in
* single-line mode.
*/
if (scan_result == PSCAN_SEMICOLON ||
(scan_result == PSCAN_EOL && pset.singleline))
{
/*
* Save query in history. We use history_buf to accumulate
* multi-line queries into a single history entry.
*/
if (pset.cur_cmd_interactive && !line_saved_in_history)
{
pg_append_history(line, history_buf);
pg_send_history(history_buf);
line_saved_in_history = true;
}
/* execute query */
success = SendQuery(query_buf->data);
slashCmdStatus = success ? PSQL_CMD_SEND : PSQL_CMD_ERROR;
pset.stmt_lineno = 1;
/* transfer query to previous_buf by pointer-swapping */
{
PQExpBuffer swap_buf = previous_buf;
previous_buf = query_buf;
query_buf = swap_buf;
}
resetPQExpBuffer(query_buf);
added_nl_pos = -1;
/* we need not do psql_scan_reset() here */
}
else if (scan_result == PSCAN_BACKSLASH)
{
/* handle backslash command */
/*
* If we added a newline to query_buf, and nothing else has
* been inserted in query_buf by the lexer, then strip off the
* newline again. This avoids any change to query_buf when a
* line contains only a backslash command. Also, in this
* situation we force out any previous lines as a separate
* history entry; we don't want SQL and backslash commands
* intermixed in history if at all possible.
*/
if (query_buf->len == added_nl_pos)
{
query_buf->data[--query_buf->len] = '\0';
pg_send_history(history_buf);
}
added_nl_pos = -1;
/* save backslash command in history */
if (pset.cur_cmd_interactive && !line_saved_in_history)
{
pg_append_history(line, history_buf);
pg_send_history(history_buf);
line_saved_in_history = true;
}
/* execute backslash command */
slashCmdStatus = HandleSlashCmds(scan_state,
query_buf->len > 0 ?
query_buf : previous_buf);
success = slashCmdStatus != PSQL_CMD_ERROR;
pset.stmt_lineno = 1;
if ((slashCmdStatus == PSQL_CMD_SEND || slashCmdStatus == PSQL_CMD_NEWEDIT) &&
query_buf->len == 0)
{
/* copy previous buffer to current for handling */
appendPQExpBufferStr(query_buf, previous_buf->data);
}
if (slashCmdStatus == PSQL_CMD_SEND)
{
success = SendQuery(query_buf->data);
/* transfer query to previous_buf by pointer-swapping */
{
PQExpBuffer swap_buf = previous_buf;
previous_buf = query_buf;
query_buf = swap_buf;
}
resetPQExpBuffer(query_buf);
/* flush any paren nesting info after forced send */
psql_scan_reset(scan_state);
}
else if (slashCmdStatus == PSQL_CMD_NEWEDIT)
{
/* rescan query_buf as new input */
psql_scan_finish(scan_state);
free(line);
line = pg_strdup(query_buf->data);
resetPQExpBuffer(query_buf);
/* reset parsing state since we are rescanning whole line */
psql_scan_reset(scan_state);
psql_scan_setup(scan_state, line, strlen(line));
line_saved_in_history = false;
prompt_status = PROMPT_READY;
}
else if (slashCmdStatus == PSQL_CMD_TERMINATE)
break;
}
/* fall out of loop if lexer reached EOL */
if (scan_result == PSCAN_INCOMPLETE ||
scan_result == PSCAN_EOL)
break;
}
/* Add line to pending history if we didn't execute anything yet */
if (pset.cur_cmd_interactive && !line_saved_in_history)
pg_append_history(line, history_buf);
psql_scan_finish(scan_state);
free(line);
if (slashCmdStatus == PSQL_CMD_TERMINATE)
{
successResult = EXIT_SUCCESS;
break;
}
if (!pset.cur_cmd_interactive)
{
if (!success && die_on_error)
successResult = EXIT_USER;
/* Have we lost the db connection? */
else if (!pset.db)
successResult = EXIT_BADCONN;
}
} /* while !endoffile/session */
/*
* Process query at the end of file without a semicolon
*/
if (query_buf->len > 0 && !pset.cur_cmd_interactive &&
successResult == EXIT_SUCCESS)
{
/* save query in history */
if (pset.cur_cmd_interactive)
pg_send_history(history_buf);
/* execute query */
success = SendQuery(query_buf->data);
if (!success && die_on_error)
successResult = EXIT_USER;
else if (pset.db == NULL)
successResult = EXIT_BADCONN;
}
/*
* Let's just make real sure the SIGINT handler won't try to use
* sigint_interrupt_jmp after we exit this routine. If there is an outer
* MainLoop instance, it will reset sigint_interrupt_jmp to point to
* itself at the top of its loop, before any further interactive input
* happens.
*/
sigint_interrupt_enabled = false;
destroyPQExpBuffer(query_buf);
destroyPQExpBuffer(previous_buf);
destroyPQExpBuffer(history_buf);
psql_scan_destroy(scan_state);
pset.cur_cmd_source = prev_cmd_source;
pset.cur_cmd_interactive = prev_cmd_interactive;
pset.lineno = prev_lineno;
return successResult;
} /* MainLoop() */
/*
* psqlscan.c is #include'd here instead of being compiled on its own.
* This is because we need postgres_fe.h to be read before any system
* include files, else things tend to break on platforms that have
* multiple infrastructures for stdio.h and so on. flex is absolutely
* uncooperative about that, so we can't compile psqlscan.c on its own.
*/
#include "psqlscan.c"

15
src/bin/csql/mainloop.h Normal file

@ -0,0 +1,15 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/mainloop.h
*/
#ifndef MAINLOOP_H
#define MAINLOOP_H
#include "postgres_fe.h"
int MainLoop(FILE *source);
#endif /* MAINLOOP_H */

398
src/bin/csql/mbprint.c Normal file

@ -0,0 +1,398 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/mbprint.c
*
* XXX this file does not really belong in psql/. Perhaps move to libpq?
* It also seems that the mbvalidate function is redundant with existing
* functionality.
*/
#include "postgres_fe.h"
#include "mbprint.h"
#ifndef PGSCRIPTS
#include "settings.h"
#endif
/*
* To avoid version-skew problems, this file must not use declarations
* from pg_wchar.h: the encoding IDs we are dealing with are determined
* by the libpq.so we are linked with, and that might not match the
* numbers we see at compile time. (If this file were inside libpq,
* the problem would go away...)
*
* Hence, we have our own definition of pg_wchar, and we get the values
* of any needed encoding IDs on-the-fly.
*/
typedef unsigned int pg_wchar;
static int
pg_get_utf8_id(void)
{
static int utf8_id = -1;
if (utf8_id < 0)
utf8_id = pg_char_to_encoding("utf8");
return utf8_id;
}
#define PG_UTF8 pg_get_utf8_id()
/*
* Convert a UTF-8 character to a Unicode code point.
* This is a one-character version of pg_utf2wchar_with_len.
*
* No error checks here, c must point to a long-enough string.
*/
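/*
 * Worked example: the Euro sign U+20AC is encoded in UTF-8 as the
 * bytes 0xE2 0x82 0xAC; the three-byte branch below reassembles it
 * as (0x2 << 12) | (0x02 << 6) | 0x2C = 0x20AC.
 */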
static pg_wchar
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
return (pg_wchar) c[0];
else if ((*c & 0xe0) == 0xc0)
return (pg_wchar) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
return (pg_wchar) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
return (pg_wchar) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
else
/* that is an invalid code on purpose */
return 0xffffffff;
}
/*
* Unicode 3.1 compliant validation: for each category, it checks the
* combination of each byte to make sure it maps to a valid range. It also
* returns -1 for the following UCS values:
*   ucs > 0x10ffff
*   (ucs & 0xfffe) == 0xfffe
*   0xfdd0 < ucs < 0xfdef
*   (ucs & 0xdb00) == 0xd800 (surrogates)
*/
static int
utf_charcheck(const unsigned char *c)
{
if ((*c & 0x80) == 0)
return 1;
else if ((*c & 0xe0) == 0xc0)
{
/* two-byte char */
if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
return 2;
return -1;
}
else if ((*c & 0xf0) == 0xe0)
{
/* three-byte char */
if (((c[1] & 0xc0) == 0x80) &&
(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
((c[2] & 0xc0) == 0x80))
{
int z = c[0] & 0x0f;
int yx = ((c[1] & 0x3f) << 6) | (c[0] & 0x3f);
int lx = yx & 0x7f;
/* check 0xfffe/0xffff, 0xfdd0..0xfdef range, surrogates */
if (((z == 0x0f) &&
(((yx & 0xffe) == 0xffe) ||
(((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) ||
((z == 0x0d) && ((yx & 0xb00) == 0x800)))
return -1;
return 3;
}
return -1;
}
else if ((*c & 0xf8) == 0xf0)
{
int u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);
/* four-byte char */
if (((c[1] & 0xc0) == 0x80) &&
(u > 0x00) && (u <= 0x10) &&
((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
{
/* test for 0xzzzzfffe/0xzzzzffff */
if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
((c[3] & 0x3e) == 0x3e))
return -1;
return 4;
}
return -1;
}
return -1;
}
static void
mb_utf_validate(unsigned char *pwcs)
{
unsigned char *p = pwcs;
while (*pwcs)
{
int len;
if ((len = utf_charcheck(pwcs)) > 0)
{
if (p != pwcs)
{
int i;
for (i = 0; i < len; i++)
*p++ = *pwcs++;
}
else
{
pwcs += len;
p += len;
}
}
else
/* we skip the char */
pwcs++;
}
if (p != pwcs)
*p = '\0';
}
/*
* public functions : wcswidth and mbvalidate
*/
/*
* pg_wcswidth is the dumb display-width function.
* It assumes that everything will appear on one line.
* OTOH it is easier to use than pg_wcssize if this applies to you.
*/
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
int width = 0;
while (len > 0)
{
int chlen,
chwidth;
chlen = PQmblen(pwcs, encoding);
if (len < (size_t) chlen)
break; /* Invalid string */
chwidth = PQdsplen(pwcs, encoding);
if (chwidth > 0)
width += chwidth;
pwcs += chlen;
len -= chlen;
}
return width;
}
/*
* pg_wcssize takes the given string in the given encoding and returns three
* values:
* result_width: Width in display characters of the longest line in string
* result_height: Number of lines in display output
* result_format_size: Number of bytes required to store formatted
* representation of string
*
* This MUST be kept in sync with pg_wcsformat!
*/
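/*
 * Worked example: for the UTF-8 input "ab\tc\ndef", the tab pads the
 * first line out to the next multiple of 8, so result_width is 9
 * ("ab" + padding + "c"), result_height is 2, and result_format_size
 * counts one byte per output character plus a NUL per line.
 */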
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
int *result_width, int *result_height, int *result_format_size)
{
int w,
chlen = 0,
linewidth = 0;
int width = 0;
int height = 1;
int format_size = 0;
for (; *pwcs && len > 0; pwcs += chlen)
{
chlen = PQmblen((const char *) pwcs, encoding);
if (len < (size_t) chlen)
break;
w = PQdsplen((const char *) pwcs, encoding);
if (chlen == 1) /* single-byte char */
{
if (*pwcs == '\n') /* Newline */
{
if (linewidth > width)
width = linewidth;
linewidth = 0;
height += 1;
format_size += 1; /* For NUL char */
}
else if (*pwcs == '\r') /* Carriage return */
{
linewidth += 2;
format_size += 2;
}
else if (*pwcs == '\t') /* Tab */
{
do
{
linewidth++;
format_size++;
} while (linewidth % 8 != 0);
}
else if (w < 0) /* Other control char */
{
linewidth += 4;
format_size += 4;
}
else /* Output it as-is */
{
linewidth += w;
format_size += 1;
}
}
else if (w < 0) /* Non-ascii control char */
{
linewidth += 6; /* \u0000 */
format_size += 6;
}
else /* All other chars */
{
linewidth += w;
format_size += chlen;
}
len -= chlen;
}
if (linewidth > width)
width = linewidth;
format_size += 1; /* For NUL char */
/* Set results */
if (result_width)
*result_width = width;
if (result_height)
*result_height = height;
if (result_format_size)
*result_format_size = format_size;
}
/*
* Format a string into one or more "struct lineptr" lines.
* lines[i].ptr == NULL indicates the end of the array.
*
* This MUST be kept in sync with pg_wcssize!
*/
void
pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
struct lineptr * lines, int count)
{
int w,
chlen = 0;
int linewidth = 0;
unsigned char *ptr = lines->ptr; /* Pointer to data area */
for (; *pwcs && len > 0; pwcs += chlen)
{
chlen = PQmblen((const char *) pwcs, encoding);
if (len < (size_t) chlen)
break;
w = PQdsplen((const char *) pwcs, encoding);
if (chlen == 1) /* single-byte char */
{
if (*pwcs == '\n') /* Newline */
{
*ptr++ = '\0';
lines->width = linewidth;
linewidth = 0;
lines++;
count--;
if (count <= 0)
exit(1); /* Screwup */
/* make next line point to remaining memory */
lines->ptr = ptr;
}
else if (*pwcs == '\r') /* Carriage return */
{
strcpy((char *) ptr, "\\r");
linewidth += 2;
ptr += 2;
}
else if (*pwcs == '\t') /* Tab */
{
do
{
*ptr++ = ' ';
linewidth++;
} while (linewidth % 8 != 0);
}
else if (w < 0) /* Other control char */
{
sprintf((char *) ptr, "\\x%02X", *pwcs);
linewidth += 4;
ptr += 4;
}
else /* Output it as-is */
{
linewidth += w;
*ptr++ = *pwcs;
}
}
else if (w < 0) /* Non-ascii control char */
{
if (encoding == PG_UTF8)
sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
else
{
/*
* This case cannot happen in the current code because only
* UTF-8 signals multibyte control characters. But we may need
* to support it at some stage.
*/
sprintf((char *) ptr, "\\u????");
}
ptr += 6;
linewidth += 6;
}
else /* All other chars */
{
int i;
for (i = 0; i < chlen; i++)
*ptr++ = pwcs[i];
linewidth += w;
}
len -= chlen;
}
lines->width = linewidth;
*ptr++ = '\0'; /* Terminate formatted string */
if (count <= 0)
exit(1); /* Screwup */
(lines + 1)->ptr = NULL; /* terminate line array */
}
unsigned char *
mbvalidate(unsigned char *pwcs, int encoding)
{
if (encoding == PG_UTF8)
mb_utf_validate(pwcs);
else
{
/*
* other encodings needing validation should add their own routines
* here
*/
}
return pwcs;
}

18
src/bin/csql/mbprint.h Normal file

@ -0,0 +1,18 @@
/* src/bin/psql/mbprint.h */
#ifndef MBPRINT_H
#define MBPRINT_H
struct lineptr
{
unsigned char *ptr;
int width;
};
extern unsigned char *mbvalidate(unsigned char *pwcs, int encoding);
extern int pg_wcswidth(const char *pwcs, size_t len, int encoding);
extern void pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, struct lineptr * lines, int count);
extern void pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
int *width, int *height, int *format_size);
#endif /* MBPRINT_H */

3495
src/bin/csql/print.c Normal file

File diff suppressed because it is too large

206
src/bin/csql/print.h Normal file

@ -0,0 +1,206 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/print.h
*/
#ifndef PRINT_H
#define PRINT_H
#include "libpq-fe.h"
enum printFormat
{
PRINT_NOTHING = 0, /* to make sure someone initializes this */
PRINT_UNALIGNED,
PRINT_ALIGNED,
PRINT_WRAPPED,
PRINT_HTML,
PRINT_ASCIIDOC,
PRINT_LATEX,
PRINT_LATEX_LONGTABLE,
PRINT_TROFF_MS
/* add your favourite output format here ... */
};
typedef struct printTextLineFormat
{
/* Line drawing characters to be used in various contexts */
const char *hrule; /* horizontal line character */
const char *leftvrule; /* left vertical line (+horizontal) */
const char *midvrule; /* intra-column vertical line (+horizontal) */
const char *rightvrule; /* right vertical line (+horizontal) */
} printTextLineFormat;
typedef enum printTextRule
{
/* Additional context for selecting line drawing characters */
PRINT_RULE_TOP, /* top horizontal line */
PRINT_RULE_MIDDLE, /* intra-data horizontal line */
PRINT_RULE_BOTTOM, /* bottom horizontal line */
PRINT_RULE_DATA /* data line (hrule is unused here) */
} printTextRule;
typedef enum printTextLineWrap
{
/* Line wrapping conditions */
PRINT_LINE_WRAP_NONE, /* No wrapping */
PRINT_LINE_WRAP_WRAP, /* Wraparound due to overlength line */
PRINT_LINE_WRAP_NEWLINE /* Newline in data */
} printTextLineWrap;
typedef struct printTextFormat
{
/* A complete line style */
const char *name; /* for display purposes */
printTextLineFormat lrule[4]; /* indexed by enum printTextRule */
const char *midvrule_nl; /* vertical line for continue after newline */
const char *midvrule_wrap; /* vertical line for wrapped data */
const char *midvrule_blank; /* vertical line for blank data */
const char *header_nl_left; /* left mark after newline */
const char *header_nl_right; /* right mark for newline */
const char *nl_left; /* left mark after newline */
const char *nl_right; /* right mark for newline */
const char *wrap_left; /* left mark after wrapped data */
const char *wrap_right; /* right mark for wrapped data */
bool wrap_right_border; /* use right-hand border for wrap
* marks when border=0? */
} printTextFormat;
typedef enum unicode_linestyle
{
UNICODE_LINESTYLE_SINGLE = 0,
UNICODE_LINESTYLE_DOUBLE
} unicode_linestyle;
struct separator
{
char *separator;
bool separator_zero;
};
typedef struct printTableOpt
{
enum printFormat format; /* see enum above */
unsigned short int expanded;/* expanded/vertical output (if supported by
* output format); 0=no, 1=yes, 2=auto */
unsigned short int border; /* Print a border around the table. 0=none,
* 1=dividing lines, 2=full */
unsigned short int pager; /* use pager for output (if to stdout and
* stdout is a tty) 0=off 1=on 2=always */
int pager_min_lines;/* don't use pager unless there are at least
* this many lines */
bool tuples_only; /* don't output headers, row counts, etc. */
bool start_table; /* print start decoration, eg <table> */
bool stop_table; /* print stop decoration, eg </table> */
bool default_footer; /* allow "(xx rows)" default footer */
unsigned long prior_records; /* start offset for record counters */
const printTextFormat *line_style; /* line style (NULL for default) */
struct separator fieldSep; /* field separator for unaligned text mode */
struct separator recordSep; /* record separator for unaligned text mode */
bool numericLocale; /* locale-aware numeric units separator and
* decimal marker */
char *tableAttr; /* attributes for HTML <table ...> */
int encoding; /* character encoding */
int env_columns; /* $COLUMNS on psql start, 0 is unset */
int columns; /* target width for wrapped format */
unicode_linestyle unicode_border_linestyle;
unicode_linestyle unicode_column_linestyle;
unicode_linestyle unicode_header_linestyle;
} printTableOpt;
/*
* Table footers are implemented as a singly-linked list.
*
* This is so that you don't need to know the number of footers in order to
* initialise the printTableContent struct, which is very convenient when
* preparing complex footers (as in describeOneTableDetails).
*/
typedef struct printTableFooter
{
char *data;
struct printTableFooter *next;
} printTableFooter;
/*
* The table content struct holds all the information which will be displayed
* by printTable().
*/
typedef struct printTableContent
{
const printTableOpt *opt;
const char *title; /* May be NULL */
int ncolumns; /* Specified in Init() */
int nrows; /* Specified in Init() */
const char **headers; /* NULL-terminated array of header strings */
const char **header; /* Pointer to the last added header */
const char **cells; /* NULL-terminated array of cell content
* strings */
const char **cell; /* Pointer to the last added cell */
long cellsadded; /* Number of cells added this far */
bool *cellmustfree; /* true for cells that need to be free()d */
printTableFooter *footers; /* Pointer to the first footer */
printTableFooter *footer; /* Pointer to the last added footer */
char *aligns; /* Array of alignment specifiers; 'l' or 'r',
* one per column */
char *align; /* Pointer to the last added alignment */
} printTableContent;
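/*
 * Illustrative call sequence for a hypothetical 2x1 table (error
 * handling omitted):
 *
 *     printTableContent cont;
 *     printTableInit(&cont, &pset.popt.topt, "title", 2, 1);
 *     printTableAddHeader(&cont, "a", false, 'l');
 *     printTableAddHeader(&cont, "b", false, 'r');
 *     printTableAddCell(&cont, "1", false, false);
 *     printTableAddCell(&cont, "2", false, false);
 *     printTable(&cont, stdout, false, NULL);
 *     printTableCleanup(&cont);
 */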
typedef struct printQueryOpt
{
printTableOpt topt; /* the options above */
char *nullPrint; /* how to print null entities */
bool quote; /* quote all values as much as possible */
char *title; /* override title */
char **footers; /* override footer (default is "(xx rows)") */
bool translate_header; /* do gettext on column headers */
const bool *translate_columns; /* translate_columns[i-1] => do
* gettext on col i */
int n_translate_columns; /* length of translate_columns[] */
} printQueryOpt;
extern const printTextFormat pg_asciiformat;
extern const printTextFormat pg_asciiformat_old;
extern const printTextFormat pg_utf8format;
extern void disable_sigpipe_trap(void);
extern void restore_sigpipe_trap(void);
extern void set_sigpipe_trap_state(bool ignore);
extern FILE *PageOutput(int lines, const printTableOpt *topt);
extern void ClosePager(FILE *pagerpipe);
extern void html_escaped_print(const char *in, FILE *fout);
extern void printTableInit(printTableContent *const content,
const printTableOpt *opt, const char *title,
const int ncolumns, const int nrows);
extern void printTableAddHeader(printTableContent *const content,
char *header, const bool translate, const char align);
extern void printTableAddCell(printTableContent *const content,
char *cell, const bool translate, const bool mustfree);
extern void printTableAddFooter(printTableContent *const content,
const char *footer);
extern void printTableSetFooter(printTableContent *const content,
const char *footer);
extern void printTableCleanup(printTableContent *const content);
extern void printTable(const printTableContent *cont,
FILE *fout, bool is_pager, FILE *flog);
extern void printQuery(const PGresult *result, const printQueryOpt *opt,
FILE *fout, bool is_pager, FILE *flog);
extern void setDecimalLocale(void);
extern const printTextFormat *get_line_style(const printTableOpt *opt);
extern void refresh_utf8format(const printTableOpt *opt);
#ifndef __CYGWIN__
#define DEFAULT_PAGER "more"
#else
#define DEFAULT_PAGER "less"
#endif
#endif /* PRINT_H */

325
src/bin/csql/prompt.c Normal file

@ -0,0 +1,325 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/prompt.c
*/
#include "postgres_fe.h"
#ifdef WIN32
#include <io.h>
#include <win32.h>
#endif
#ifdef HAVE_UNIX_SOCKETS
#include <unistd.h>
#include <netdb.h>
#endif
#include "common.h"
#include "input.h"
#include "prompt.h"
#include "settings.h"
/*--------------------------
* get_prompt
*
* Returns a statically allocated prompt made by interpolating certain
* tcsh style escape sequences into pset.vars "PROMPT1|2|3".
* (might not be completely multibyte safe)
*
* Defined interpolations are:
* %M - database server "hostname.domainname", "[local]" for AF_UNIX
* sockets, "[local:/dir/name]" if not default
* %m - like %M, but hostname only (before first dot), or always "[local]"
* %> - database server port number
* %n - database user name
* %/ - current database
* %~ - like %/ but "~" when database name equals user name
* %# - "#" if superuser, ">" otherwise
* %R - in prompt1 normally =, or ^ if single line mode,
* or a ! if session is not connected to a database;
* in prompt2 one of -, *, ', ", $, or (;
* in prompt3 nothing
* %x - transaction status: empty, *, !, ? (unknown or no connection)
* %l - The line number inside the current statement, starting from 1.
* %? - the error code of the last query (not yet implemented)
* %% - a percent sign
*
* %[0-9] - the character with the given decimal code
* %0[0-7] - the character with the given octal code
* %0x[0-9A-Fa-f] - the character with the given hexadecimal code
*
* %`command` - The result of executing command in /bin/sh with trailing
* newline stripped.
* %:name: - The value of the psql variable 'name'
* (those will not be rescanned for more escape sequences!)
*
* %[ ... %] - tell readline that the contained text is invisible
*
* If the application-wide prompts become NULL somehow, the returned string
* will be empty (not NULL!).
*--------------------------
*/
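/*
 * Illustrative: with PROMPT1 set to '%n@%m:%>%x%# ', a superuser named
 * alice connected to host localhost, port 5432, inside an open
 * transaction would see the prompt "alice@localhost:5432*# ".
 */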
char *
get_prompt(promptStatus_t status)
{
#define MAX_PROMPT_SIZE 256
static char destination[MAX_PROMPT_SIZE + 1];
char buf[MAX_PROMPT_SIZE + 1];
bool esc = false;
const char *p;
const char *prompt_string = "? ";
switch (status)
{
case PROMPT_READY:
prompt_string = pset.prompt1;
break;
case PROMPT_CONTINUE:
case PROMPT_SINGLEQUOTE:
case PROMPT_DOUBLEQUOTE:
case PROMPT_DOLLARQUOTE:
case PROMPT_COMMENT:
case PROMPT_PAREN:
prompt_string = pset.prompt2;
break;
case PROMPT_COPY:
prompt_string = pset.prompt3;
break;
}
destination[0] = '\0';
for (p = prompt_string;
*p && strlen(destination) < sizeof(destination) - 1;
p++)
{
memset(buf, 0, sizeof(buf));
if (esc)
{
switch (*p)
{
/* Current database */
case '/':
if (pset.db)
strlcpy(buf, PQdb(pset.db), sizeof(buf));
break;
case '~':
if (pset.db)
{
const char *var;
if (strcmp(PQdb(pset.db), PQuser(pset.db)) == 0 ||
((var = getenv("PGDATABASE")) && strcmp(var, PQdb(pset.db)) == 0))
strlcpy(buf, "~", sizeof(buf));
else
strlcpy(buf, PQdb(pset.db), sizeof(buf));
}
break;
/* DB server hostname (long/short) */
case 'M':
case 'm':
if (pset.db)
{
const char *host = PQhost(pset.db);
/* INET socket */
if (host && host[0] && !is_absolute_path(host))
{
strlcpy(buf, host, sizeof(buf));
if (*p == 'm')
buf[strcspn(buf, ".")] = '\0';
}
#ifdef HAVE_UNIX_SOCKETS
/* UNIX socket */
else
{
if (!host
|| strcmp(host, DEFAULT_PGSOCKET_DIR) == 0
|| *p == 'm')
strlcpy(buf, "[local]", sizeof(buf));
else
snprintf(buf, sizeof(buf), "[local:%s]", host);
}
#endif
}
break;
/* DB server port number */
case '>':
if (pset.db && PQport(pset.db))
strlcpy(buf, PQport(pset.db), sizeof(buf));
break;
/* DB server user name */
case 'n':
if (pset.db)
strlcpy(buf, session_username(), sizeof(buf));
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
*buf = (char) strtol(p, (char **) &p, 8);
--p;
break;
case 'R':
switch (status)
{
case PROMPT_READY:
if (!pset.db)
buf[0] = '!';
else if (!pset.singleline)
buf[0] = '=';
else
buf[0] = '^';
break;
case PROMPT_CONTINUE:
buf[0] = '-';
break;
case PROMPT_SINGLEQUOTE:
buf[0] = '\'';
break;
case PROMPT_DOUBLEQUOTE:
buf[0] = '"';
break;
case PROMPT_DOLLARQUOTE:
buf[0] = '$';
break;
case PROMPT_COMMENT:
buf[0] = '*';
break;
case PROMPT_PAREN:
buf[0] = '(';
break;
default:
buf[0] = '\0';
break;
}
break;
case 'x':
if (!pset.db)
buf[0] = '?';
else
switch (PQtransactionStatus(pset.db))
{
case PQTRANS_IDLE:
buf[0] = '\0';
break;
case PQTRANS_ACTIVE:
case PQTRANS_INTRANS:
buf[0] = '*';
break;
case PQTRANS_INERROR:
buf[0] = '!';
break;
default:
buf[0] = '?';
break;
}
break;
case 'l':
snprintf(buf, sizeof(buf), UINT64_FORMAT, pset.stmt_lineno);
break;
case '?':
/* not here yet */
break;
case '#':
if (is_superuser())
buf[0] = '#';
else
buf[0] = '>';
break;
/* execute command */
case '`':
{
FILE *fd;
char *file = pg_strdup(p + 1);
int cmdend;
cmdend = strcspn(file, "`");
file[cmdend] = '\0';
fd = popen(file, "r");
if (fd)
{
if (fgets(buf, sizeof(buf), fd) == NULL)
buf[0] = '\0';
pclose(fd);
}
if (strlen(buf) > 0 && buf[strlen(buf) - 1] == '\n')
buf[strlen(buf) - 1] = '\0';
free(file);
p += cmdend + 1;
break;
}
/* interpolate variable */
case ':':
{
char *name;
const char *val;
int nameend;
name = pg_strdup(p + 1);
nameend = strcspn(name, ":");
name[nameend] = '\0';
val = GetVariable(pset.vars, name);
if (val)
strlcpy(buf, val, sizeof(buf));
free(name);
p += nameend + 1;
break;
}
case '[':
case ']':
#if defined(USE_READLINE) && defined(RL_PROMPT_START_IGNORE)
/*
* readline >=4.0 undocumented feature: non-printing
* characters in prompt strings must be marked as such, in
* order to properly display the line during editing.
*/
buf[0] = (*p == '[') ? RL_PROMPT_START_IGNORE : RL_PROMPT_END_IGNORE;
buf[1] = '\0';
#endif /* USE_READLINE */
break;
default:
buf[0] = *p;
buf[1] = '\0';
break;
}
esc = false;
}
else if (*p == '%')
esc = true;
else
{
buf[0] = *p;
buf[1] = '\0';
esc = false;
}
if (!esc)
strlcat(destination, buf, sizeof(destination));
}
return destination;
}

25
src/bin/csql/prompt.h Normal file

@ -0,0 +1,25 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/prompt.h
*/
#ifndef PROMPT_H
#define PROMPT_H
typedef enum _promptStatus
{
PROMPT_READY,
PROMPT_CONTINUE,
PROMPT_COMMENT,
PROMPT_SINGLEQUOTE,
PROMPT_DOUBLEQUOTE,
PROMPT_DOLLARQUOTE,
PROMPT_PAREN,
PROMPT_COPY
} promptStatus_t;
char *get_prompt(promptStatus_t status);
#endif /* PROMPT_H */

8
src/bin/csql/psqlrc.sample Normal file

@ -0,0 +1,8 @@
--
-- system-wide psql configuration file
--
-- This file is read before the .psqlrc file in the user's home directory.
--
-- Copy this to your installation's sysconf directory and rename it psqlrc.
-- The sysconf directory can be identified via "pg_config --sysconfdir".
--

64
src/bin/csql/psqlscan.h Normal file

@ -0,0 +1,64 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/psqlscan.h
*/
#ifndef PSQLSCAN_H
#define PSQLSCAN_H
#include "pqexpbuffer.h"
#include "prompt.h"
/* Abstract type for lexer's internal state */
typedef struct PsqlScanStateData *PsqlScanState;
/* Termination states for psql_scan() */
typedef enum
{
PSCAN_SEMICOLON, /* found command-ending semicolon */
PSCAN_BACKSLASH, /* found backslash command */
PSCAN_INCOMPLETE, /* end of line, SQL statement incomplete */
PSCAN_EOL /* end of line, SQL possibly complete */
} PsqlScanResult;
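/*
 * Illustrative scan loop (heavily simplified from mainloop.c; real
 * callers also act on PSCAN_SEMICOLON and PSCAN_BACKSLASH results):
 *
 *     psql_scan_setup(state, line, strlen(line));
 *     do
 *         result = psql_scan(state, query_buf, &prompt);
 *     while (result != PSCAN_INCOMPLETE && result != PSCAN_EOL);
 *     psql_scan_finish(state);
 */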
/* Different ways for scan_slash_option to handle parameter words */
enum slash_option_type
{
OT_NORMAL, /* normal case */
OT_SQLID, /* treat as SQL identifier */
OT_SQLIDHACK, /* SQL identifier, but don't downcase */
OT_FILEPIPE, /* it's a filename or pipe */
OT_WHOLE_LINE, /* just snarf the rest of the line */
OT_NO_EVAL /* no expansion of backticks or variables */
};
extern PsqlScanState psql_scan_create(void);
extern void psql_scan_destroy(PsqlScanState state);
extern void psql_scan_setup(PsqlScanState state,
const char *line, int line_len);
extern void psql_scan_finish(PsqlScanState state);
extern PsqlScanResult psql_scan(PsqlScanState state,
PQExpBuffer query_buf,
promptStatus_t *prompt);
extern void psql_scan_reset(PsqlScanState state);
extern bool psql_scan_in_quote(PsqlScanState state);
extern char *psql_scan_slash_command(PsqlScanState state);
extern char *psql_scan_slash_option(PsqlScanState state,
enum slash_option_type type,
char *quote,
bool semicolon);
extern void psql_scan_slash_command_end(PsqlScanState state);
#endif /* PSQLSCAN_H */

1988
src/bin/csql/psqlscan.l Normal file

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff