Initial commit of Citus 5.0

pull/328/head
Onder Kalaci 2016-02-11 04:05:32 +02:00
commit 136306a1fe
357 changed files with 137231 additions and 0 deletions

22
.gitattributes vendored Normal file

@@ -0,0 +1,22 @@
* whitespace=space-before-tab,trailing-space
*.[chly] whitespace=space-before-tab,trailing-space,indent-with-non-tab,tabwidth=4
*.dsl whitespace=space-before-tab,trailing-space,tab-in-indent
*.patch -whitespace
*.pl whitespace=space-before-tab,trailing-space,tabwidth=4
*.po whitespace=space-before-tab,trailing-space,tab-in-indent,-blank-at-eof
*.sgml whitespace=space-before-tab,trailing-space,tab-in-indent,-blank-at-eol
*.x[ms]l whitespace=space-before-tab,trailing-space,tab-in-indent
# Avoid confusing ASCII underlines with leftover merge conflict markers
README conflict-marker-size=32
README.* conflict-marker-size=32
# Certain data files that contain special whitespace, and other special cases
*.data -whitespace
# Test output files that contain extra whitespace
*.out -whitespace
src/test/regress/output/*.source -whitespace
# These files are maintained or generated elsewhere. We take them as is.
configure -whitespace

38
.gitignore vendored Normal file

@@ -0,0 +1,38 @@
# Global excludes across all subdirectories
*.o
*.so
*.so.[0-9]
*.so.[0-9].[0-9]
*.sl
*.sl.[0-9]
*.sl.[0-9].[0-9]
*.dylib
*.dll
*.a
*.mo
*.pot
objfiles.txt
.deps/
*.gcno
*.gcda
*.gcov
*.gcov.out
lcov.info
coverage/
*.vcproj
*.vcxproj
win32ver.rc
*.exe
lib*dll.def
lib*.pc
# Local excludes in root directory
/config.log
/config.status
/pgsql.sln
/pgsql.sln.cache
/Debug/
/Release/
/autom4te.cache
/Makefile.global
/src/Makefile.custom

15
.travis.yml Normal file

@@ -0,0 +1,15 @@
sudo: required
dist: trusty
language: c
cache: apt
env:
matrix:
- PGVERSION=9.5
- PGVERSION=9.4
before_install:
- git clone --depth 1 https://github.com/citusdata/tools.git
- tools/travis/setup_apt.sh
- tools/travis/nuke_pg.sh
install:
- tools/travis/install_pg.sh
script: tools/travis/pg_travis_multi_test.sh

49
Makefile Normal file

@@ -0,0 +1,49 @@
# CitusDB toplevel Makefile
citusdb_subdir = .
citusdb_top_builddir = .
# Hint that configure should be run first
ifeq (,$(wildcard Makefile.global))
$(error ./configure needs to be run before compiling CitusDB)
endif
include Makefile.global
all: extension csql
# build extension
extension:
$(MAKE) -C src/backend/distributed/ all
install-extension:
$(MAKE) -C src/backend/distributed/ install
install-headers:
$(MKDIR_P) '$(includedir_server)/distributed/'
# generated headers are located in the build directory
$(INSTALL_DATA) src/include/citusdb_config.h '$(includedir_server)/'
# the rest in the source tree
$(INSTALL_DATA) $(citusdb_abs_srcdir)/src/include/distributed/*.h '$(includedir_server)/distributed/'
clean-extension:
$(MAKE) -C src/backend/distributed/ clean
.PHONY: extension install-extension clean-extension
# Add to generic targets
install: install-extension install-headers
clean: clean-extension
# build csql binary
csql:
$(MAKE) -C src/bin/csql/ all
install-csql:
$(MAKE) -C src/bin/csql/ install
clean-csql:
$(MAKE) -C src/bin/csql/ clean
.PHONY: csql install-csql clean-csql
# Add to generic targets
install: install-csql
clean: clean-csql
# depend on install for now
check: all install
$(MAKE) -C src/test/regress check-full
.PHONY: all check install clean

61
Makefile.global.in Normal file

@@ -0,0 +1,61 @@
# -*-makefile-*-
# @configure_input@
# Makefile.global.in - Makefile to be included by all submakes
#
# This file is converted by configure into an actual Makefile,
# replacing the @varname@ placeholders by actual values.
#
# This file is intended to contain infrastructure needed by several
# makefiles, particularly central handling of compilation flags and
# rules.
citusdb_abs_srcdir:=@abs_top_srcdir@/${citusdb_subdir}
citusdb_abs_top_srcdir:=@abs_top_srcdir@
PG_CONFIG:=@PG_CONFIG@
PGXS:=$(shell $(PG_CONFIG) --pgxs)
# Support for VPATH builds (i.e. builds from outside the source tree)
vpath_build=@vpath_build@
ifeq ($(vpath_build),yes)
VPATH:=$(citusdb_abs_srcdir)
USE_VPATH:=$(VPATH)
endif
# CitusDB is built using PostgreSQL's pgxs
USE_PGXS=1
include $(PGXS)
# Remake Makefile.global from Makefile.global.in if the latter
# changed. In order to trigger this rule, the including file must
# write `include $(citusdb_top_builddir)/Makefile.global', not some
# shortcut thereof. This makes it less likely to accidentally run
# with some outdated Makefile.global.
# Make internally restarts whenever included Makefiles are
# regenerated.
$(citusdb_top_builddir)/Makefile.global: $(citusdb_top_builddir)/Makefile.global.in @top_srcdir@/configure $(citusdb_top_builddir)/config.status
cd @abs_top_builddir@ && ./config.status Makefile.global
# Ensure configuration is generated by the most recent configure,
# useful for longer existing build directories.
$(citusdb_top_builddir)/config.status: @top_srcdir@/configure
cd @abs_top_builddir@ && ./config.status --recheck
# Regenerate configure if configure.in changed
@top_srcdir@/configure: $(citusdb_abs_srcdir)/configure.in
cd ${citusdb_abs_srcdir} && ./autogen.sh
# If specified via configure, replace the default compiler. Normally
# we'll build with the one postgres was built with. But it's useful to
# be able to use a different one, especially when building against
# distribution packages.
ifneq (@CC@,)
override CC=@CC@
endif
# Add options passed to configure or computed therein, to CFLAGS/CPPFLAGS/...
override CFLAGS += @CFLAGS@ @CITUS_CFLAGS@
override CPPFLAGS := @CPPFLAGS@ -I '${citusdb_abs_top_srcdir}/src/include' $(CPPFLAGS)
override LDFLAGS += @LDFLAGS@
# optional file with user defined, additional, rules
-include ${citusdb_abs_srcdir}/src/Makefile.custom

7
autogen.sh Executable file

@@ -0,0 +1,7 @@
#!/bin/bash
#
# autogen.sh converts configure.in to configure and creates
# citusdb_config.h.in. The resulting files are checked into
# the SCM, to avoid everyone needing autoconf installed.
autoreconf -f

4170
configure vendored Executable file

File diff suppressed because it is too large

109
configure.in Normal file

@@ -0,0 +1,109 @@
# CitusDB autoconf input script.
#
# Converted into an actual configure script by autogen.sh. This
# conversion only has to be done when configure.in changes. To avoid
# everyone needing autoconf installed, the resulting files are checked
# into the SCM.
AC_INIT([CitusDB], [5.0], [], [citusdb], [])
AC_COPYRIGHT([Copyright (c) 2012-2015, Citus Data, Inc.])
AC_PROG_SED
# Locate pg_config binary
AC_ARG_VAR([PG_CONFIG], [Location to find pg_config for target PostgreSQL installation (default PATH)])
AC_ARG_VAR([PATH], [PATH for target PostgreSQL install pg_config])
if test -z "$PG_CONFIG"; then
AC_PATH_PROG(PG_CONFIG, pg_config)
fi
if test -z "$PG_CONFIG"; then
AC_MSG_ERROR([Could not find pg_config. Set PG_CONFIG or PATH.])
fi
# check we're building against a supported version of PostgreSQL
citusac_pg_config_version=$($PG_CONFIG --version 2>/dev/null)
version_num=$(echo "$citusac_pg_config_version"|
$SED -e 's/^PostgreSQL \([[0-9]]*\)\.\([[0-9]]*\)\([[a-zA-Z0-9.]]*\)$/\1.\2/')
if test -z "$version_num"; then
AC_MSG_ERROR([Could not detect PostgreSQL version from pg_config.])
fi
if test "$version_num" != '9.4' -a "$version_num" != '9.5'; then
AC_MSG_ERROR([CitusDB is not compatible with the detected PostgreSQL version ${version_num}.])
else
AC_MSG_NOTICE([building against PostgreSQL $version_num])
fi;
# Check whether we're building inside the source tree, if not, prepare
# the build directory.
if test "$srcdir" -ef '.' ; then
vpath_build=no
else
vpath_build=yes
_AS_ECHO_N([preparing build tree... ])
citusac_abs_top_srcdir=`cd "$srcdir" && pwd`
$SHELL "$citusac_abs_top_srcdir/prep_buildtree" "$citusac_abs_top_srcdir" "." \
|| AC_MSG_ERROR(failed)
AC_MSG_RESULT(done)
fi
AC_SUBST(vpath_build)
# Allow to overwrite the C compiler, default to the one postgres was
# compiled with
AC_PROG_CC([$($PG_CONFIG --cc)])
# check for a number of CFLAGS that make development easier
# CITUSAC_PROG_CC_CFLAGS_OPT
# -----------------------
# Given a string, check if the compiler supports the string as a
# command-line option. If it does, add the string to CFLAGS.
AC_DEFUN([CITUSAC_PROG_CC_CFLAGS_OPT],
[define([Ac_cachevar], [AS_TR_SH([citusac_cv_prog_cc_cflags_$1])])dnl
AC_CACHE_CHECK([whether $CC supports $1], [Ac_cachevar],
[citusac_save_CFLAGS=$CFLAGS
CFLAGS="$citusac_save_CFLAGS $1"
ac_save_c_werror_flag=$ac_c_werror_flag
ac_c_werror_flag=yes
_AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
[Ac_cachevar=yes],
[Ac_cachevar=no])
ac_c_werror_flag=$ac_save_c_werror_flag
CFLAGS="$citusac_save_CFLAGS"])
if test x"$Ac_cachevar" = x"yes"; then
CITUS_CFLAGS="$CITUS_CFLAGS $1"
fi
undefine([Ac_cachevar])dnl
])# CITUSAC_PROG_CC_CFLAGS_OPT
CITUSAC_PROG_CC_CFLAGS_OPT([-Wall])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wextra])
# disarm options included in the above, which are too noisy for now
CITUSAC_PROG_CC_CFLAGS_OPT([-Wno-unused-parameter])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wno-sign-compare])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wno-missing-field-initializers])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wno-clobbered])
# And add a few extra warnings
CITUSAC_PROG_CC_CFLAGS_OPT([-Wdeclaration-after-statement])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wendif-labels])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wmissing-format-attribute])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wmissing-declarations])
CITUSAC_PROG_CC_CFLAGS_OPT([-Wmissing-prototypes])
AC_SUBST(CITUS_CFLAGS, "$CITUS_CFLAGS")
AC_CONFIG_FILES([Makefile.global])
AC_CONFIG_HEADERS([src/include/citusdb_config.h])
AH_TOP([
/*
* citusdb_config.h.in is generated by autoconf/autoheader and
* converted into citusdb_config.h by configure. Include when code needs to
* depend on determinations made by configure.
*
* Do not manually edit!
*/
])
AC_OUTPUT

47
prep_buildtree Normal file

@@ -0,0 +1,47 @@
#! /bin/sh
#
# CitusDB copy of PostgreSQL's config/prep_buildtree
#
# This script prepares a CitusDB build tree for an out-of-tree/VPATH
# build. It is intended to be run by the configure script.
me=`basename $0`
help="\
Usage: $me sourcetree [buildtree]"
if test -z "$1"; then
echo "$help" 1>&2
exit 1
elif test x"$1" = x"--help"; then
echo "$help"
exit 0
fi
unset CDPATH
sourcetree=`cd $1 && pwd`
buildtree=`cd ${2:-'.'} && pwd`
# We must not auto-create the subdirectories holding built documentation.
# If we did, it would interfere with installation of prebuilt docs from
# the source tree, if a VPATH build is done from a distribution tarball.
# See bug #5595.
for item in `find "$sourcetree" -type d \( \( -name CVS -prune \) -o \( -name .git -prune \) -o -print \) | grep -v "$sourcetree/doc/src/sgml/\+"`; do
subdir=`expr "$item" : "$sourcetree\(.*\)"`
if test ! -d "$buildtree/$subdir"; then
mkdir -p "$buildtree/$subdir" || exit 1
fi
done
for item in `find "$sourcetree" -not -path '*/.git/hg/*' \( -name Makefile -print -o -name GNUmakefile -print \)`; do
filename=`expr "$item" : "$sourcetree\(.*\)"`
if test ! -f "${item}.in"; then
if cmp "$item" "$buildtree/$filename" >/dev/null 2>&1; then : ; else
ln -fs "$item" "$buildtree/$filename" || exit 1
fi
fi
done
exit 0

0
src/backend/.gitignore vendored Normal file

13
src/backend/distributed/.gitignore vendored Normal file

@@ -0,0 +1,13 @@
# ====================
# = Project-Specific =
# ====================
# regression test detritus
/log/
/regression.diffs
/regression.out
/results/
/tmp_check*
# ignore latest install file
citusdb--5.0.sql

33
src/backend/distributed/Makefile Normal file

@@ -0,0 +1,33 @@
# Makefile for the CitusDB extension
citusdb_subdir = src/backend/distributed
citusdb_top_builddir = ../../..
MODULE_big = citusdb
EXTENSION = citusdb
EXTVERSION = 5.0
DATA_built = $(EXTENSION)--$(EXTVERSION).sql
SCRIPTS = $(wildcard $(citusdb_top_builddir)/src/bin/scripts/*)
# directories with source files
SUBDIRS = . commands executor master planner relay test utils worker
# That patsubst rule searches all directories listed in SUBDIRS for .c
# files, and adds the corresponding .o files to OBJS
OBJS += \
$(patsubst $(citusdb_abs_srcdir)/%.c,%.o,$(foreach dir,$(SUBDIRS), $(wildcard $(citusdb_abs_srcdir)/$(dir)/*.c)))
# define build process for latest install file
$(EXTENSION)--$(EXTVERSION).sql: $(EXTENSION).sql
cat $^ > $@
# be explicit about the default target
all:
NO_PGXS = 1
SHLIB_LINK = $(libpq)
include $(citusdb_top_builddir)/Makefile.global
override CPPFLAGS += -I$(libpq_srcdir)

6
src/backend/distributed/citusdb.control Normal file

@@ -0,0 +1,6 @@
# CitusDB extension
comment = 'CitusDB distributed database'
default_version = '5.0'
module_pathname = '$libdir/citusdb'
relocatable = false
schema = pg_catalog

497
src/backend/distributed/citusdb.sql Normal file

@@ -0,0 +1,497 @@
/* citusdb.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION citusdb" to load this file. \quit
CREATE SCHEMA citusdb;
-- Ensure CREATE EXTENSION is not run against an old citusdb data
-- directory; we're not compatible (due to the builtin functions/tables)
DO $$
BEGIN
IF EXISTS(SELECT * FROM pg_proc WHERE proname = 'worker_apply_shard_ddl_command') THEN
RAISE 'cannot install citusdb extension in CitusDB 4 data directory';
END IF;
END;
$$;
/*****************************************************************************
* CitusDB data types
*****************************************************************************/
CREATE TYPE citusdb.distribution_type AS ENUM (
'hash',
'range',
'append'
);
/*****************************************************************************
* CitusDB tables & corresponding indexes
*****************************************************************************/
CREATE TABLE citusdb.pg_dist_partition(
logicalrelid Oid NOT NULL,
partmethod "char" NOT NULL,
partkey text NOT NULL
);
CREATE UNIQUE INDEX pg_dist_partition_logical_relid_index
ON citusdb.pg_dist_partition using btree(logicalrelid);
ALTER TABLE citusdb.pg_dist_partition SET SCHEMA pg_catalog;
CREATE TABLE citusdb.pg_dist_shard(
logicalrelid oid NOT NULL,
shardid int8 NOT NULL,
shardstorage "char" NOT NULL,
shardalias text,
shardminvalue text,
shardmaxvalue text
);
CREATE UNIQUE INDEX pg_dist_shard_shardid_index
ON citusdb.pg_dist_shard using btree(shardid);
CREATE INDEX pg_dist_shard_logical_relid_index
ON citusdb.pg_dist_shard using btree(logicalrelid);
ALTER TABLE citusdb.pg_dist_shard SET SCHEMA pg_catalog;
CREATE TABLE citusdb.pg_dist_shard_placement(
shardid int8 NOT NULL,
shardstate int4 NOT NULL,
shardlength int8 NOT NULL,
nodename text NOT NULL,
nodeport int8 NOT NULL
) WITH oids;
CREATE UNIQUE INDEX pg_dist_shard_placement_oid_index
ON citusdb.pg_dist_shard_placement using btree(oid);
CREATE INDEX pg_dist_shard_placement_shardid_index
ON citusdb.pg_dist_shard_placement using btree(shardid);
CREATE INDEX pg_dist_shard_placement_nodeid_index
ON citusdb.pg_dist_shard_placement using btree(nodename, nodeport);
ALTER TABLE citusdb.pg_dist_shard_placement SET SCHEMA pg_catalog;
/*****************************************************************************
* CitusDB sequences
*****************************************************************************/
/*
* Internal sequence to generate 64-bit shard ids. These identifiers are then
* used to identify shards in the distributed database.
*/
CREATE SEQUENCE citusdb.pg_dist_shardid_seq
MINVALUE 102008
NO CYCLE;
ALTER SEQUENCE citusdb.pg_dist_shardid_seq SET SCHEMA pg_catalog;
/*
* Internal sequence to generate 32-bit jobIds. These identifiers are then
* used to identify jobs in the distributed database; and they wrap at 32 bits
* to allow slave nodes to independently execute their distributed jobs.
*/
CREATE SEQUENCE citusdb.pg_dist_jobid_seq
MINVALUE 2 /* first jobId reserved for clean up jobs */
MAXVALUE 4294967296;
ALTER SEQUENCE citusdb.pg_dist_jobid_seq SET SCHEMA pg_catalog;
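/*
 * Usage sketch (illustrative only, assuming the extension is installed):
 * ids are drawn from these sequences with nextval, for example
 *
 *   SELECT nextval('pg_catalog.pg_dist_shardid_seq'); -- 102008, 102009, ...
 *   SELECT nextval('pg_catalog.pg_dist_jobid_seq');   -- 2, 3, ..., wraps at 2^32
 */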
/*****************************************************************************
* CitusDB functions
*****************************************************************************/
/* For backward compatibility and ease of use create functions et al. in pg_catalog */
SET search_path = 'pg_catalog';
/* master_* functions */
CREATE FUNCTION master_get_table_metadata(relation_name text, OUT logical_relid oid,
OUT part_storage_type "char",
OUT part_method "char", OUT part_key text,
OUT part_replica_count integer,
OUT part_max_size bigint,
OUT part_placement_policy integer)
RETURNS record
LANGUAGE C STABLE STRICT
AS 'MODULE_PATHNAME', $$master_get_table_metadata$$;
COMMENT ON FUNCTION master_get_table_metadata(relation_name text)
IS 'fetch metadata values for the table';
CREATE FUNCTION master_get_table_ddl_events(text)
RETURNS SETOF text
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$master_get_table_ddl_events$$;
COMMENT ON FUNCTION master_get_table_ddl_events(text)
IS 'fetch set of ddl statements for the table';
CREATE FUNCTION master_get_new_shardid()
RETURNS bigint
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_get_new_shardid$$;
COMMENT ON FUNCTION master_get_new_shardid()
IS 'fetch unique shardId';
CREATE FUNCTION master_get_local_first_candidate_nodes(OUT node_name text,
OUT node_port bigint)
RETURNS SETOF record
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$master_get_local_first_candidate_nodes$$;
COMMENT ON FUNCTION master_get_local_first_candidate_nodes()
IS 'fetch set of candidate nodes for shard uploading choosing the local node first';
CREATE FUNCTION master_create_empty_shard(text)
RETURNS bigint
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_create_empty_shard$$;
COMMENT ON FUNCTION master_create_empty_shard(text)
IS 'create an empty shard and shard placements for the table';
CREATE FUNCTION master_append_table_to_shard(bigint, text, text, integer)
RETURNS real
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_append_table_to_shard$$;
COMMENT ON FUNCTION master_append_table_to_shard(bigint, text, text, integer)
IS 'append given table to all shard placements and update metadata';
CREATE FUNCTION master_apply_delete_command(text)
RETURNS integer
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_apply_delete_command$$;
COMMENT ON FUNCTION master_apply_delete_command(text)
IS 'drop shards matching delete criteria and update metadata';
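/*
 * Usage sketch (illustrative only; the 'github_events' table and its
 * 'created_at' column are hypothetical): shards whose entire range matches
 * the delete criteria are dropped and their metadata removed, for example
 *
 *   SELECT master_apply_delete_command(
 *       'DELETE FROM github_events WHERE created_at < ''2015-01-01''');
 */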
CREATE FUNCTION master_get_active_worker_nodes(OUT node_name text, OUT node_port bigint)
RETURNS SETOF record
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$master_get_active_worker_nodes$$;
COMMENT ON FUNCTION master_get_active_worker_nodes()
IS 'fetch set of active worker nodes';
CREATE FUNCTION master_get_round_robin_candidate_nodes(shard_id bigint,
OUT node_name text,
OUT node_port bigint)
RETURNS SETOF record
LANGUAGE C STRICT ROWS 100
AS 'MODULE_PATHNAME', $$master_get_round_robin_candidate_nodes$$;
COMMENT ON FUNCTION master_get_round_robin_candidate_nodes(shard_id bigint)
IS 'fetch set of candidate nodes for shard uploading in round-robin manner';
CREATE FUNCTION master_create_distributed_table(table_name regclass,
distribution_column text,
distribution_method citusdb.distribution_type)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$master_create_distributed_table$$;
COMMENT ON FUNCTION master_create_distributed_table(table_name regclass,
distribution_column text,
distribution_method citusdb.distribution_type)
IS 'define the table distribution functions';
-- define shard creation function for hash-partitioned tables
CREATE FUNCTION master_create_worker_shards(table_name text, shard_count integer,
replication_factor integer DEFAULT 2)
RETURNS void
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
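/*
 * Usage sketch (illustrative only; 'github_events' and its 'repo_id' column
 * are hypothetical): distributing a table is a two-step process -- first
 * register the distribution metadata, then create the shards for the
 * hash-partitioned table.
 *
 *   SELECT master_create_distributed_table('github_events', 'repo_id', 'hash');
 *   SELECT master_create_worker_shards('github_events', 16, 2);
 */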
/* task_tracker_* functions */
CREATE FUNCTION task_tracker_assign_task(bigint, integer, text)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$task_tracker_assign_task$$;
COMMENT ON FUNCTION task_tracker_assign_task(bigint, integer, text)
IS 'assign a task to execute';
CREATE FUNCTION task_tracker_task_status(bigint, integer)
RETURNS integer
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$task_tracker_task_status$$;
COMMENT ON FUNCTION task_tracker_task_status(bigint, integer)
IS 'check an assigned task''s execution status';
CREATE FUNCTION task_tracker_cleanup_job(bigint)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$task_tracker_cleanup_job$$;
COMMENT ON FUNCTION task_tracker_cleanup_job(bigint)
IS 'clean up all tasks associated with a job';
/* worker_* functions */
CREATE FUNCTION worker_fetch_partition_file(bigint, integer, integer, integer, text,
integer)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_fetch_partition_file$$;
COMMENT ON FUNCTION worker_fetch_partition_file(bigint, integer, integer, integer, text,
integer)
IS 'fetch partition file from remote node';
CREATE FUNCTION worker_fetch_query_results_file(bigint, integer, integer, text, integer)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_fetch_query_results_file$$;
COMMENT ON FUNCTION worker_fetch_query_results_file(bigint, integer, integer, text,
integer)
IS 'fetch query results file from remote node';
CREATE FUNCTION worker_fetch_foreign_file(text, bigint, text[], integer[])
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_fetch_foreign_file$$;
COMMENT ON FUNCTION worker_fetch_foreign_file(text, bigint, text[], integer[])
IS 'fetch foreign file from remote node and apply file';
CREATE FUNCTION worker_fetch_regular_table(text, bigint, text[], integer[])
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_fetch_regular_table$$;
COMMENT ON FUNCTION worker_fetch_regular_table(text, bigint, text[], integer[])
IS 'fetch PostgreSQL table from remote node';
CREATE FUNCTION worker_range_partition_table(bigint, integer, text, text, oid, anyarray)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_range_partition_table$$;
COMMENT ON FUNCTION worker_range_partition_table(bigint, integer, text, text, oid,
anyarray)
IS 'range partition query results';
CREATE FUNCTION worker_hash_partition_table(bigint, integer, text, text, oid, integer)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_hash_partition_table$$;
COMMENT ON FUNCTION worker_hash_partition_table(bigint, integer, text, text, oid,
integer)
IS 'hash partition query results';
CREATE FUNCTION worker_merge_files_into_table(bigint, integer, text[], text[])
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_merge_files_into_table$$;
COMMENT ON FUNCTION worker_merge_files_into_table(bigint, integer, text[], text[])
IS 'merge files into a table';
CREATE FUNCTION worker_merge_files_and_run_query(bigint, integer, text, text)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_merge_files_and_run_query$$;
COMMENT ON FUNCTION worker_merge_files_and_run_query(bigint, integer, text, text)
IS 'merge files and run a reduce query on merged files';
CREATE FUNCTION worker_cleanup_job_schema_cache()
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_cleanup_job_schema_cache$$;
COMMENT ON FUNCTION worker_cleanup_job_schema_cache()
IS 'cleanup all job schemas in current database';
CREATE FUNCTION worker_foreign_file_path(text)
RETURNS text
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_foreign_file_path$$;
COMMENT ON FUNCTION worker_foreign_file_path(text)
IS 'get a foreign table''s local file path';
CREATE FUNCTION worker_find_block_local_path(bigint, text[])
RETURNS text
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_find_block_local_path$$;
COMMENT ON FUNCTION worker_find_block_local_path(bigint, text[])
IS 'find an HDFS block''s local file path';
CREATE FUNCTION worker_apply_shard_ddl_command(bigint, text)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_apply_shard_ddl_command$$;
COMMENT ON FUNCTION worker_apply_shard_ddl_command(bigint, text)
IS 'extend ddl command with shardId and apply on database';
CREATE FUNCTION worker_append_table_to_shard(text, text, text, integer)
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_append_table_to_shard$$;
COMMENT ON FUNCTION worker_append_table_to_shard(text, text, text, integer)
IS 'append a regular table''s contents to the shard';
/* trigger functions */
CREATE OR REPLACE FUNCTION citusdb_drop_trigger()
RETURNS event_trigger
LANGUAGE plpgsql
SET search_path = pg_catalog
AS $cdbdt$
DECLARE v_obj record;
BEGIN
FOR v_obj IN SELECT * FROM pg_event_trigger_dropped_objects() LOOP
IF v_obj.object_type <> 'table' THEN
CONTINUE;
END IF;
-- nothing to do if not a distributed table
IF NOT EXISTS(SELECT * FROM pg_dist_partition WHERE logicalrelid = v_obj.objid) THEN
CONTINUE;
END IF;
-- check if there are shards for the table, and error out if so
IF EXISTS(SELECT * FROM pg_dist_shard WHERE logicalrelid = v_obj.objid) THEN
RAISE EXCEPTION USING
MESSAGE = 'cannot drop distributed table with existing shards',
HINT = $$Delete shards first using: $$ ||
$$SELECT master_apply_delete_command('DELETE FROM $$ ||
v_obj.object_identity || $$')$$;
END IF;
-- delete partition entry
DELETE FROM pg_dist_partition WHERE logicalrelid = v_obj.objid;
IF NOT FOUND THEN
RAISE EXCEPTION 'could not find previously found pg_dist_partition entry';
END IF;
END LOOP;
END;
$cdbdt$;
COMMENT ON FUNCTION citusdb_drop_trigger()
IS 'perform checks and actions at the end of DROP actions';
CREATE FUNCTION master_dist_partition_cache_invalidate()
RETURNS trigger
LANGUAGE C
AS 'MODULE_PATHNAME', $$master_dist_partition_cache_invalidate$$;
COMMENT ON FUNCTION master_dist_partition_cache_invalidate()
IS 'register relcache invalidation for changed rows';
CREATE FUNCTION master_dist_shard_cache_invalidate()
RETURNS trigger
LANGUAGE C
AS 'MODULE_PATHNAME', $$master_dist_shard_cache_invalidate$$;
COMMENT ON FUNCTION master_dist_shard_cache_invalidate()
IS 'register relcache invalidation for changed rows';
/* internal functions, not user accessible */
CREATE FUNCTION citusdb_extradata_container(INTERNAL)
RETURNS void
LANGUAGE C
AS 'MODULE_PATHNAME', $$citusdb_extradata_container$$;
COMMENT ON FUNCTION pg_catalog.citusdb_extradata_container(INTERNAL)
IS 'placeholder function to store additional data in postgres node trees';
/*****************************************************************************
* CitusDB triggers
*****************************************************************************/
CREATE EVENT TRIGGER citusdb_cascade_to_partition
ON SQL_DROP
EXECUTE PROCEDURE citusdb_drop_trigger();
CREATE TRIGGER dist_partition_cache_invalidate
AFTER INSERT OR UPDATE OR DELETE
ON pg_catalog.pg_dist_partition
FOR EACH ROW EXECUTE PROCEDURE master_dist_partition_cache_invalidate();
CREATE TRIGGER dist_shard_cache_invalidate
AFTER INSERT OR UPDATE OR DELETE
ON pg_catalog.pg_dist_shard
FOR EACH ROW EXECUTE PROCEDURE master_dist_shard_cache_invalidate();
/*****************************************************************************
* CitusDB aggregates
*****************************************************************************/
CREATE AGGREGATE array_cat_agg(anyarray) (SFUNC = array_cat, STYPE = anyarray);
COMMENT ON AGGREGATE array_cat_agg(anyarray)
IS 'concatenate input arrays into a single array';
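/*
 * For illustration (hypothetical data): unlike array_agg, array_cat_agg
 * flattens its array inputs into one array, for example
 *
 *   SELECT array_cat_agg(a) FROM (VALUES (ARRAY[1,2]), (ARRAY[3])) AS t(a);
 *   -- returns {1,2,3}
 */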
/*
* Creates a temporary table exactly like the specified target table along with
* a trigger to redirect any INSERTed rows from the proxy to the underlying
* table. Users may optionally provide a sequence which will be incremented
* after each row that has been successfully proxied (useful for counting rows
* processed). Returns the name of the proxy table that was created.
*/
CREATE FUNCTION create_insert_proxy_for_table(target_table regclass,
sequence regclass DEFAULT NULL)
RETURNS text
AS $create_insert_proxy_for_table$
DECLARE
temp_table_name text;
attr_names text[];
attr_list text;
param_list text;
using_list text;
insert_command text;
-- templates to create dynamic functions, tables, and triggers
func_tmpl CONSTANT text := $$CREATE FUNCTION pg_temp.copy_to_insert()
RETURNS trigger
AS $copy_to_insert$
BEGIN
EXECUTE %L USING %s;
PERFORM nextval(%L);
RETURN NULL;
END;
$copy_to_insert$ LANGUAGE plpgsql;$$;
table_tmpl CONSTANT text := $$CREATE TEMPORARY TABLE %I
(LIKE %s INCLUDING DEFAULTS)$$;
trigger_tmpl CONSTANT text := $$CREATE TRIGGER copy_to_insert
BEFORE INSERT ON %s FOR EACH ROW
EXECUTE PROCEDURE pg_temp.copy_to_insert()$$;
BEGIN
-- create name of temporary table using unqualified input table name
SELECT format('%s_insert_proxy', relname)
INTO STRICT temp_table_name
FROM pg_class
WHERE oid = target_table;
-- get list of all attributes in table, we'll need shortly
SELECT array_agg(attname)
INTO STRICT attr_names
FROM pg_attribute
WHERE attrelid = target_table AND
attnum > 0 AND
NOT attisdropped;
-- build fully specified column list and USING clause from attr. names
SELECT string_agg(quote_ident(attr_name), ','),
string_agg(format('NEW.%I', attr_name), ',')
INTO STRICT attr_list,
using_list
FROM unnest(attr_names) AS attr_name;
-- build ($1, $2, $3)-style VALUE list to bind parameters
SELECT string_agg('$' || param_num, ',')
INTO STRICT param_list
FROM generate_series(1, array_length(attr_names, 1)) AS param_num;
-- use the above lists to generate appropriate INSERT command
insert_command = format('INSERT INTO %s (%s) VALUES (%s)', target_table,
attr_list, param_list);
-- use the command to make one-off trigger targeting specified table
EXECUTE format(func_tmpl, insert_command, using_list, sequence);
-- create a temporary table exactly like the target table...
EXECUTE format(table_tmpl, temp_table_name, target_table);
-- ... and install the trigger on that temporary table
EXECUTE format(trigger_tmpl, quote_ident(temp_table_name)::regclass);
RETURN temp_table_name;
END;
$create_insert_proxy_for_table$ LANGUAGE plpgsql SET search_path = 'pg_catalog';
COMMENT ON FUNCTION create_insert_proxy_for_table(regclass, regclass)
IS 'create a proxy table that redirects INSERTed rows to a target table';
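/*
 * Usage sketch (illustrative only; 'github_events' is hypothetical): create
 * the proxy, then direct INSERT or COPY traffic at the returned temporary
 * table; its trigger forwards each row to the target table.
 *
 *   SELECT create_insert_proxy_for_table('github_events');
 *   -- returns the proxy's name, here 'github_events_insert_proxy'
 */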
-- define shard repair function
CREATE FUNCTION master_copy_shard_placement(shard_id bigint,
source_node_name text,
source_node_port integer,
target_node_name text,
target_node_port integer)
RETURNS void
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
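/*
 * Usage sketch (illustrative only; the shard id and node names are
 * hypothetical): repair a stale placement of shard 102008 by copying it
 * from a node with a healthy placement.
 *
 *   SELECT master_copy_shard_placement(102008, 'good-node', 5432,
 *                                      'bad-node', 5432);
 */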
RESET search_path;

377
create_distributed_relation.c Normal file

@@ -0,0 +1,377 @@
/*-------------------------------------------------------------------------
*
* create_distributed_relation.c
* Routines related to the creation of distributed relations.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/hash.h"
#include "access/heapam.h"
#include "access/htup.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "catalog/dependency.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/pg_am.h"
#include "catalog/pg_enum.h"
#include "catalog/pg_extension.h"
#include "catalog/pg_opclass.h"
#include "commands/defrem.h"
#include "commands/extension.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/pg_dist_partition.h"
#include "nodes/execnodes.h"
#include "nodes/nodeFuncs.h"
#include "nodes/pg_list.h"
#include "parser/parse_expr.h"
#include "parser/parse_node.h"
#include "parser/parse_relation.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
#include "utils/inval.h"
/* local function forward declarations */
static char LookupDistributionMethod(Oid distributionMethodOid);
static void RecordDistributedRelationDependencies(Oid distributedRelationId,
Node *distributionKey);
static Oid SupportFunctionForColumn(Var *partitionColumn, Oid accessMethodId,
int16 supportFunctionNumber);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_create_distributed_table);
/*
* master_create_distributed_table accepts a table, distribution column and
* method and performs the corresponding catalog changes.
*
* XXX: We should perform more checks here to see if this table is fit for
* partitioning. At a minimum, we should validate the following: (i) this node
* runs as the master node, (ii) table does not make use of the inheritance
* mechanism, (iii) table does not own columns that are sequences, (iv)
* table does not have collated columns, and (v) table does not have
* preexisting content.
*/
Datum
master_create_distributed_table(PG_FUNCTION_ARGS)
{
Oid distributedRelationId = PG_GETARG_OID(0);
text *distributionColumnText = PG_GETARG_TEXT_P(1);
Oid distributionMethodOid = PG_GETARG_OID(2);
Relation distributedRelation = NULL;
char *distributedRelationName = NULL;
char relationKind = '\0';
Relation pgDistPartition = NULL;
char distributionMethod = LookupDistributionMethod(distributionMethodOid);
char *distributionColumnName = text_to_cstring(distributionColumnText);
Node *distributionKey = NULL;
Var *distributionColumn = NULL;
char *distributionKeyString = NULL;
List *indexOidList = NIL;
ListCell *indexOidCell = NULL;
HeapTuple newTuple = NULL;
Datum newValues[Natts_pg_dist_partition];
bool newNulls[Natts_pg_dist_partition];
/*
* Lock target relation with an access exclusive lock - there's no way to
* make sense of this table until we've committed, and we don't want
* multiple backends manipulating this relation.
*/
distributedRelation = relation_open(distributedRelationId, AccessExclusiveLock);
distributedRelationName = RelationGetRelationName(distributedRelation);
/* open system catalog and insert new tuple */
pgDistPartition = heap_open(DistPartitionRelationId(), RowExclusiveLock);
/* check that the relation is not already distributed */
if (IsDistributedTable(distributedRelationId))
{
ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
errmsg("table \"%s\" is already distributed",
distributedRelationName)));
}
/* verify target relation is either regular or foreign table */
relationKind = distributedRelation->rd_rel->relkind;
if (relationKind != RELKIND_RELATION && relationKind != RELKIND_FOREIGN_TABLE)
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot distribute relation: %s",
distributedRelationName),
errdetail("Distributed relations must be regular or "
"foreign tables.")));
}
distributionKey = BuildDistributionKeyFromColumnName(distributedRelation,
distributionColumnName);
distributionKeyString = nodeToString(distributionKey);
/* the distribution key should always be a Var for now */
Assert(IsA(distributionKey, Var));
distributionColumn = (Var *) distributionKey;
/* check for support function needed by specified partition method */
if (distributionMethod == DISTRIBUTE_BY_HASH)
{
Oid hashSupportFunction = SupportFunctionForColumn(distributionColumn,
HASH_AM_OID, HASHPROC);
if (hashSupportFunction == InvalidOid)
{
ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION),
errmsg("could not identify a hash function for type %s",
format_type_be(distributionColumn->vartype)),
errdatatype(distributionColumn->vartype),
errdetail("Partition column types must have a hash function "
"defined to use hash partitioning.")));
}
}
else if (distributionMethod == DISTRIBUTE_BY_RANGE)
{
Oid btreeSupportFunction = SupportFunctionForColumn(distributionColumn,
BTREE_AM_OID, BTORDER_PROC);
if (btreeSupportFunction == InvalidOid)
{
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_FUNCTION),
errmsg("could not identify a comparison function for type %s",
format_type_be(distributionColumn->vartype)),
errdatatype(distributionColumn->vartype),
errdetail("Partition column types must have a comparison function "
"defined to use range partitioning.")));
}
}
/*
* Do not allow UNIQUE constraint and/or PRIMARY KEY on append partitioned tables,
* since currently there is no way of enforcing uniqueness for overlapping shards.
*
* Similarly, do not allow UNIQUE constraint and/or PRIMARY KEY if it does not
* include partition column. This check is important for two reasons. First,
* currently CitusDB does not enforce uniqueness constraint on multiple shards.
* Second, INSERT INTO .. ON CONFLICT (i.e., UPSERT) queries can be executed with no
* further check for constraints.
*/
indexOidList = RelationGetIndexList(distributedRelation);
foreach(indexOidCell, indexOidList)
{
Oid indexOid = lfirst_oid(indexOidCell);
Relation indexDesc = index_open(indexOid, RowExclusiveLock);
IndexInfo *indexInfo = NULL;
AttrNumber *attributeNumberArray = NULL;
bool hasDistributionColumn = false;
int attributeCount = 0;
int attributeIndex = 0;
/* extract index key information from the index's pg_index info */
indexInfo = BuildIndexInfo(indexDesc);
/* only check unique indexes */
if (indexInfo->ii_Unique == false)
{
index_close(indexDesc, NoLock);
continue;
}
/*
* CitusDB cannot enforce uniqueness constraints with overlapping shards. Thus,
* emit a warning for unique indexes on append partitioned tables.
*/
if (distributionMethod == DISTRIBUTE_BY_APPEND)
{
ereport(WARNING, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("table \"%s\" has a unique constraint",
distributedRelationName),
errdetail("Unique constraints and primary keys on "
"append-partitioned tables cannot be enforced."),
errhint("Consider using hash partitioning.")));
}
attributeCount = indexInfo->ii_NumIndexAttrs;
attributeNumberArray = indexInfo->ii_KeyAttrNumbers;
for (attributeIndex = 0; attributeIndex < attributeCount; attributeIndex++)
{
AttrNumber attributeNumber = attributeNumberArray[attributeIndex];
if (distributionColumn->varattno == attributeNumber)
{
hasDistributionColumn = true;
break;
}
}
if (!hasDistributionColumn)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot distribute relation: \"%s\"",
distributedRelationName),
errdetail("Distributed relations cannot have "
"UNIQUE constraints or PRIMARY KEYs that do not "
"include the partition column.")));
}
index_close(indexDesc, NoLock);
}
/* form new tuple for pg_dist_partition */
memset(newValues, 0, sizeof(newValues));
memset(newNulls, false, sizeof(newNulls));
newValues[Anum_pg_dist_partition_logicalrelid - 1] =
ObjectIdGetDatum(distributedRelationId);
newValues[Anum_pg_dist_partition_partmethod - 1] =
CharGetDatum(distributionMethod);
newValues[Anum_pg_dist_partition_partkey - 1] =
CStringGetTextDatum(distributionKeyString);
newTuple = heap_form_tuple(RelationGetDescr(pgDistPartition), newValues, newNulls);
/* finally insert tuple, build index entries & register cache invalidation */
simple_heap_insert(pgDistPartition, newTuple);
CatalogUpdateIndexes(pgDistPartition, newTuple);
CacheInvalidateRelcacheByRelid(distributedRelationId);
RecordDistributedRelationDependencies(distributedRelationId, distributionKey);
heap_close(pgDistPartition, NoLock);
relation_close(distributedRelation, NoLock);
PG_RETURN_VOID();
}
/*
* RecordDistributedRelationDependencies creates the dependency entries
* necessary for a distributed relation in addition to the preexisting ones
* for a normal relation.
*
* We create one dependency from the (now distributed) relation to the citusdb
* extension to prevent the extension from being dropped while distributed
* tables exist. Furthermore a dependency from pg_dist_partition's
* distribution clause to the underlying columns is created, but it's marked
* as being owned by the relation itself. That means the entire table can be
* dropped, but the column itself can't. Neither can the type of the
* distribution column be changed (c.f. ATExecAlterColumnType).
*/
static void
RecordDistributedRelationDependencies(Oid distributedRelationId, Node *distributionKey)
{
ObjectAddress relationAddr = { 0, 0, 0 };
ObjectAddress citusExtensionAddr = { 0, 0, 0 };
relationAddr.classId = RelationRelationId;
relationAddr.objectId = distributedRelationId;
relationAddr.objectSubId = 0;
citusExtensionAddr.classId = ExtensionRelationId;
citusExtensionAddr.objectId = get_extension_oid("citusdb", false);
citusExtensionAddr.objectSubId = 0;
/* dependency from table entry to extension */
recordDependencyOn(&relationAddr, &citusExtensionAddr, DEPENDENCY_NORMAL);
/* make sure the distribution key column/expression does not just go away */
recordDependencyOnSingleRelExpr(&relationAddr, distributionKey, distributedRelationId,
DEPENDENCY_NORMAL, DEPENDENCY_NORMAL);
}
/*
* LookupDistributionMethod maps the oids of citusdb.distribution_type enum
* values to pg_dist_partition.partmethod values.
*
* The passed in oid has to belong to a value of citusdb.distribution_type.
*/
static char
LookupDistributionMethod(Oid distributionMethodOid)
{
HeapTuple enumTuple = NULL;
Form_pg_enum enumForm = NULL;
char distributionMethod = 0;
const char *enumLabel = NULL;
enumTuple = SearchSysCache1(ENUMOID, ObjectIdGetDatum(distributionMethodOid));
if (!HeapTupleIsValid(enumTuple))
{
ereport(ERROR, (errmsg("invalid internal value for enum: %u",
distributionMethodOid)));
}
enumForm = (Form_pg_enum) GETSTRUCT(enumTuple);
enumLabel = NameStr(enumForm->enumlabel);
if (strncmp(enumLabel, "append", NAMEDATALEN) == 0)
{
distributionMethod = DISTRIBUTE_BY_APPEND;
}
else if (strncmp(enumLabel, "hash", NAMEDATALEN) == 0)
{
distributionMethod = DISTRIBUTE_BY_HASH;
}
else if (strncmp(enumLabel, "range", NAMEDATALEN) == 0)
{
distributionMethod = DISTRIBUTE_BY_RANGE;
}
else
{
ereport(ERROR, (errmsg("invalid label for enum: %s", enumLabel)));
}
ReleaseSysCache(enumTuple);
return distributionMethod;
}
/*
* SupportFunctionForColumn locates a support function given a column, an access method,
* and an id of a support function. This function returns InvalidOid if there is no
* support function for the operator class family of the column, but if the data type
* of the column has no default operator class whatsoever, this function errors out.
*/
static Oid
SupportFunctionForColumn(Var *partitionColumn, Oid accessMethodId,
int16 supportFunctionNumber)
{
Oid operatorFamilyId = InvalidOid;
Oid supportFunctionOid = InvalidOid;
Oid operatorClassInputType = InvalidOid;
Oid columnOid = partitionColumn->vartype;
Oid operatorClassId = GetDefaultOpClass(columnOid, accessMethodId);
/* currently only support using the default operator class */
if (operatorClassId == InvalidOid)
{
ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("data type %s has no default operator class for specified"
" partition method", format_type_be(columnOid)),
errdatatype(columnOid),
errdetail("Partition column types must have a default operator"
" class defined.")));
}
operatorFamilyId = get_opclass_family(operatorClassId);
operatorClassInputType = get_opclass_input_type(operatorClassId);
supportFunctionOid = get_opfamily_proc(operatorFamilyId, operatorClassInputType,
operatorClassInputType,
supportFunctionNumber);
return supportFunctionOid;
}

300
transmit.c Normal file

@@ -0,0 +1,300 @@
/*-------------------------------------------------------------------------
*
* transmit.c
* Routines for transmitting regular files between two nodes.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include "distributed/relay_utility.h"
#include "distributed/transmit.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "storage/fd.h"
/* Local functions forward declarations */
static File FileOpenForTransmit(const char *filename, int fileFlags, int fileMode);
static void SendCopyInStart(void);
static void SendCopyOutStart(void);
static void SendCopyDone(void);
static void SendCopyData(StringInfo fileBuffer);
static bool ReceiveCopyData(StringInfo copyData);
/*
* ReceiveRegularFile receives data from stdin using the standard copy
* protocol. The function then creates or truncates a file with the given
* filename, and appends received data to this file.
*/
void
ReceiveRegularFile(const char *filename)
{
StringInfo copyData = makeStringInfo();
bool copyDone = false;
File fileDesc = -1;
const int fileFlags = (O_APPEND | O_CREAT | O_RDWR | O_TRUNC | PG_BINARY);
const int fileMode = (S_IRUSR | S_IWUSR);
fileDesc = FileOpenForTransmit(filename, fileFlags, fileMode);
SendCopyInStart();
copyDone = ReceiveCopyData(copyData);
while (!copyDone)
{
/* if received data has contents, append to regular file */
if (copyData->len > 0)
{
int appended = FileWrite(fileDesc, copyData->data, copyData->len);
if (appended != copyData->len)
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not append to received file: %m")));
}
}
resetStringInfo(copyData);
copyDone = ReceiveCopyData(copyData);
}
FreeStringInfo(copyData);
FileClose(fileDesc);
}
/*
* SendRegularFile reads data from the given file, and sends these data to
* stdout using the standard copy protocol. After all file data are sent, the
* function ends the copy protocol and closes the file.
*/
void
SendRegularFile(const char *filename)
{
File fileDesc = -1;
StringInfo fileBuffer = NULL;
int readBytes = -1;
const uint32 fileBufferSize = 32768; /* 32 KB */
const int fileFlags = (O_RDONLY | PG_BINARY);
const int fileMode = 0;
/* we currently do not check if the caller has permissions for this file */
fileDesc = FileOpenForTransmit(filename, fileFlags, fileMode);
/*
* We read the file's contents into buffers of 32 KB. This buffer size is twice
* as large as Hadoop's default buffer size, and may later be configurable.
*/
fileBuffer = makeStringInfo();
enlargeStringInfo(fileBuffer, fileBufferSize);
SendCopyOutStart();
readBytes = FileRead(fileDesc, fileBuffer->data, fileBufferSize);
while (readBytes > 0)
{
fileBuffer->len = readBytes;
SendCopyData(fileBuffer);
resetStringInfo(fileBuffer);
readBytes = FileRead(fileDesc, fileBuffer->data, fileBufferSize);
}
SendCopyDone();
FreeStringInfo(fileBuffer);
FileClose(fileDesc);
}
/* Helper function that deallocates string info object. */
void
FreeStringInfo(StringInfo stringInfo)
{
resetStringInfo(stringInfo);
pfree(stringInfo->data);
pfree(stringInfo);
}
/*
* FileOpenForTransmit opens file with the given filename and flags. On success,
* the function returns the internal file handle for the opened file. On failure
* the function errors out.
*/
static File
FileOpenForTransmit(const char *filename, int fileFlags, int fileMode)
{
File fileDesc = -1;
int fileStated = -1;
struct stat fileStat;
fileStated = stat(filename, &fileStat);
if (fileStated >= 0)
{
if (S_ISDIR(fileStat.st_mode))
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("\"%s\" is a directory", filename)));
}
}
fileDesc = PathNameOpenFile((char *) filename, fileFlags, fileMode);
if (fileDesc < 0)
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", filename)));
}
return fileDesc;
}
/*
* SendCopyInStart sends the start copy in message to initiate receiving data
* from stdin. The frontend should now send copy data.
*/
static void
SendCopyInStart(void)
{
StringInfoData copyInStart = { NULL, 0, 0, 0 };
const char copyFormat = 1; /* binary copy format */
int flushed = 0;
pq_beginmessage(&copyInStart, 'G');
pq_sendbyte(&copyInStart, copyFormat);
pq_sendint(&copyInStart, 0, 2);
pq_endmessage(&copyInStart);
/* flush here to ensure that FE knows it can send data */
flushed = pq_flush();
if (flushed != 0)
{
ereport(WARNING, (errmsg("could not flush copy start data")));
}
}
/*
* SendCopyOutStart sends the start copy out message to initiate sending data to
* stdout. After this message, the backend will continue by sending copy data.
*/
static void
SendCopyOutStart(void)
{
StringInfoData copyOutStart = { NULL, 0, 0, 0 };
const char copyFormat = 1; /* binary copy format */
pq_beginmessage(&copyOutStart, 'H');
pq_sendbyte(&copyOutStart, copyFormat);
pq_sendint(&copyOutStart, 0, 2);
pq_endmessage(&copyOutStart);
}
/* Sends the copy-complete message. */
static void
SendCopyDone(void)
{
StringInfoData copyDone = { NULL, 0, 0, 0 };
int flushed = 0;
pq_beginmessage(&copyDone, 'c');
pq_endmessage(&copyDone);
/* flush here to signal to FE that we are done */
flushed = pq_flush();
if (flushed != 0)
{
ereport(WARNING, (errmsg("could not flush copy start data")));
}
}
/* Sends the copy data message to stdout. */
static void
SendCopyData(StringInfo fileBuffer)
{
StringInfoData copyData = { NULL, 0, 0, 0 };
pq_beginmessage(&copyData, 'd');
pq_sendbytes(&copyData, fileBuffer->data, fileBuffer->len);
pq_endmessage(&copyData);
}
/*
* ReceiveCopyData receives one copy data message from stdin, and writes this
* message's contents into the given argument. The function then checks if the
* copy protocol has been completed, and if it has, the function returns true.
* If not, the function returns false indicating there are more data to read.
* If the received message does not conform to the copy protocol, the function
* mirrors copy.c's error behavior.
*/
static bool
ReceiveCopyData(StringInfo copyData)
{
int messageType = 0;
int messageCopied = 0;
bool copyDone = true;
const int unlimitedSize = 0;
HOLD_CANCEL_INTERRUPTS();
pq_startmsgread();
messageType = pq_getbyte();
if (messageType == EOF)
{
ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("unexpected EOF on client connection")));
}
/* consume the rest of message before checking for message type */
messageCopied = pq_getmessage(copyData, unlimitedSize);
if (messageCopied == EOF)
{
ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("unexpected EOF on client connection")));
}
RESUME_CANCEL_INTERRUPTS();
switch (messageType)
{
case 'd': /* CopyData */
copyDone = false;
break;
case 'c': /* CopyDone */
copyDone = true;
break;
case 'f': /* CopyFail */
ereport(ERROR, (errcode(ERRCODE_QUERY_CANCELED),
errmsg("COPY data failed: %s", pq_getmsgstring(copyData))));
break;
case 'H': /* Flush */
case 'S': /* Sync */
/*
* Ignore Flush/Sync for the convenience of client libraries (such
* as libpq) that may send those without noticing that the command
* they just sent was COPY.
*/
copyDone = false;
break;
default:
ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION),
errmsg("unexpected message type 0x%02X during COPY data",
messageType)));
break;
}
return copyDone;
}

861
multi_client_executor.c Normal file

@@ -0,0 +1,861 @@
/*-------------------------------------------------------------------------
*
* multi_client_executor.c
*
* This file contains the libpq-specific parts of executing queries on remote
* nodes.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "libpq-fe.h"
#include "distributed/multi_client_executor.h"
#include <errno.h>
#include <unistd.h>
#ifdef HAVE_POLL_H
#include <poll.h>
#endif
#ifdef HAVE_SYS_POLL_H
#include <sys/poll.h>
#endif
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif
/* Local pool to track active connections */
static PGconn *ClientConnectionArray[MAX_CONNECTION_COUNT];
/*
* The value at any position on ClientPollingStatusArray is only defined when
* the corresponding ClientConnectionArray entry exists.
*/
static PostgresPollingStatusType ClientPollingStatusArray[MAX_CONNECTION_COUNT];
/* Local functions forward declarations */
static void ClearRemainingResults(PGconn *connection);
static bool ClientConnectionReady(PGconn *connection,
PostgresPollingStatusType pollingStatus);
static void ReportRemoteError(PGconn *connection, PGresult *result);
static void ReportConnectionError(PGconn *connection);
static char * ConnectionGetOptionValue(PGconn *connection, char *optionKeyword);
/* AllocateConnectionId returns a connection id from the connection pool. */
static int32
AllocateConnectionId(void)
{
int32 connectionId = INVALID_CONNECTION_ID;
int32 connIndex = 0;
/* allocate connectionId from connection pool */
for (connIndex = 0; connIndex < MAX_CONNECTION_COUNT; connIndex++)
{
PGconn *connection = ClientConnectionArray[connIndex];
if (connection == NULL)
{
connectionId = connIndex;
break;
}
}
return connectionId;
}
/*
* MultiClientConnect synchronously tries to establish a connection. If it
* succeeds, it returns the connection id. Otherwise, it reports connection
* error and returns INVALID_CONNECTION_ID.
*/
int32
MultiClientConnect(const char *nodeName, uint32 nodePort, const char *nodeDatabase)
{
PGconn *connection = NULL;
char connInfoString[STRING_BUFFER_SIZE];
ConnStatusType connStatusType = CONNECTION_OK;
int32 connectionId = AllocateConnectionId();
if (connectionId == INVALID_CONNECTION_ID)
{
ereport(WARNING, (errmsg("could not allocate connection in connection pool")));
return connectionId;
}
/* transcribe connection parameters to string */
snprintf(connInfoString, STRING_BUFFER_SIZE, CONN_INFO_TEMPLATE,
nodeName, nodePort, nodeDatabase, CLIENT_CONNECT_TIMEOUT);
/* establish synchronous connection to worker node */
connection = PQconnectdb(connInfoString);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_OK)
{
ClientConnectionArray[connectionId] = connection;
}
else
{
ReportConnectionError(connection);
PQfinish(connection);
connectionId = INVALID_CONNECTION_ID;
}
return connectionId;
}
/*
* MultiClientConnectStart asynchronously tries to establish a connection. If it
* succeeds, it returns the connection id. Otherwise, it reports connection
* error and returns INVALID_CONNECTION_ID.
*/
int32
MultiClientConnectStart(const char *nodeName, uint32 nodePort, const char *nodeDatabase)
{
PGconn *connection = NULL;
char connInfoString[STRING_BUFFER_SIZE];
ConnStatusType connStatusType = CONNECTION_BAD;
int32 connectionId = AllocateConnectionId();
if (connectionId == INVALID_CONNECTION_ID)
{
ereport(WARNING, (errmsg("could not allocate connection in connection pool")));
return connectionId;
}
/* transcribe connection parameters to string */
snprintf(connInfoString, STRING_BUFFER_SIZE, CONN_INFO_TEMPLATE,
nodeName, nodePort, nodeDatabase, CLIENT_CONNECT_TIMEOUT);
/* prepare asynchronous request for worker node connection */
connection = PQconnectStart(connInfoString);
connStatusType = PQstatus(connection);
/*
* If prepared, we save the connection, and set its initial polling status
* to PGRES_POLLING_WRITING as specified in "Database Connection Control
* Functions" section of the PostgreSQL documentation.
*/
if (connStatusType != CONNECTION_BAD)
{
ClientConnectionArray[connectionId] = connection;
ClientPollingStatusArray[connectionId] = PGRES_POLLING_WRITING;
}
else
{
ReportConnectionError(connection);
PQfinish(connection);
connectionId = INVALID_CONNECTION_ID;
}
return connectionId;
}
/* MultiClientConnectPoll returns the status of client connection. */
ConnectStatus
MultiClientConnectPoll(int32 connectionId)
{
PGconn *connection = NULL;
PostgresPollingStatusType pollingStatus = PGRES_POLLING_OK;
ConnectStatus connectStatus = CLIENT_INVALID_CONNECT;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
pollingStatus = ClientPollingStatusArray[connectionId];
if (pollingStatus == PGRES_POLLING_OK)
{
connectStatus = CLIENT_CONNECTION_READY;
}
else if (pollingStatus == PGRES_POLLING_READING)
{
bool readReady = ClientConnectionReady(connection, PGRES_POLLING_READING);
if (readReady)
{
ClientPollingStatusArray[connectionId] = PQconnectPoll(connection);
}
connectStatus = CLIENT_CONNECTION_BUSY;
}
else if (pollingStatus == PGRES_POLLING_WRITING)
{
bool writeReady = ClientConnectionReady(connection, PGRES_POLLING_WRITING);
if (writeReady)
{
ClientPollingStatusArray[connectionId] = PQconnectPoll(connection);
}
connectStatus = CLIENT_CONNECTION_BUSY;
}
else if (pollingStatus == PGRES_POLLING_FAILED)
{
ReportConnectionError(connection);
connectStatus = CLIENT_CONNECTION_BAD;
}
return connectStatus;
}
/* MultiClientDisconnect disconnects the connection. */
void
MultiClientDisconnect(int32 connectionId)
{
PGconn *connection = NULL;
const int InvalidPollingStatus = -1;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
PQfinish(connection);
ClientConnectionArray[connectionId] = NULL;
ClientPollingStatusArray[connectionId] = InvalidPollingStatus;
}
/*
* MultiClientConnectionUp checks if the connection is still up; in other
* words, that its status is not CONNECTION_BAD.
*/
bool
MultiClientConnectionUp(int32 connectionId)
{
PGconn *connection = NULL;
ConnStatusType connStatusType = CONNECTION_OK;
bool connectionUp = true;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
connectionUp = false;
}
return connectionUp;
}
/* MultiClientSendQuery sends the given query over the given connection. */
bool
MultiClientSendQuery(int32 connectionId, const char *query)
{
PGconn *connection = NULL;
bool success = true;
int querySent = 0;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
querySent = PQsendQuery(connection, query);
if (querySent == 0)
{
char *errorMessage = PQerrorMessage(connection);
ereport(WARNING, (errmsg("could not send remote query \"%s\"", query),
errdetail("Client error: %s", errorMessage)));
success = false;
}
return success;
}
/* MultiClientCancel cancels the running query on the given connection. */
bool
MultiClientCancel(int32 connectionId)
{
PGconn *connection = NULL;
PGcancel *cancelObject = NULL;
int cancelSent = 0;
bool canceled = true;
char errorBuffer[STRING_BUFFER_SIZE];
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
cancelObject = PQgetCancel(connection);
cancelSent = PQcancel(cancelObject, errorBuffer, sizeof(errorBuffer));
if (cancelSent == 0)
{
ereport(WARNING, (errmsg("could not issue cancel request"),
errdetail("Client error: %s", errorBuffer)));
canceled = false;
}
PQfreeCancel(cancelObject);
return canceled;
}
/* MultiClientResultStatus checks result status for an asynchronous query. */
ResultStatus
MultiClientResultStatus(int32 connectionId)
{
PGconn *connection = NULL;
int consumed = 0;
ConnStatusType connStatusType = CONNECTION_OK;
ResultStatus resultStatus = CLIENT_INVALID_RESULT_STATUS;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
ereport(WARNING, (errmsg("could not maintain connection to worker node")));
return CLIENT_RESULT_UNAVAILABLE;
}
/* consume input to allow status change */
consumed = PQconsumeInput(connection);
if (consumed != 0)
{
int connectionBusy = PQisBusy(connection);
if (connectionBusy == 0)
{
resultStatus = CLIENT_RESULT_READY;
}
else
{
resultStatus = CLIENT_RESULT_BUSY;
}
}
else
{
ereport(WARNING, (errmsg("could not consume data from worker node")));
resultStatus = CLIENT_RESULT_UNAVAILABLE;
}
return resultStatus;
}
/* MultiClientQueryResult gets results for an asynchronous query. */
bool
MultiClientQueryResult(int32 connectionId, void **queryResult, int *rowCount,
int *columnCount)
{
PGconn *connection = NULL;
PGresult *result = NULL;
ConnStatusType connStatusType = CONNECTION_OK;
ExecStatusType resultStatus = PGRES_COMMAND_OK;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
ereport(WARNING, (errmsg("could not maintain connection to worker node")));
return false;
}
result = PQgetResult(connection);
resultStatus = PQresultStatus(result);
if (resultStatus == PGRES_TUPLES_OK)
{
(*queryResult) = (void *) result;
(*rowCount) = PQntuples(result);
(*columnCount) = PQnfields(result);
}
else
{
ReportRemoteError(connection, result);
PQclear(result);
}
/* clear extra result objects */
ClearRemainingResults(connection);
return true;
}
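/*
 * A minimal sketch of a complete asynchronous round trip built from the
 * functions above; connectionId is assumed to come from an earlier successful
 * MultiClientConnectStart() / MultiClientConnectPoll() sequence, and the
 * query string is a placeholder:
 *
 *   void *queryResult = NULL;
 *   int rowCount = 0;
 *   int columnCount = 0;
 *
 *   if (MultiClientSendQuery(connectionId, "SELECT 1"))
 *   {
 *       while (MultiClientResultStatus(connectionId) == CLIENT_RESULT_BUSY)
 *       {
 *           pg_usleep(1000L);
 *       }
 *       if (MultiClientQueryResult(connectionId, &queryResult, &rowCount,
 *                                  &columnCount))
 *       {
 *           MultiClientClearResult(queryResult);
 *       }
 *   }
 *
 * Individual field values can be read with MultiClientGetValue() before the
 * result is cleared.
 */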
/*
* MultiClientBatchResult returns results for a "batch" of queries, meaning a
* string containing multiple select statements separated by semicolons. This
* function should be called multiple times to retrieve the results for all the
* queries, until CLIENT_BATCH_QUERY_DONE is returned (even if a failure occurs).
* If a query in the batch fails, the remaining queries will not be executed. On
* success, queryResult, rowCount and columnCount will be set to the appropriate
* values. After use, queryResult should be cleared using ClientClearResult.
*/
BatchQueryStatus
MultiClientBatchResult(int32 connectionId, void **queryResult, int *rowCount,
int *columnCount)
{
PGconn *connection = NULL;
PGresult *result = NULL;
ConnStatusType connStatusType = CONNECTION_OK;
ExecStatusType resultStatus = PGRES_COMMAND_OK;
BatchQueryStatus queryStatus = CLIENT_INVALID_BATCH_QUERY;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
/* set default result */
(*queryResult) = NULL;
(*rowCount) = -1;
(*columnCount) = -1;
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
ereport(WARNING, (errmsg("could not maintain connection to worker node")));
return CLIENT_BATCH_QUERY_FAILED;
}
result = PQgetResult(connection);
if (result == NULL)
{
return CLIENT_BATCH_QUERY_DONE;
}
resultStatus = PQresultStatus(result);
if (resultStatus == PGRES_TUPLES_OK)
{
(*queryResult) = (void *) result;
(*rowCount) = PQntuples(result);
(*columnCount) = PQnfields(result);
queryStatus = CLIENT_BATCH_QUERY_CONTINUE;
}
else if (resultStatus == PGRES_COMMAND_OK)
{
(*queryResult) = (void *) result;
queryStatus = CLIENT_BATCH_QUERY_CONTINUE;
}
else
{
ReportRemoteError(connection, result);
PQclear(result);
queryStatus = CLIENT_BATCH_QUERY_FAILED;
}
return queryStatus;
}
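/*
 * Per the contract described above, a caller keeps invoking the function
 * until CLIENT_BATCH_QUERY_DONE is returned. A minimal sketch, assuming
 * connectionId already carries a sent multi-statement query string:
 *
 *   BatchQueryStatus batchStatus = CLIENT_INVALID_BATCH_QUERY;
 *
 *   do
 *   {
 *       void *queryResult = NULL;
 *       int rowCount = 0;
 *       int columnCount = 0;
 *
 *       batchStatus = MultiClientBatchResult(connectionId, &queryResult,
 *                                            &rowCount, &columnCount);
 *       if (queryResult != NULL)
 *       {
 *           MultiClientClearResult(queryResult);
 *       }
 *   } while (batchStatus != CLIENT_BATCH_QUERY_DONE);
 */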
/* MultiClientGetValue returns the value of the field at the given position. */
char *
MultiClientGetValue(void *queryResult, int rowIndex, int columnIndex)
{
char *value = PQgetvalue((PGresult *) queryResult, rowIndex, columnIndex);
return value;
}
/* MultiClientClearResult frees the memory associated with a PGresult. */
void
MultiClientClearResult(void *queryResult)
{
PQclear((PGresult *) queryResult);
}
/* MultiClientQueryStatus returns the query status. */
QueryStatus
MultiClientQueryStatus(int32 connectionId)
{
PGconn *connection = NULL;
PGresult *result = NULL;
int tupleCount = 0;
bool copyResults = false;
ConnStatusType connStatusType = CONNECTION_OK;
ExecStatusType resultStatus = PGRES_COMMAND_OK;
QueryStatus queryStatus = CLIENT_INVALID_QUERY;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
connStatusType = PQstatus(connection);
if (connStatusType == CONNECTION_BAD)
{
ereport(WARNING, (errmsg("could not maintain connection to worker node")));
return CLIENT_QUERY_FAILED;
}
/*
* We now read the result object and check its status. If the result object
* isn't ready yet (the caller didn't wait for the connection to be ready),
* we will block on this call.
*/
result = PQgetResult(connection);
resultStatus = PQresultStatus(result);
if (resultStatus == PGRES_COMMAND_OK)
{
queryStatus = CLIENT_QUERY_DONE;
}
else if (resultStatus == PGRES_TUPLES_OK)
{
queryStatus = CLIENT_QUERY_DONE;
/*
* We use the client executor to only issue a select query that returns
* a void value. We therefore should not have more than one value here.
*/
tupleCount = PQntuples(result);
Assert(tupleCount <= 1);
}
else if (resultStatus == PGRES_COPY_OUT)
{
queryStatus = CLIENT_QUERY_COPY;
copyResults = true;
}
else
{
queryStatus = CLIENT_QUERY_FAILED;
if (resultStatus == PGRES_COPY_IN)
{
copyResults = true;
}
ReportRemoteError(connection, result);
}
/* clear the result object */
PQclear(result);
/*
* When using the async query mechanism, we need to keep reading results
* until we get null. The exception to this rule is the copy protocol.
*/
if (!copyResults)
{
ClearRemainingResults(connection);
}
return queryStatus;
}
/* MultiClientCopyData reads copy data from the connection and appends it to the given file. */
CopyStatus
MultiClientCopyData(int32 connectionId, int32 fileDescriptor)
{
PGconn *connection = NULL;
char *receiveBuffer = NULL;
int consumed = 0;
int receiveLength = 0;
const int asynchronous = 1;
CopyStatus copyStatus = CLIENT_INVALID_COPY;
Assert(connectionId != INVALID_CONNECTION_ID);
connection = ClientConnectionArray[connectionId];
Assert(connection != NULL);
/*
 * Consume input to handle the case where the previous copy operation might have
* received zero bytes.
*/
consumed = PQconsumeInput(connection);
if (consumed == 0)
{
ereport(WARNING, (errmsg("could not read data from worker node")));
return CLIENT_COPY_FAILED;
}
/* receive copy data message in an asynchronous manner */
receiveLength = PQgetCopyData(connection, &receiveBuffer, asynchronous);
while (receiveLength > 0)
{
/* received copy data; append these data to file */
int appended = -1;
errno = 0;
appended = write(fileDescriptor, receiveBuffer, receiveLength);
if (appended != receiveLength)
{
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0)
{
errno = ENOSPC;
}
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not append to copied file: %m")));
}
PQfreemem(receiveBuffer);
receiveLength = PQgetCopyData(connection, &receiveBuffer, asynchronous);
}
/* we now check the last received length returned by copy data */
if (receiveLength == 0)
{
/* we cannot read more data without blocking */
copyStatus = CLIENT_COPY_MORE;
}
else if (receiveLength == -1)
{
/* received copy done message */
PGresult *result = PQgetResult(connection);
ExecStatusType resultStatus = PQresultStatus(result);
if (resultStatus == PGRES_COMMAND_OK)
{
copyStatus = CLIENT_COPY_DONE;
}
else
{
copyStatus = CLIENT_COPY_FAILED;
ReportRemoteError(connection, result);
}
PQclear(result);
}
else if (receiveLength == -2)
{
/* received an error */
copyStatus = CLIENT_COPY_FAILED;
ReportConnectionError(connection);
}
/* if copy out completed, make sure we drain all results from libpq */
if (receiveLength < 0)
{
ClearRemainingResults(connection);
}
return copyStatus;
}
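/*
 * A minimal sketch of how the function above is typically driven: the caller
 * keeps copying while more data may arrive, then checks the final status
 * (fileDescriptor is assumed to be an already opened, writable file):
 *
 *   CopyStatus copyStatus = CLIENT_COPY_MORE;
 *
 *   while (copyStatus == CLIENT_COPY_MORE)
 *   {
 *       copyStatus = MultiClientCopyData(connectionId, fileDescriptor);
 *   }
 *
 * On loop exit, copyStatus is either CLIENT_COPY_DONE or CLIENT_COPY_FAILED.
 * Rather than looping tightly like this, the real-time executor interleaves
 * these calls with its polling loop so other tasks can make progress.
 */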
/*
* ClearRemainingResults reads result objects from the connection until we get
* null, and clears these results. This is the last step in completing an async
* query.
*/
static void
ClearRemainingResults(PGconn *connection)
{
PGresult *result = PQgetResult(connection);
while (result != NULL)
{
PQclear(result);
result = PQgetResult(connection);
}
}
/*
* ClientConnectionReady checks if the given connection is ready for non-blocking
* reads or writes. This function is loosely based on pqSocketCheck() at fe-misc.c
* and libpq_select() at libpqwalreceiver.c.
*/
static bool
ClientConnectionReady(PGconn *connection, PostgresPollingStatusType pollingStatus)
{
bool clientConnectionReady = false;
int pollResult = 0;
/* we use poll(2) if available, otherwise select(2) */
#ifdef HAVE_POLL
int fileDescriptorCount = 1;
int immediateTimeout = 0;
int pollEventMask = 0;
struct pollfd pollFileDescriptor;
if (pollingStatus == PGRES_POLLING_READING)
{
pollEventMask = POLLERR | POLLIN;
}
else if (pollingStatus == PGRES_POLLING_WRITING)
{
pollEventMask = POLLERR | POLLOUT;
}
pollFileDescriptor.fd = PQsocket(connection);
pollFileDescriptor.events = pollEventMask;
pollFileDescriptor.revents = 0;
pollResult = poll(&pollFileDescriptor, fileDescriptorCount, immediateTimeout);
#else /* !HAVE_POLL */
fd_set readFileDescriptorSet;
fd_set writeFileDescriptorSet;
fd_set exceptionFileDescriptorSet;
struct timeval immediateTimeout = {0, 0};
int connectionFileDescriptor = PQsocket(connection);
FD_ZERO(&readFileDescriptorSet);
FD_ZERO(&writeFileDescriptorSet);
FD_ZERO(&exceptionFileDescriptorSet);
if (pollingStatus == PGRES_POLLING_READING)
{
FD_SET(connectionFileDescriptor, &exceptionFileDescriptorSet);
FD_SET(connectionFileDescriptor, &readFileDescriptorSet);
}
else if (pollingStatus == PGRES_POLLING_WRITING)
{
FD_SET(connectionFileDescriptor, &exceptionFileDescriptorSet);
FD_SET(connectionFileDescriptor, &writeFileDescriptorSet);
}
pollResult = select(connectionFileDescriptor + 1, &readFileDescriptorSet,
&writeFileDescriptorSet, &exceptionFileDescriptorSet,
&immediateTimeout);
#endif /* HAVE_POLL */
if (pollResult > 0)
{
clientConnectionReady = true;
}
else if (pollResult == 0)
{
clientConnectionReady = false;
}
else if (pollResult < 0)
{
if (errno == EINTR)
{
/*
* If a signal was caught, we return false so the caller polls the
* connection again.
*/
clientConnectionReady = false;
}
else
{
/*
* poll() or select() can set errno to EFAULT (when socket is not
* contained in the calling program's address space), EBADF (invalid
* file descriptor), EINVAL (invalid arguments to select or poll),
* and ENOMEM (no space to allocate file descriptor tables). Out of
* these, only ENOMEM is likely here, and it is a fatal error, so we
* error out.
*/
Assert(errno == ENOMEM);
ereport(ERROR, (errcode_for_socket_access(),
errmsg("select()/poll() failed: %m")));
}
}
return clientConnectionReady;
}
/*
 * ReportRemoteError retrieves various error fields from a remote result and
* produces an error report at the WARNING level.
*/
static void
ReportRemoteError(PGconn *connection, PGresult *result)
{
char *sqlStateString = PQresultErrorField(result, PG_DIAG_SQLSTATE);
char *remoteMessage = PQresultErrorField(result, PG_DIAG_MESSAGE_PRIMARY);
char *nodeName = ConnectionGetOptionValue(connection, "host");
char *nodePort = ConnectionGetOptionValue(connection, "port");
char *errorPrefix = "could not connect to node";
int sqlState = ERRCODE_CONNECTION_FAILURE;
if (sqlStateString != NULL)
{
sqlState = MAKE_SQLSTATE(sqlStateString[0], sqlStateString[1], sqlStateString[2],
sqlStateString[3], sqlStateString[4]);
/* use more specific error prefix for result failures */
if (sqlState != ERRCODE_CONNECTION_FAILURE)
{
errorPrefix = "could not receive query results from";
}
}
/*
* If the PGresult did not contain a message, the connection may provide a
* suitable top level one. At worst, this is an empty string.
*/
if (remoteMessage == NULL)
{
char *lastNewlineIndex = NULL;
remoteMessage = PQerrorMessage(connection);
lastNewlineIndex = strrchr(remoteMessage, '\n');
/* trim trailing newline, if any */
if (lastNewlineIndex != NULL)
{
*lastNewlineIndex = '\0';
}
}
ereport(WARNING, (errcode(sqlState),
errmsg("%s %s:%s", errorPrefix, nodeName, nodePort),
errdetail("Client error: %s", remoteMessage)));
}
/*
* ReportConnectionError raises a WARNING and reports that we could not
* establish the given connection.
*/
static void
ReportConnectionError(PGconn *connection)
{
char *nodeName = ConnectionGetOptionValue(connection, "host");
char *nodePort = ConnectionGetOptionValue(connection, "port");
char *errorMessage = PQerrorMessage(connection);
ereport(WARNING, (errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("could not connect to node %s:%s", nodeName, nodePort),
errdetail("Client error: %s", errorMessage)));
}
/*
* ConnectionGetOptionValue inspects the provided connection for an option with
 * a given keyword and returns a new palloc'd string with that option's value.
* The function returns NULL if the connection has no setting for an option with
* the provided keyword.
*/
static char *
ConnectionGetOptionValue(PGconn *connection, char *optionKeyword)
{
char *optionValue = NULL;
PQconninfoOption *option = NULL;
PQconninfoOption *conninfoOptions = PQconninfo(connection);
for (option = conninfoOptions; option->keyword != NULL; option++)
{
if (strncmp(option->keyword, optionKeyword, NAMEDATALEN) == 0)
{
optionValue = pstrdup(option->val);
}
}
PQconninfoFree(conninfoOptions);
return optionValue;
}

@ -0,0 +1,278 @@
/*-------------------------------------------------------------------------
*
* multi_executor.c
*
* Entrypoint into distributed query execution.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "access/xact.h"
#include "catalog/dependency.h"
#include "catalog/namespace.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_master_planner.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/multi_resowner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/multi_utility.h"
#include "distributed/worker_protocol.h"
#include "executor/execdebug.h"
#include "storage/lmgr.h"
#include "tcop/utility.h"
#include "utils/snapmgr.h"
/*
 * multi_ExecutorStart is a hook called at the beginning of any execution
* of any query plan.
*
* If a distributed relation is the target of the query, perform some validity
* checks. If a legal statement, start the distributed execution. After that
* the to-be-executed query is replaced with the portion executing solely on
* the master.
*/
void
multi_ExecutorStart(QueryDesc *queryDesc, int eflags)
{
PlannedStmt *planStatement = queryDesc->plannedstmt;
if (HasCitusToplevelNode(planStatement))
{
MultiPlan *multiPlan = GetMultiPlan(planStatement);
MultiExecutorType executorType = MULTI_EXECUTOR_INVALID_FIRST;
Job *workerJob = multiPlan->workerJob;
executorType = JobExecutorType(multiPlan);
if (executorType == MULTI_EXECUTOR_ROUTER)
{
Task *task = NULL;
List *taskList = workerJob->taskList;
List *dependedJobList PG_USED_FOR_ASSERTS_ONLY = workerJob->dependedJobList;
List *workerTargetList = multiPlan->workerJob->jobQuery->targetList;
TupleDesc tupleDescriptor = ExecCleanTypeFromTL(workerTargetList, false);
/* router executor can only execute distributed plans with a single task */
Assert(list_length(taskList) == 1);
Assert(dependedJobList == NIL);
task = (Task *) linitial(taskList);
/* we need to set tupleDesc in executorStart */
queryDesc->tupDesc = tupleDescriptor;
/* drop into the router executor */
RouterExecutorStart(queryDesc, eflags, task);
}
else
{
PlannedStmt *masterSelectPlan = MasterNodeSelectPlan(multiPlan);
CreateStmt *masterCreateStmt = MasterNodeCreateStatement(multiPlan);
List *masterCopyStmtList = MasterNodeCopyStatementList(multiPlan);
ListCell *masterCopyStmtCell = NULL;
RangeTblEntry *masterRangeTableEntry = NULL;
StringInfo jobDirectoryName = NULL;
/*
* We create a directory on the master node to keep task execution results.
* We also register this directory for automatic cleanup on portal delete.
*/
jobDirectoryName = JobDirectoryName(workerJob->jobId);
CreateDirectory(jobDirectoryName);
ResourceOwnerEnlargeJobDirectories(CurrentResourceOwner);
ResourceOwnerRememberJobDirectory(CurrentResourceOwner, workerJob->jobId);
/* pick distributed executor to use */
if (executorType == MULTI_EXECUTOR_REAL_TIME)
{
MultiRealTimeExecute(workerJob);
}
else if (executorType == MULTI_EXECUTOR_TASK_TRACKER)
{
MultiTaskTrackerExecute(workerJob);
}
/* then create the result relation */
ProcessUtility((Node *) masterCreateStmt,
"(temp table creation)",
PROCESS_UTILITY_QUERY,
NULL,
None_Receiver,
NULL);
/* make the temporary table visible */
CommandCounterIncrement();
/* now copy data from all the remote nodes into temp table */
foreach(masterCopyStmtCell, masterCopyStmtList)
{
Node *masterCopyStmt = (Node *) lfirst(masterCopyStmtCell);
Assert(IsA(masterCopyStmt, CopyStmt));
ProcessUtility(masterCopyStmt,
"(copy job)",
PROCESS_UTILITY_QUERY,
NULL,
None_Receiver,
NULL);
}
/* make the copied contents visible */
CommandCounterIncrement();
/*
* Update the QueryDesc's snapshot so it sees the table. That's not
* particularly pretty, but we don't have much of a choice. One might
* think we could unregister the snapshot, push a new active one,
* update it, register it, and be happy. That only works if it's only
* registered once though...
*/
queryDesc->snapshot->curcid = GetCurrentCommandId(false);
/*
* Set the OID of the RTE used in the master select statement to point
* to the now created (and filled) temporary table. The target
* relation's oid is only known now.
*/
masterRangeTableEntry =
(RangeTblEntry *) linitial(masterSelectPlan->rtable);
masterRangeTableEntry->relid =
RelnameGetRelid(masterRangeTableEntry->eref->aliasname);
/*
* Replace to-be-run query with the master select query. As the
* planned statement is now replaced we can't call GetMultiPlan() in
* the later hooks, so we set a flag marking this as a distributed
* statement running on the master. That e.g. allows us to drop the
* temp table later.
*/
queryDesc->plannedstmt = masterSelectPlan;
eflags |= EXEC_FLAG_CITUS_MASTER_SELECT;
}
}
/* if the execution is not done for router executor, drop into standard executor */
if (queryDesc->estate == NULL ||
!(queryDesc->estate->es_top_eflags & EXEC_FLAG_CITUS_ROUTER_EXECUTOR))
{
standard_ExecutorStart(queryDesc, eflags);
}
}
/* Execute query plan. */
void
multi_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
{
int eflags = queryDesc->estate->es_top_eflags;
if (eflags & EXEC_FLAG_CITUS_ROUTER_EXECUTOR)
{
Task *task = NULL;
PlannedStmt *planStatement = queryDesc->plannedstmt;
MultiPlan *multiPlan = GetMultiPlan(planStatement);
List *taskList = multiPlan->workerJob->taskList;
/* router executor can only execute distributed plans with a single task */
Assert(list_length(taskList) == 1);
task = (Task *) linitial(taskList);
/* drop into the router executor */
RouterExecutorRun(queryDesc, direction, count, task);
}
else
{
/* drop into the standard executor */
standard_ExecutorRun(queryDesc, direction, count);
}
}
/* Perform actions, such as firing triggers, after the query has run. */
void
multi_ExecutorFinish(QueryDesc *queryDesc)
{
int eflags = queryDesc->estate->es_top_eflags;
if (eflags & EXEC_FLAG_CITUS_ROUTER_EXECUTOR)
{
/* drop into the router executor */
RouterExecutorFinish(queryDesc);
}
else
{
/* drop into the standard executor */
standard_ExecutorFinish(queryDesc);
}
}
/*
* multi_ExecutorEnd is a hook called to deallocate resources used during
* query execution.
*
* If the query executed was the portion of a distributed query running on the
* master, remove the resources that were needed for distributed execution.
*/
void
multi_ExecutorEnd(QueryDesc *queryDesc)
{
int eflags = queryDesc->estate->es_top_eflags;
if (eflags & EXEC_FLAG_CITUS_ROUTER_EXECUTOR)
{
/* drop into the router executor */
RouterExecutorEnd(queryDesc);
}
else
{
/* drop into the standard executor */
standard_ExecutorEnd(queryDesc);
}
/*
 * The final step of a distributed query is executing the master node select
 * query. We clean up the temp table after executing it, if we created one.
*/
if (eflags & EXEC_FLAG_CITUS_MASTER_SELECT)
{
PlannedStmt *planStatement = queryDesc->plannedstmt;
int savedLogMinMessages = 0;
int savedClientMinMessages = 0;
RangeTblEntry *rangeTableEntry = linitial(planStatement->rtable);
Oid masterTableRelid = rangeTableEntry->relid;
ObjectAddress masterTableObject = {InvalidOid, InvalidOid, 0};
masterTableObject.classId = RelationRelationId;
masterTableObject.objectId = masterTableRelid;
masterTableObject.objectSubId = 0;
/*
* Temporarily change logging level to avoid DEBUG2 logging output by
* performDeletion. This avoids breaking the regression tests which
* use DEBUG2 logging.
*/
savedLogMinMessages = log_min_messages;
savedClientMinMessages = client_min_messages;
log_min_messages = INFO;
client_min_messages = INFO;
performDeletion(&masterTableObject, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
log_min_messages = savedLogMinMessages;
client_min_messages = savedClientMinMessages;
}
}
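/*
 * The four functions above follow the standard PostgreSQL executor hook
 * signatures. A minimal sketch of how such hooks would be installed at
 * library load time, assuming no previously installed hooks need to be
 * chained:
 *
 *   void
 *   _PG_init(void)
 *   {
 *       ExecutorStart_hook = multi_ExecutorStart;
 *       ExecutorRun_hook = multi_ExecutorRun;
 *       ExecutorFinish_hook = multi_ExecutorFinish;
 *       ExecutorEnd_hook = multi_ExecutorEnd;
 *   }
 */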

@ -0,0 +1,862 @@
/*-------------------------------------------------------------------------
*
* multi_real_time_executor.c
*
* Routines for executing remote tasks as part of a distributed execution plan
* in real-time. These routines open up a separate connection for each task they
* need to execute, and therefore return their results faster. However, they can
* only handle as many tasks as the number of file descriptors (connections)
* available. They also can't handle execution primitives that need to write
* their results to intermediate files.
*
* Copyright (c) 2013, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include <unistd.h>
#include "commands/dbcommands.h"
#include "distributed/multi_client_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/worker_protocol.h"
#include "storage/fd.h"
/* Local functions forward declarations */
static ConnectAction ManageTaskExecution(Task *task, TaskExecution *taskExecution);
static bool TaskExecutionReadyToStart(TaskExecution *taskExecution);
static bool TaskExecutionCompleted(TaskExecution *taskExecution);
static void CancelTaskExecutionIfActive(TaskExecution *taskExecution);
static void CancelRequestIfActive(TaskExecStatus taskStatus, int connectionId);
/* Worker node state hash functions */
static HTAB * WorkerHash(const char *workerHashName, List *workerNodeList);
static HTAB * WorkerHashCreate(const char *workerHashName, uint32 workerHashSize);
static WorkerNodeState * WorkerHashEnter(HTAB *workerHash,
char *nodeName, uint32 nodePort);
static WorkerNodeState * WorkerHashLookup(HTAB *workerHash,
const char *nodeName, uint32 nodePort);
static WorkerNodeState * LookupWorkerForTask(HTAB *workerHash, Task *task,
TaskExecution *taskExecution);
/* Throttling functions */
static bool WorkerConnectionsExhausted(WorkerNodeState *workerNodeState);
static bool MasterConnectionsExhausted(HTAB *workerHash);
static uint32 TotalOpenConnectionCount(HTAB *workerHash);
static void UpdateConnectionCounter(WorkerNodeState *workerNode,
ConnectAction connectAction);
/*
* MultiRealTimeExecute loops over the given tasks, and manages their execution
* until either one task permanently fails or all tasks successfully complete.
* The function opens up a connection for each task it needs to execute, and
* manages these tasks' execution in real-time.
*/
void
MultiRealTimeExecute(Job *job)
{
List *taskList = job->taskList;
List *taskExecutionList = NIL;
ListCell *taskExecutionCell = NULL;
ListCell *taskCell = NULL;
uint32 failedTaskId = 0;
bool allTasksCompleted = false;
bool taskCompleted = false;
bool taskFailed = false;
List *workerNodeList = NIL;
HTAB *workerHash = NULL;
const char *workerHashName = "Worker node hash";
workerNodeList = WorkerNodeList();
workerHash = WorkerHash(workerHashName, workerNodeList);
/* initialize task execution structures for remote execution */
foreach(taskCell, taskList)
{
Task *task = (Task *) lfirst(taskCell);
TaskExecution *taskExecution = InitTaskExecution(task, EXEC_TASK_CONNECT_START);
taskExecutionList = lappend(taskExecutionList, taskExecution);
}
/* loop around until all tasks complete, one task fails, or user cancels */
while (!(allTasksCompleted || taskFailed || QueryCancelPending))
{
uint32 taskCount = list_length(taskList);
uint32 completedTaskCount = 0;
/* loop around all tasks and manage them */
ListCell *taskCell = NULL;
ListCell *taskExecutionCell = NULL;
forboth(taskCell, taskList, taskExecutionCell, taskExecutionList)
{
Task *task = (Task *) lfirst(taskCell);
TaskExecution *taskExecution = (TaskExecution *) lfirst(taskExecutionCell);
ConnectAction connectAction = CONNECT_ACTION_NONE;
WorkerNodeState *workerNodeState = NULL;
workerNodeState = LookupWorkerForTask(workerHash, task, taskExecution);
/* in case the task is about to start, throttle if necessary */
if (TaskExecutionReadyToStart(taskExecution) &&
(WorkerConnectionsExhausted(workerNodeState) ||
MasterConnectionsExhausted(workerHash)))
{
continue;
}
/* call the function that performs the core task execution logic */
connectAction = ManageTaskExecution(task, taskExecution);
/* update the connection counter for throttling */
UpdateConnectionCounter(workerNodeState, connectAction);
/*
* If this task failed, we need to iterate over task executions, and
* manually clean out their client-side resources. Hence, we record
* the failure here instead of immediately erroring out.
*/
taskFailed = TaskExecutionFailed(taskExecution);
if (taskFailed)
{
failedTaskId = taskExecution->taskId;
break;
}
taskCompleted = TaskExecutionCompleted(taskExecution);
if (taskCompleted)
{
completedTaskCount++;
}
}
/* check if all tasks completed; otherwise sleep to avoid tight loop */
if (completedTaskCount == taskCount)
{
allTasksCompleted = true;
}
else
{
long sleepIntervalPerCycle = RemoteTaskCheckInterval * 1000L;
pg_usleep(sleepIntervalPerCycle);
}
}
/*
* We prevent cancel/die interrupts until we clean up connections to worker
* nodes. Note that for the above while loop, if the user Ctrl+C's a query
* and we emit a warning before looping to the beginning of the while loop,
 * we would get canceled before we could hold any interrupts.
*/
HOLD_INTERRUPTS();
/* cancel any active task executions */
taskExecutionCell = NULL;
foreach(taskExecutionCell, taskExecutionList)
{
TaskExecution *taskExecution = (TaskExecution *) lfirst(taskExecutionCell);
CancelTaskExecutionIfActive(taskExecution);
}
/*
* If cancel might have been sent, give remote backends some time to flush
* their responses. This avoids some broken pipe logs on the backend-side.
*/
if (taskFailed || QueryCancelPending)
{
long sleepInterval = RemoteTaskCheckInterval * 1000L;
pg_usleep(sleepInterval);
}
/* close connections and any open file descriptors */
taskExecutionCell = NULL;
foreach(taskExecutionCell, taskExecutionList)
{
TaskExecution *taskExecution = (TaskExecution *) lfirst(taskExecutionCell);
CleanupTaskExecution(taskExecution);
}
RESUME_INTERRUPTS();
/*
* If we previously broke out of the execution loop due to a task failure or
* user cancellation request, we can now safely emit an error message (all
* client-side resources have been cleared).
*/
if (taskFailed)
{
ereport(ERROR, (errmsg("failed to execute job " UINT64_FORMAT, job->jobId),
errdetail("Failure due to failed task %u", failedTaskId)));
}
else if (QueryCancelPending)
{
CHECK_FOR_INTERRUPTS();
}
}
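/*
 * An overview of the per-task state machine driven by ManageTaskExecution
 * below. On failure at any stage, the task moves to EXEC_TASK_FAILED, which
 * closes the connection and restarts at EXEC_TASK_CONNECT_START on the next
 * placement:
 *
 *   EXEC_TASK_CONNECT_START   -> EXEC_TASK_CONNECT_POLL
 *   EXEC_TASK_CONNECT_POLL    -> EXEC_FETCH_TASK_LOOP
 *   EXEC_FETCH_TASK_LOOP      -> EXEC_FETCH_TASK_START   (fetch tasks remain)
 *                             -> EXEC_COMPUTE_TASK_START (all fetches done)
 *   EXEC_FETCH_TASK_START     -> EXEC_FETCH_TASK_RUNNING
 *   EXEC_FETCH_TASK_RUNNING   -> EXEC_FETCH_TASK_LOOP
 *   EXEC_COMPUTE_TASK_START   -> EXEC_COMPUTE_TASK_RUNNING
 *   EXEC_COMPUTE_TASK_RUNNING -> EXEC_COMPUTE_TASK_COPYING
 *   EXEC_COMPUTE_TASK_COPYING -> EXEC_TASK_DONE
 */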
/*
* ManageTaskExecution manages all execution logic for the given task. For this,
* the function starts a new "execution" on a node, and tracks this execution's
* progress. On failure, the function restarts this execution on another node.
* Note that this function directly manages a task's execution by opening up a
* separate connection to the worker node for each execution. The function
* returns a ConnectAction enum indicating whether a connection has been opened
* or closed in this call.
*/
static ConnectAction
ManageTaskExecution(Task *task, TaskExecution *taskExecution)
{
TaskExecStatus *taskStatusArray = taskExecution->taskStatusArray;
int32 *connectionIdArray = taskExecution->connectionIdArray;
int32 *fileDescriptorArray = taskExecution->fileDescriptorArray;
uint32 currentIndex = taskExecution->currentNodeIndex;
TaskExecStatus currentStatus = taskStatusArray[currentIndex];
List *taskPlacementList = task->taskPlacementList;
ShardPlacement *taskPlacement = list_nth(taskPlacementList, currentIndex);
char *nodeName = taskPlacement->nodeName;
uint32 nodePort = taskPlacement->nodePort;
ConnectAction connectAction = CONNECT_ACTION_NONE;
switch (currentStatus)
{
case EXEC_TASK_CONNECT_START:
{
int32 connectionId = INVALID_CONNECTION_ID;
char *nodeDatabase = NULL;
/* we use the same database name on the master and worker nodes */
nodeDatabase = get_database_name(MyDatabaseId);
connectionId = MultiClientConnectStart(nodeName, nodePort, nodeDatabase);
connectionIdArray[currentIndex] = connectionId;
/* if valid, poll the connection until the connection is initiated */
if (connectionId != INVALID_CONNECTION_ID)
{
taskStatusArray[currentIndex] = EXEC_TASK_CONNECT_POLL;
taskExecution->connectPollCount = 0;
connectAction = CONNECT_ACTION_OPENED;
}
else
{
AdjustStateForFailure(taskExecution);
}
break;
}
case EXEC_TASK_CONNECT_POLL:
{
int32 connectionId = connectionIdArray[currentIndex];
ConnectStatus pollStatus = MultiClientConnectPoll(connectionId);
/*
* If the connection is established, we reset the data fetch counter and
* change our status to data fetching.
*/
if (pollStatus == CLIENT_CONNECTION_READY)
{
taskExecution->dataFetchTaskIndex = -1;
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_LOOP;
}
else if (pollStatus == CLIENT_CONNECTION_BUSY)
{
taskStatusArray[currentIndex] = EXEC_TASK_CONNECT_POLL;
}
else if (pollStatus == CLIENT_CONNECTION_BAD)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
/* now check if we have been trying to connect for too long */
taskExecution->connectPollCount++;
if (pollStatus == CLIENT_CONNECTION_BUSY)
{
uint32 maxCount = REMOTE_NODE_CONNECT_TIMEOUT / RemoteTaskCheckInterval;
uint32 currentCount = taskExecution->connectPollCount;
if (currentCount >= maxCount)
{
ereport(WARNING, (errmsg("could not establish asynchronous connection "
"after %u ms", REMOTE_NODE_CONNECT_TIMEOUT)));
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
}
break;
}
case EXEC_TASK_FAILED:
{
/*
* On task failure, we close the connection. We also reset our execution
* status assuming that we might fail on all other worker nodes and come
* back to this failed node. In that case, we will retry the same fetch
* and compute task(s) on this node again.
*/
int32 connectionId = connectionIdArray[currentIndex];
MultiClientDisconnect(connectionId);
connectionIdArray[currentIndex] = INVALID_CONNECTION_ID;
connectAction = CONNECT_ACTION_CLOSED;
taskStatusArray[currentIndex] = EXEC_TASK_CONNECT_START;
/* try next worker node */
AdjustStateForFailure(taskExecution);
break;
}
case EXEC_FETCH_TASK_LOOP:
{
List *dataFetchTaskList = task->dependedTaskList;
int32 dataFetchTaskCount = list_length(dataFetchTaskList);
/* move to the next data fetch task */
taskExecution->dataFetchTaskIndex++;
if (taskExecution->dataFetchTaskIndex < dataFetchTaskCount)
{
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_START;
}
else
{
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_START;
}
break;
}
case EXEC_FETCH_TASK_START:
{
List *dataFetchTaskList = task->dependedTaskList;
int32 dataFetchTaskIndex = taskExecution->dataFetchTaskIndex;
Task *dataFetchTask = (Task *) list_nth(dataFetchTaskList, dataFetchTaskIndex);
char *dataFetchQuery = dataFetchTask->queryString;
int32 connectionId = connectionIdArray[currentIndex];
bool querySent = MultiClientSendQuery(connectionId, dataFetchQuery);
if (querySent)
{
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_RUNNING;
}
else
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
break;
}
case EXEC_FETCH_TASK_RUNNING:
{
int32 connectionId = connectionIdArray[currentIndex];
ResultStatus resultStatus = MultiClientResultStatus(connectionId);
QueryStatus queryStatus = CLIENT_INVALID_QUERY;
/* check if query results are in progress or unavailable */
if (resultStatus == CLIENT_RESULT_BUSY)
{
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_RUNNING;
break;
}
else if (resultStatus == CLIENT_RESULT_UNAVAILABLE)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
break;
}
Assert(resultStatus == CLIENT_RESULT_READY);
/*
* If the query executed successfully, loop onto the next data fetch
* task. Else if the query failed, try data fetching on another node.
*/
queryStatus = MultiClientQueryStatus(connectionId);
if (queryStatus == CLIENT_QUERY_DONE)
{
taskStatusArray[currentIndex] = EXEC_FETCH_TASK_LOOP;
}
else if (queryStatus == CLIENT_QUERY_FAILED)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
else
{
ereport(FATAL, (errmsg("invalid query status: %d", queryStatus)));
}
break;
}
case EXEC_COMPUTE_TASK_START:
{
int32 connectionId = connectionIdArray[currentIndex];
bool querySent = false;
/* construct new query to copy query results to stdout */
char *queryString = task->queryString;
StringInfo computeTaskQuery = makeStringInfo();
if (BinaryMasterCopyFormat)
{
appendStringInfo(computeTaskQuery, COPY_QUERY_TO_STDOUT_BINARY, queryString);
}
else
{
appendStringInfo(computeTaskQuery, COPY_QUERY_TO_STDOUT_TEXT, queryString);
}
querySent = MultiClientSendQuery(connectionId, computeTaskQuery->data);
if (querySent)
{
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_RUNNING;
}
else
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
break;
}
case EXEC_COMPUTE_TASK_RUNNING:
{
int32 connectionId = connectionIdArray[currentIndex];
ResultStatus resultStatus = MultiClientResultStatus(connectionId);
QueryStatus queryStatus = CLIENT_INVALID_QUERY;
/* check if query results are in progress or unavailable */
if (resultStatus == CLIENT_RESULT_BUSY)
{
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_RUNNING;
break;
}
else if (resultStatus == CLIENT_RESULT_UNAVAILABLE)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
break;
}
Assert(resultStatus == CLIENT_RESULT_READY);
/* check if our request to copy query results has been acknowledged */
queryStatus = MultiClientQueryStatus(connectionId);
if (queryStatus == CLIENT_QUERY_COPY)
{
StringInfo jobDirectoryName = JobDirectoryName(task->jobId);
StringInfo taskFilename = TaskFilename(jobDirectoryName, task->taskId);
char *filename = taskFilename->data;
int fileFlags = (O_APPEND | O_CREAT | O_RDWR | O_TRUNC | PG_BINARY);
int fileMode = (S_IRUSR | S_IWUSR);
int32 fileDescriptor = BasicOpenFile(filename, fileFlags, fileMode);
if (fileDescriptor >= 0)
{
/*
* All files inside the job directory get automatically cleaned
* up on transaction commit or abort.
*/
fileDescriptorArray[currentIndex] = fileDescriptor;
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_COPYING;
}
else
{
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", filename)));
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
}
else if (queryStatus == CLIENT_QUERY_FAILED)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
else
{
ereport(FATAL, (errmsg("invalid query status: %d", queryStatus)));
}
break;
}
case EXEC_COMPUTE_TASK_COPYING:
{
int32 connectionId = connectionIdArray[currentIndex];
int32 fileDesc = fileDescriptorArray[currentIndex];
int closed = -1;
/* copy data from worker node, and write to local file */
CopyStatus copyStatus = MultiClientCopyData(connectionId, fileDesc);
/* if worker node will continue to send more data, keep reading */
if (copyStatus == CLIENT_COPY_MORE)
{
taskStatusArray[currentIndex] = EXEC_COMPUTE_TASK_COPYING;
}
else if (copyStatus == CLIENT_COPY_DONE)
{
closed = close(fileDesc);
fileDescriptorArray[currentIndex] = -1;
if (closed >= 0)
{
taskStatusArray[currentIndex] = EXEC_TASK_DONE;
/* we are done executing; we no longer need the connection */
MultiClientDisconnect(connectionId);
connectionIdArray[currentIndex] = INVALID_CONNECTION_ID;
connectAction = CONNECT_ACTION_CLOSED;
}
else
{
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not close copied file: %m")));
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
}
}
else if (copyStatus == CLIENT_COPY_FAILED)
{
taskStatusArray[currentIndex] = EXEC_TASK_FAILED;
closed = close(fileDesc);
fileDescriptorArray[currentIndex] = -1;
if (closed < 0)
{
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not close copy file: %m")));
}
}
break;
}
case EXEC_TASK_DONE:
{
/* we are done with this task's execution */
break;
}
default:
{
/* we fatal here to avoid leaking client-side resources */
ereport(FATAL, (errmsg("invalid execution status: %d", currentStatus)));
break;
}
}
return connectAction;
}
/* Determines if the given task is ready to start. */
static bool
TaskExecutionReadyToStart(TaskExecution *taskExecution)
{
bool readyToStart = false;
TaskExecStatus *taskStatusArray = taskExecution->taskStatusArray;
uint32 currentIndex = taskExecution->currentNodeIndex;
TaskExecStatus taskStatus = taskStatusArray[currentIndex];
if (taskStatus == EXEC_TASK_CONNECT_START)
{
readyToStart = true;
}
return readyToStart;
}
/* Determines if the given task successfully completed executing. */
static bool
TaskExecutionCompleted(TaskExecution *taskExecution)
{
bool completed = false;
uint32 nodeIndex = 0;
for (nodeIndex = 0; nodeIndex < taskExecution->nodeCount; nodeIndex++)
{
TaskExecStatus taskStatus = taskExecution->taskStatusArray[nodeIndex];
if (taskStatus == EXEC_TASK_DONE)
{
completed = true;
break;
}
}
return completed;
}
/* Iterates over all open connections, and cancels any active requests. */
static void
CancelTaskExecutionIfActive(TaskExecution *taskExecution)
{
uint32 nodeIndex = 0;
for (nodeIndex = 0; nodeIndex < taskExecution->nodeCount; nodeIndex++)
{
int32 connectionId = taskExecution->connectionIdArray[nodeIndex];
if (connectionId != INVALID_CONNECTION_ID)
{
TaskExecStatus *taskStatusArray = taskExecution->taskStatusArray;
TaskExecStatus taskStatus = taskStatusArray[nodeIndex];
CancelRequestIfActive(taskStatus, connectionId);
}
}
}
/* Helper function to cancel an ongoing request, if any. */
static void
CancelRequestIfActive(TaskExecStatus taskStatus, int connectionId)
{
/*
* We use the task status to determine if we have an active request being
* processed by the worker node. If we do, we send a cancellation request.
* Note that we don't cancel data fetch tasks, and allow them to complete.
*/
if (taskStatus == EXEC_COMPUTE_TASK_RUNNING)
{
ResultStatus resultStatus = MultiClientResultStatus(connectionId);
if (resultStatus == CLIENT_RESULT_BUSY)
{
MultiClientCancel(connectionId);
}
}
else if (taskStatus == EXEC_COMPUTE_TASK_COPYING)
{
MultiClientCancel(connectionId);
}
}
/*
* WorkerHash creates a worker node hash with the given name. The function
* then inserts one entry for each worker node in the given worker node
* list.
*/
static HTAB *
WorkerHash(const char *workerHashName, List *workerNodeList)
{
uint32 workerHashSize = list_length(workerNodeList);
HTAB *workerHash = WorkerHashCreate(workerHashName, workerHashSize);
ListCell *workerNodeCell = NULL;
foreach(workerNodeCell, workerNodeList)
{
WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell);
char *nodeName = workerNode->workerName;
uint32 nodePort = workerNode->workerPort;
WorkerHashEnter(workerHash, nodeName, nodePort);
}
return workerHash;
}
/*
* WorkerHashCreate allocates memory for a worker node hash, initializes an
* empty hash, and returns this hash.
*/
static HTAB *
WorkerHashCreate(const char *workerHashName, uint32 workerHashSize)
{
HASHCTL info;
int hashFlags = 0;
HTAB *workerHash = NULL;
memset(&info, 0, sizeof(info));
info.keysize = WORKER_LENGTH + sizeof(uint32);
info.entrysize = sizeof(WorkerNodeState);
info.hash = tag_hash;
info.hcxt = CurrentMemoryContext;
hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
workerHash = hash_create(workerHashName, workerHashSize, &info, hashFlags);
if (workerHash == NULL)
{
ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("could not initialize worker node hash")));
}
return workerHash;
}
/*
* WorkerHashEnter creates a new worker node entry in the given worker node
* hash, and checks that the worker node entry has been properly created.
*/
static WorkerNodeState *
WorkerHashEnter(HTAB *workerHash, char *nodeName, uint32 nodePort)
{
bool handleFound = false;
WorkerNodeState *workerNodeState = NULL;
WorkerNodeState workerNodeKey;
memset(&workerNodeKey, 0, sizeof(WorkerNodeState));
strlcpy(workerNodeKey.workerName, nodeName, WORKER_LENGTH);
workerNodeKey.workerPort = nodePort;
workerNodeState = (WorkerNodeState *) hash_search(workerHash, (void *) &workerNodeKey,
HASH_ENTER, &handleFound);
if (handleFound)
{
ereport(WARNING, (errmsg("multiple worker node state entries for node: \"%s:%u\"",
nodeName, nodePort)));
}
memcpy(workerNodeState, &workerNodeKey, sizeof(WorkerNodeState));
workerNodeState->openConnectionCount = 0;
return workerNodeState;
}
/*
* WorkerHashLookup looks for the worker node state that corresponds to the given
* node name and port number, and returns the found worker node state if any.
*/
static WorkerNodeState *
WorkerHashLookup(HTAB *workerHash, const char *nodeName, uint32 nodePort)
{
bool handleFound = false;
WorkerNodeState *workerNodeState = NULL;
WorkerNodeState workerNodeKey;
memset(&workerNodeKey, 0, sizeof(WorkerNodeState));
strlcpy(workerNodeKey.workerName, nodeName, WORKER_LENGTH);
workerNodeKey.workerPort = nodePort;
workerNodeState = (WorkerNodeState *) hash_search(workerHash, (void *) &workerNodeKey,
HASH_FIND, &handleFound);
if (workerNodeState == NULL)
{
ereport(ERROR, (errmsg("could not find worker node state for node \"%s:%u\"",
nodeName, nodePort)));
}
return workerNodeState;
}
/*
* LookupWorkerForTask looks for the worker node state of the current worker
* node of a task execution.
*/
static WorkerNodeState *
LookupWorkerForTask(HTAB *workerHash, Task *task, TaskExecution *taskExecution)
{
uint32 currentIndex = taskExecution->currentNodeIndex;
List *taskPlacementList = task->taskPlacementList;
ShardPlacement *taskPlacement = list_nth(taskPlacementList, currentIndex);
char *nodeName = taskPlacement->nodeName;
uint32 nodePort = taskPlacement->nodePort;
WorkerNodeState *workerNodeState = WorkerHashLookup(workerHash, nodeName, nodePort);
return workerNodeState;
}
/*
* WorkerConnectionsExhausted determines if the current query has exhausted the
* maximum number of open connections that can be made to a worker.
*/
static bool
WorkerConnectionsExhausted(WorkerNodeState *workerNodeState)
{
bool reachedLimit = false;
/*
* A worker cannot accept more than max_connections connections. If we have a
* small number of workers with many shards, then a single query could exhaust
* max_connections unless we throttle here. We use the value of max_connections
* on the master as a proxy for the worker configuration to avoid introducing a
* new configuration value.
*/
if (workerNodeState->openConnectionCount >= MaxConnections)
{
reachedLimit = true;
}
return reachedLimit;
}
/*
* MasterConnectionsExhausted determines if the current query has exhausted
* the maximum number of connections the master process can make.
*/
static bool
MasterConnectionsExhausted(HTAB *workerHash)
{
bool reachedLimit = false;
uint32 maxConnectionCount = MaxMasterConnectionCount();
uint32 totalConnectionCount = TotalOpenConnectionCount(workerHash);
if (totalConnectionCount >= maxConnectionCount)
{
reachedLimit = true;
}
return reachedLimit;
}
/*
* TotalOpenConnectionCount counts the total number of open connections across all the
* workers.
*/
static uint32
TotalOpenConnectionCount(HTAB *workerHash)
{
uint32 connectionCount = 0;
WorkerNodeState *workerNodeState = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, workerHash);
workerNodeState = (WorkerNodeState *) hash_seq_search(&status);
while (workerNodeState != NULL)
{
connectionCount += workerNodeState->openConnectionCount;
workerNodeState = (WorkerNodeState *) hash_seq_search(&status);
}
return connectionCount;
}
/*
* UpdateConnectionCounter updates the connection counter for a given worker
* node based on the specified connect action.
*/
static void
UpdateConnectionCounter(WorkerNodeState *workerNode, ConnectAction connectAction)
{
if (connectAction == CONNECT_ACTION_OPENED)
{
workerNode->openConnectionCount++;
}
else if (connectAction == CONNECT_ACTION_CLOSED)
{
workerNode->openConnectionCount--;
}
}

@ -0,0 +1,563 @@
/*
* multi_router_executor.c
*
* Routines for executing remote tasks as part of a distributed execution plan
* with synchronous connections. The routines utilize the connection cache.
* Therefore, only a single connection is opened for each worker. Also, router
* executor does not require a master table and a master query. In other words,
 * the results that are fetched from a single worker are sent to the output console
* directly. Lastly, router executor can only execute a single task.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include "funcapi.h"
#include "libpq-fe.h"
#include "miscadmin.h"
#include "access/xact.h"
#include "distributed/connection_cache.h"
#include "distributed/listutils.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/resource_lock.h"
#include "executor/executor.h"
#include "nodes/pg_list.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/memutils.h"
#include "utils/palloc.h"
/* controls use of locks to enforce safe commutativity */
bool AllModificationsCommutative = false;
static LOCKMODE CommutativityRuleToLockMode(CmdType commandType, bool upsertQuery);
static void AcquireExecutorShardLock(Task *task, LOCKMODE lockMode);
static int32 ExecuteDistributedModify(Task *task);
static void ExecuteSingleShardSelect(Task *task, EState *executorState,
TupleDesc tupleDescriptor,
DestReceiver *destination);
static bool SendQueryInSingleRowMode(PGconn *connection, char *query);
static bool StoreQueryResult(PGconn *connection, TupleDesc tupleDescriptor,
Tuplestorestate *tupleStore);
/*
* RouterExecutorStart sets up the executor state and queryDesc for router
* execution.
*/
void
RouterExecutorStart(QueryDesc *queryDesc, int eflags, Task *task)
{
bool topLevel = true;
LOCKMODE lockMode = NoLock;
EState *executorState = NULL;
CmdType commandType = queryDesc->operation;
/* ensure that the task is not NULL */
Assert(task != NULL);
/* disallow transactions and triggers during distributed commands */
PreventTransactionChain(topLevel, "distributed commands");
eflags |= EXEC_FLAG_SKIP_TRIGGERS;
/* signal that it is a router execution */
eflags |= EXEC_FLAG_CITUS_ROUTER_EXECUTOR;
/* build empty executor state to obtain per-query memory context */
executorState = CreateExecutorState();
executorState->es_top_eflags = eflags;
executorState->es_instrument = queryDesc->instrument_options;
queryDesc->estate = executorState;
#if (PG_VERSION_NUM < 90500)
/* make sure that upsertQuery is false for versions in which UPSERT is not available */
Assert(task->upsertQuery == false);
#endif
lockMode = CommutativityRuleToLockMode(commandType, task->upsertQuery);
if (lockMode != NoLock)
{
AcquireExecutorShardLock(task, lockMode);
}
}
/*
* CommutativityRuleToLockMode determines the commutativity rule for the given
* command and returns the appropriate lock mode to enforce that rule. The
* function assumes a SELECT doesn't modify state and therefore is commutative
* with all other commands. The function also assumes that an INSERT commutes
* with another INSERT, but not with an UPDATE/DELETE/UPSERT; and an
* UPDATE/DELETE/UPSERT doesn't commute with an INSERT, UPDATE, DELETE or UPSERT.
*
* Note that the above comment defines INSERT INTO ... ON CONFLICT type of queries
* as an UPSERT. Since UPSERT is not defined as a separate command type in postgres,
* we have to pass it as a second parameter to the function.
*
* The above mapping is overridden entirely when all_modifications_commutative
* is set to true. In that case, all commands just claim a shared lock. This
* allows the shard repair logic to lock out modifications while permitting all
* commands to otherwise commute.
*/
static LOCKMODE
CommutativityRuleToLockMode(CmdType commandType, bool upsertQuery)
{
LOCKMODE lockMode = NoLock;
/* bypass commutativity checks when flag enabled */
if (AllModificationsCommutative)
{
return ShareLock;
}
if (commandType == CMD_SELECT)
{
lockMode = NoLock;
}
else if (upsertQuery)
{
lockMode = ExclusiveLock;
}
else if (commandType == CMD_INSERT)
{
lockMode = ShareLock;
}
else if (commandType == CMD_UPDATE || commandType == CMD_DELETE)
{
lockMode = ExclusiveLock;
}
else
{
ereport(ERROR, (errmsg("unrecognized operation code: %d", (int) commandType)));
}
return lockMode;
}
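/*
 * The mapping implemented above, in summary:
 *
 *   SELECT                   -> NoLock
 *   INSERT                   -> ShareLock
 *   UPDATE / DELETE / UPSERT -> ExclusiveLock
 *   any command, when all_modifications_commutative is set -> ShareLock
 *
 * For example, two concurrent INSERTs into the same shard both take
 * ShareLock and therefore proceed in parallel, whereas an UPDATE takes
 * ExclusiveLock and is serialized against all other modifications of that
 * shard.
 */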
/*
 * AcquireExecutorShardLock acquires the shard lock needed for the execution
 * of a single task within a distributed plan.
*/
static void
AcquireExecutorShardLock(Task *task, LOCKMODE lockMode)
{
int64 shardId = task->shardId;
LockShardResource(shardId, lockMode);
}
/*
* RouterExecutorRun actually executes a single task on a worker.
*/
void
RouterExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count, Task *task)
{
EState *estate = queryDesc->estate;
CmdType operation = queryDesc->operation;
MemoryContext oldcontext = NULL;
Assert(estate != NULL);
Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY));
Assert(task != NULL);
/* we only support default scan direction and row fetch count */
if (!ScanDirectionIsForward(direction))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("scan directions other than forward scans "
"are unsupported")));
}
if (count != 0)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("fetching rows from a query using a cursor "
"is unsupported")));
}
oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
if (queryDesc->totaltime != NULL)
{
InstrStartNode(queryDesc->totaltime);
}
if (operation == CMD_INSERT || operation == CMD_UPDATE ||
operation == CMD_DELETE)
{
int32 affectedRowCount = ExecuteDistributedModify(task);
estate->es_processed = affectedRowCount;
}
else if (operation == CMD_SELECT)
{
DestReceiver *destination = queryDesc->dest;
TupleDesc resultTupleDescriptor = queryDesc->tupDesc;
ExecuteSingleShardSelect(task, estate, resultTupleDescriptor, destination);
}
else
{
ereport(ERROR, (errmsg("unrecognized operation code: %d",
(int) operation)));
}
if (queryDesc->totaltime != NULL)
{
InstrStopNode(queryDesc->totaltime, estate->es_processed);
}
MemoryContextSwitchTo(oldcontext);
}
/*
* ExecuteDistributedModify is the main entry point for modifying distributed
* tables. A distributed modification is successful if any placement of the
* distributed table is successful. ExecuteDistributedModify returns the number
* of modified rows in that case and errors in all others. This function will
* also generate warnings for individual placement failures.
*/
static int32
ExecuteDistributedModify(Task *task)
{
int32 affectedTupleCount = -1;
ListCell *taskPlacementCell = NULL;
List *failedPlacementList = NIL;
ListCell *failedPlacementCell = NULL;
foreach(taskPlacementCell, task->taskPlacementList)
{
ShardPlacement *taskPlacement = (ShardPlacement *) lfirst(taskPlacementCell);
char *nodeName = taskPlacement->nodeName;
int32 nodePort = taskPlacement->nodePort;
PGconn *connection = NULL;
PGresult *result = NULL;
char *currentAffectedTupleString = NULL;
int32 currentAffectedTupleCount = -1;
Assert(taskPlacement->shardState == FILE_FINALIZED);
connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
failedPlacementList = lappend(failedPlacementList, taskPlacement);
continue;
}
result = PQexec(connection, task->queryString);
if (PQresultStatus(result) != PGRES_COMMAND_OK)
{
ReportRemoteError(connection, result);
PQclear(result);
failedPlacementList = lappend(failedPlacementList, taskPlacement);
continue;
}
currentAffectedTupleString = PQcmdTuples(result);
currentAffectedTupleCount = pg_atoi(currentAffectedTupleString, sizeof(int32), 0);
if ((affectedTupleCount == -1) ||
(affectedTupleCount == currentAffectedTupleCount))
{
affectedTupleCount = currentAffectedTupleCount;
}
else
{
ereport(WARNING, (errmsg("modified %d tuples, but expected to modify %d",
currentAffectedTupleCount, affectedTupleCount),
errdetail("modified placement on %s:%d",
nodeName, nodePort)));
}
PQclear(result);
}
/* if all placements failed, error out */
if (list_length(failedPlacementList) == list_length(task->taskPlacementList))
{
ereport(ERROR, (errmsg("could not modify any active placements")));
}
/* otherwise, mark failed placements as inactive: they're stale */
foreach(failedPlacementCell, failedPlacementList)
{
ShardPlacement *failedPlacement = (ShardPlacement *) lfirst(failedPlacementCell);
uint64 shardLength = 0;
DeleteShardPlacementRow(failedPlacement->shardId, failedPlacement->nodeName,
failedPlacement->nodePort);
InsertShardPlacementRow(failedPlacement->shardId, FILE_INACTIVE, shardLength,
failedPlacement->nodeName, failedPlacement->nodePort);
}
return affectedTupleCount;
}
/*
* ExecuteSingleShardSelect executes the remote select query and sends the
* resultant tuples to the given destination receiver. If the query fails on a
* given placement, the function attempts it on its replica.
*/
static void
ExecuteSingleShardSelect(Task *task, EState *executorState,
TupleDesc tupleDescriptor, DestReceiver *destination)
{
Tuplestorestate *tupleStore = NULL;
bool resultsOK = false;
TupleTableSlot *tupleTableSlot = NULL;
tupleStore = tuplestore_begin_heap(false, false, work_mem);
resultsOK = ExecuteTaskAndStoreResults(task, tupleDescriptor, tupleStore);
if (!resultsOK)
{
ereport(ERROR, (errmsg("could not receive query results")));
}
tupleTableSlot = MakeSingleTupleTableSlot(tupleDescriptor);
/* startup the tuple receiver */
(*destination->rStartup)(destination, CMD_SELECT, tupleDescriptor);
/* iterate over tuples in tuple store, and send them to destination */
for (;;)
{
bool nextTuple = tuplestore_gettupleslot(tupleStore, true, false, tupleTableSlot);
if (!nextTuple)
{
break;
}
(*destination->receiveSlot)(tupleTableSlot, destination);
executorState->es_processed++;
ExecClearTuple(tupleTableSlot);
}
/* shutdown the tuple receiver */
(*destination->rShutdown)(destination);
ExecDropSingleTupleTableSlot(tupleTableSlot);
tuplestore_end(tupleStore);
}
/*
* ExecuteTaskAndStoreResults executes the task on the remote node, retrieves
* the results and stores them in the given tuple store. If the task fails on
* one of the placements, the function retries it on other placements.
*/
bool
ExecuteTaskAndStoreResults(Task *task, TupleDesc tupleDescriptor,
Tuplestorestate *tupleStore)
{
bool resultsOK = false;
List *taskPlacementList = task->taskPlacementList;
ListCell *taskPlacementCell = NULL;
/*
 * Try to run the query to completion on one placement. If the query fails,
 * attempt the query on the next placement.
*/
foreach(taskPlacementCell, taskPlacementList)
{
ShardPlacement *taskPlacement = (ShardPlacement *) lfirst(taskPlacementCell);
char *nodeName = taskPlacement->nodeName;
int32 nodePort = taskPlacement->nodePort;
bool queryOK = false;
bool storedOK = false;
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
continue;
}
queryOK = SendQueryInSingleRowMode(connection, task->queryString);
if (!queryOK)
{
PurgeConnection(connection);
continue;
}
storedOK = StoreQueryResult(connection, tupleDescriptor, tupleStore);
if (storedOK)
{
resultsOK = true;
break;
}
else
{
tuplestore_clear(tupleStore);
PurgeConnection(connection);
}
}
return resultsOK;
}
/*
* SendQueryInSingleRowMode sends the given query on the connection in an
* asynchronous way. The function also sets the single-row mode on the
* connection so that we receive results a row at a time.
*/
static bool
SendQueryInSingleRowMode(PGconn *connection, char *query)
{
int querySent = 0;
int singleRowMode = 0;
querySent = PQsendQuery(connection, query);
if (querySent == 0)
{
ReportRemoteError(connection, NULL);
return false;
}
singleRowMode = PQsetSingleRowMode(connection);
if (singleRowMode == 0)
{
ReportRemoteError(connection, NULL);
return false;
}
return true;
}
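/*
* A minimal sketch (illustrative only, not used by the executor) of the
* consumption loop that single-row mode implies; StoreQueryResult below
* implements the full version. After PQsetSingleRowMode() succeeds,
* PQgetResult() returns one PGRES_SINGLE_TUPLE result per row, then a final
* PGRES_TUPLES_OK result with zero rows, and finally NULL.
*/
#ifdef NOT_USED
static void
ConsumeSingleRowResults(PGconn *connection)
{
PGresult *result = NULL;
while ((result = PQgetResult(connection)) != NULL)
{
if (PQresultStatus(result) == PGRES_SINGLE_TUPLE)
{
/* one row is available here via PQgetvalue(result, 0, columnIndex) */
}
PQclear(result);
}
}
#endif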
/*
* StoreQueryResult gets the query results from the given connection, builds
* tuples from the results and stores them in the given tuple-store. If the
* function can't receive query results, it returns false. Note that this
* function assumes the query has already been sent on the connection and the
* tuplestore has earlier been initialized.
*/
static bool
StoreQueryResult(PGconn *connection, TupleDesc tupleDescriptor,
Tuplestorestate *tupleStore)
{
AttInMetadata *attributeInputMetadata = TupleDescGetAttInMetadata(tupleDescriptor);
uint32 expectedColumnCount = tupleDescriptor->natts;
char **columnArray = (char **) palloc0(expectedColumnCount * sizeof(char *));
MemoryContext ioContext = AllocSetContextCreate(CurrentMemoryContext,
"StoreQueryResult",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
Assert(tupleStore != NULL);
for (;;)
{
uint32 rowIndex = 0;
uint32 columnIndex = 0;
uint32 rowCount = 0;
uint32 columnCount = 0;
ExecStatusType resultStatus = 0;
PGresult *result = PQgetResult(connection);
if (result == NULL)
{
break;
}
resultStatus = PQresultStatus(result);
if ((resultStatus != PGRES_SINGLE_TUPLE) && (resultStatus != PGRES_TUPLES_OK))
{
ReportRemoteError(connection, result);
PQclear(result);
return false;
}
rowCount = PQntuples(result);
columnCount = PQnfields(result);
Assert(columnCount == expectedColumnCount);
for (rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
HeapTuple heapTuple = NULL;
MemoryContext oldContext = NULL;
memset(columnArray, 0, columnCount * sizeof(char *));
for (columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
if (PQgetisnull(result, rowIndex, columnIndex))
{
columnArray[columnIndex] = NULL;
}
else
{
columnArray[columnIndex] = PQgetvalue(result, rowIndex, columnIndex);
}
}
/*
* Switch to a temporary memory context that we reset after each tuple. This
* protects us from any memory leaks that might be present in I/O functions
* called by BuildTupleFromCStrings.
*/
oldContext = MemoryContextSwitchTo(ioContext);
heapTuple = BuildTupleFromCStrings(attributeInputMetadata, columnArray);
MemoryContextSwitchTo(oldContext);
tuplestore_puttuple(tupleStore, heapTuple);
MemoryContextReset(ioContext);
}
PQclear(result);
}
pfree(columnArray);
return true;
}
/*
* RouterExecutorFinish cleans up after a distributed execution.
*/
void
RouterExecutorFinish(QueryDesc *queryDesc)
{
EState *estate = queryDesc->estate;
Assert(estate != NULL);
estate->es_finished = true;
}
/*
* RouterExecutorEnd cleans up the executor state after a distributed
* execution.
*/
void
RouterExecutorEnd(QueryDesc *queryDesc)
{
EState *estate = queryDesc->estate;
Assert(estate != NULL);
Assert(estate->es_finished);
FreeExecutorState(estate);
queryDesc->estate = NULL;
queryDesc->totaltime = NULL;
}

View File

@ -0,0 +1,315 @@
/*-------------------------------------------------------------------------
*
* multi_server_executor.c
*
* Function definitions for distributed task execution for real-time
* and task-tracker executors, and routines common to both. The common
* routines implement backend-side logic, and they trigger executions
* on the client side via function hooks that they load.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include <unistd.h>
#include "distributed/multi_client_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_resowner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/worker_protocol.h"
int RemoteTaskCheckInterval = 100; /* per cycle sleep interval in millisecs */
int TaskExecutorType = MULTI_EXECUTOR_REAL_TIME; /* distributed executor type */
bool BinaryMasterCopyFormat = false; /* copy data from workers in binary format */
/*
* JobExecutorType selects the executor type for the given multiPlan using the task
* executor type config value. The function then checks if the given multiPlan needs
* more resources than those provided to it by other config values, and issues
* warnings accordingly. If the selected executor type cannot execute the given
* multiPlan, the function errors out.
*/
MultiExecutorType
JobExecutorType(MultiPlan *multiPlan)
{
Job *job = multiPlan->workerJob;
Query *masterQuery = multiPlan->masterQuery;
List *workerTaskList = job->taskList;
List *workerNodeList = WorkerNodeList();
int taskCount = list_length(workerTaskList);
int workerNodeCount = list_length(workerNodeList);
double tasksPerNode = taskCount / ((double) workerNodeCount);
int dependedJobCount = list_length(job->dependedJobList);
MultiExecutorType executorType = TaskExecutorType;
/* check if the first task is a modify task, short-circuit if so */
if (taskCount > 0)
{
Task *firstTask = (Task *) linitial(workerTaskList);
if (firstTask->taskType == MODIFY_TASK)
{
return MULTI_EXECUTOR_ROUTER;
}
}
if (executorType == MULTI_EXECUTOR_REAL_TIME)
{
double reasonableConnectionCount = 0;
/* if we need to open too many connections per worker, warn the user */
if (tasksPerNode >= MaxConnections)
{
ereport(WARNING, (errmsg("this query uses more connections than the "
"configured max_connections limit"),
errhint("Consider increasing max_connections or setting "
"citusdb.task_executor_type to "
"\"task-tracker\".")));
}
/*
* If we need to open too many outgoing connections, warn the user.
* The real-time executor caps the number of tasks it starts at the same limit,
* but we still issue this warning because it degrades performance.
*/
reasonableConnectionCount = MaxMasterConnectionCount();
if (taskCount >= reasonableConnectionCount)
{
ereport(WARNING, (errmsg("this query uses more file descriptors than the "
"configured max_files_per_process limit"),
errhint("Consider increasing max_files_per_process or "
"setting citusdb.task_executor_type to "
"\"task-tracker\".")));
}
/* if we have repartition jobs with real time executor, error out */
if (dependedJobCount > 0)
{
ereport(ERROR, (errmsg("cannot use real time executor with repartition jobs"),
errhint("Set citusdb.task_executor_type to "
"\"task-tracker\".")));
}
}
else if (executorType == MULTI_EXECUTOR_TASK_TRACKER)
{
/* if we have more tasks per node than what can be tracked, warn the user */
if (tasksPerNode >= MaxTrackedTasksPerNode)
{
ereport(WARNING, (errmsg("this query assigns more tasks per node than the "
"configured max_tracked_tasks_per_node limit")));
}
}
else if (executorType == MULTI_EXECUTOR_ROUTER)
{
Task *workerTask = NULL;
List *workerDependentTaskList = NIL;
bool masterQueryHasAggregates = false;
/* if we have repartition jobs with router executor, error out */
if (dependedJobCount > 0)
{
ereport(ERROR, (errmsg("cannot use router executor with repartition jobs"),
errhint("Set citusdb.task_executor_type to "
"\"task-tracker\".")));
}
/* if the query hits more than one shard, error out */
if (taskCount != 1)
{
ereport(ERROR, (errmsg("cannot use router executor with queries that "
"hit multiple shards"),
errhint("Set citusdb.task_executor_type to \"real-time\" or "
"\"task-tracker\".")));
}
/* if the query has dependent data fetch tasks, error out */
workerTask = list_nth(workerTaskList, 0);
workerDependentTaskList = workerTask->dependedTaskList;
if (list_length(workerDependentTaskList) > 0)
{
ereport(ERROR, (errmsg("cannot use router executor with JOINs"),
errhint("Set citusdb.task_executor_type to \"real-time\" or "
"\"task-tracker\".")));
}
/* ORDER BY is always applied on the master table with the current planner */
if (masterQuery != NULL && list_length(masterQuery->sortClause) > 0)
{
ereport(ERROR, (errmsg("cannot use router executor with ORDER BY clauses"),
errhint("Set citusdb.task_executor_type to \"real-time\" or "
"\"task-tracker\".")));
}
/*
* Note that if the worker query has an aggregate, the master query must have
* either an aggregate or a function expression that needs to be executed for
* correct results.
*/
masterQueryHasAggregates = job->jobQuery->hasAggs;
if (masterQueryHasAggregates)
{
ereport(ERROR, (errmsg("cannot use router executor with aggregates"),
errhint("Set citusdb.task_executor_type to \"real-time\" or "
"\"task-tracker\".")));
}
}
return executorType;
}
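/*
* A worked example of the checks above (with assumed configuration values):
* a query with 64 tasks on an 8-worker cluster yields tasksPerNode = 8.0,
* which stays below a max_connections setting of 100 and raises no warning;
* the same cluster running a 1024-task query has tasksPerNode = 128.0 and
* would trigger the real-time executor's connection warning.
*/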
/*
* MaxMasterConnectionCount returns the number of connections a master can open.
* A master cannot create more than a certain number of file descriptors (FDs).
* Every task requires 2 FDs, one file and one connection. Some FDs are taken by
* the VFD pool and there is currently no way to reclaim these before opening a
* connection. We therefore assume some FDs to be reserved for VFDs, based on
* observing a typical size of the pool on a CitusDB master.
*/
int
MaxMasterConnectionCount(void)
{
return Max((max_files_per_process - RESERVED_FD_COUNT) / 2, 1);
}
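/*
* For example, assuming max_files_per_process = 1000 and a RESERVED_FD_COUNT
* of 64 (a hypothetical value for illustration), the master caps itself at
* (1000 - 64) / 2 = 468 concurrent task connections.
*/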
/*
* RemoveJobDirectory gets automatically called at portal drop (end of query) or
* at transaction abort. The function removes the job directory and releases the
* associated job resource from the resource manager.
*/
void
RemoveJobDirectory(uint64 jobId)
{
StringInfo jobDirectoryName = JobDirectoryName(jobId);
RemoveDirectory(jobDirectoryName);
ResourceOwnerForgetJobDirectory(CurrentResourceOwner, jobId);
}
/*
* InitTaskExecution creates a task execution structure for the given task, and
* initializes execution related fields.
*/
TaskExecution *
InitTaskExecution(Task *task, TaskExecStatus initialTaskExecStatus)
{
/* each task placement (assignment) corresponds to one worker node */
uint32 nodeCount = list_length(task->taskPlacementList);
uint32 nodeIndex = 0;
TaskExecution *taskExecution = palloc0(sizeof(TaskExecution));
taskExecution->jobId = task->jobId;
taskExecution->taskId = task->taskId;
taskExecution->nodeCount = nodeCount;
taskExecution->connectPollCount = 0;
taskExecution->currentNodeIndex = 0;
taskExecution->dataFetchTaskIndex = -1;
taskExecution->failureCount = 0;
taskExecution->taskStatusArray = palloc0(nodeCount * sizeof(TaskExecStatus));
taskExecution->transmitStatusArray = palloc0(nodeCount * sizeof(TransmitExecStatus));
taskExecution->connectionIdArray = palloc0(nodeCount * sizeof(int32));
taskExecution->fileDescriptorArray = palloc0(nodeCount * sizeof(int32));
for (nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
{
taskExecution->taskStatusArray[nodeIndex] = initialTaskExecStatus;
taskExecution->transmitStatusArray[nodeIndex] = EXEC_TRANSMIT_UNASSIGNED;
taskExecution->connectionIdArray[nodeIndex] = INVALID_CONNECTION_ID;
taskExecution->fileDescriptorArray[nodeIndex] = -1;
}
return taskExecution;
}
/*
* CleanupTaskExecution iterates over all connections and file descriptors for
* the given task execution. The function first closes all open connections and
* file descriptors, and then frees memory allocated for the task execution.
*/
void
CleanupTaskExecution(TaskExecution *taskExecution)
{
uint32 nodeIndex = 0;
for (nodeIndex = 0; nodeIndex < taskExecution->nodeCount; nodeIndex++)
{
int32 connectionId = taskExecution->connectionIdArray[nodeIndex];
int32 fileDescriptor = taskExecution->fileDescriptorArray[nodeIndex];
/* close open connection */
if (connectionId != INVALID_CONNECTION_ID)
{
MultiClientDisconnect(connectionId);
taskExecution->connectionIdArray[nodeIndex] = INVALID_CONNECTION_ID;
}
/* close open file */
if (fileDescriptor >= 0)
{
int closed = close(fileDescriptor);
taskExecution->fileDescriptorArray[nodeIndex] = -1;
if (closed < 0)
{
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not close copy file: %m")));
}
}
}
/* deallocate memory and reset all fields */
pfree(taskExecution->taskStatusArray);
pfree(taskExecution->transmitStatusArray);
pfree(taskExecution->connectionIdArray);
pfree(taskExecution->fileDescriptorArray);
memset(taskExecution, 0, sizeof(TaskExecution));
}
/* Determines if the given task exceeded its failure threshold. */
bool
TaskExecutionFailed(TaskExecution *taskExecution)
{
if (taskExecution->failureCount >= MAX_TASK_EXECUTION_FAILURES)
{
return true;
}
return false;
}
/*
* AdjustStateForFailure increments the failure count for given task execution.
* The function also determines the next worker node that should be contacted
* for remote execution.
*/
void
AdjustStateForFailure(TaskExecution *taskExecution)
{
int maxNodeIndex = taskExecution->nodeCount - 1;
Assert(maxNodeIndex >= 0);
if (taskExecution->currentNodeIndex < maxNodeIndex)
{
taskExecution->currentNodeIndex++; /* try next worker node */
}
else
{
taskExecution->currentNodeIndex = 0; /* go back to the first worker node */
}
taskExecution->dataFetchTaskIndex = -1; /* reset data fetch counter */
taskExecution->failureCount++; /* record failure */
}
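/*
* For example, with nodeCount = 3 a repeatedly failing task walks its
* placements as 0 -> 1 -> 2 -> 0 -> ..., wrapping around until the failure
* count reaches MAX_TASK_EXECUTION_FAILURES and TaskExecutionFailed() above
* reports the task as failed.
*/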

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,234 @@
/*-------------------------------------------------------------------------
*
* master_create_shards.c
*
* This file contains functions to distribute a table by creating shards for it
* across a set of worker nodes.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include "libpq-fe.h"
#include "miscadmin.h"
#include "port.h"
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/errno.h>
#include "catalog/namespace.h"
#include "catalog/pg_class.h"
#include "distributed/connection_cache.h"
#include "distributed/listutils.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/master_protocol.h"
#include "distributed/multi_join_order.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/resource_lock.h"
#include "distributed/worker_manager.h"
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/lsyscache.h"
#include "utils/palloc.h"
/* local function forward declarations */
static void CheckHashPartitionedTable(Oid distributedTableId);
static text * IntegerToText(int32 value);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(master_create_worker_shards);
/*
* master_create_worker_shards creates empty shards for the given table based
* on the specified number of initial shards. The function first gets a list of
* candidate nodes and issues DDL commands on the nodes to create empty shard
* placements on those nodes. The function then updates metadata on the master
* node to make these shards (and their placements) visible. Note that the function
* assumes the table is hash partitioned and calculates the min/max hash token
* ranges for each shard, giving them an equal split of the hash space.
*/
Datum
master_create_worker_shards(PG_FUNCTION_ARGS)
{
text *tableNameText = PG_GETARG_TEXT_P(0);
int32 shardCount = PG_GETARG_INT32(1);
int32 replicationFactor = PG_GETARG_INT32(2);
Oid distributedTableId = ResolveRelationId(tableNameText);
char relationKind = get_rel_relkind(distributedTableId);
char *tableName = text_to_cstring(tableNameText);
char shardStorageType = '\0';
List *workerNodeList = NIL;
List *ddlCommandList = NIL;
int32 workerNodeCount = 0;
uint32 placementAttemptCount = 0;
uint64 hashTokenIncrement = 0;
List *existingShardList = NIL;
int64 shardIndex = 0;
/* make sure table is hash partitioned */
CheckHashPartitionedTable(distributedTableId);
/* we plan to add shards: get an exclusive metadata lock */
LockRelationDistributionMetadata(distributedTableId, ExclusiveLock);
/* validate that shards haven't already been created for this table */
existingShardList = LoadShardList(distributedTableId);
if (existingShardList != NIL)
{
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("table \"%s\" has already had shards created for it",
tableName)));
}
/* make sure that at least one shard is specified */
if (shardCount <= 0)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("shard_count must be positive")));
}
/* make sure that at least one replica is specified */
if (replicationFactor <= 0)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("replication_factor must be positive")));
}
/* calculate the split of the hash space */
hashTokenIncrement = HASH_TOKEN_COUNT / shardCount;
/* load and sort the worker node list for deterministic placement */
workerNodeList = WorkerNodeList();
workerNodeList = SortList(workerNodeList, CompareWorkerNodes);
/* make sure we don't process cancel signals until all shards are created */
HOLD_INTERRUPTS();
/* retrieve the DDL commands for the table */
ddlCommandList = GetTableDDLEvents(distributedTableId);
workerNodeCount = list_length(workerNodeList);
if (replicationFactor > workerNodeCount)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("replication_factor (%d) exceeds number of worker nodes "
"(%d)", replicationFactor, workerNodeCount),
errhint("Add more worker nodes or try again with a lower "
"replication factor.")));
}
/* if we have enough nodes, add an extra placement attempt for backup */
placementAttemptCount = (uint32) replicationFactor;
if (workerNodeCount > replicationFactor)
{
placementAttemptCount++;
}
/* set shard storage type according to relation type */
if (relationKind == RELKIND_FOREIGN_TABLE)
{
shardStorageType = SHARD_STORAGE_FOREIGN;
}
else
{
shardStorageType = SHARD_STORAGE_TABLE;
}
for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
{
uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;
/* initialize the hash token space for this shard */
text *minHashTokenText = NULL;
text *maxHashTokenText = NULL;
int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement);
int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);
Datum shardIdDatum = master_get_new_shardid(NULL);
int64 shardId = DatumGetInt64(shardIdDatum);
/* if we are at the last shard, make sure the max token value is INT32_MAX */
if (shardIndex == (shardCount - 1))
{
shardMaxHashToken = INT32_MAX;
}
/* insert the shard metadata row along with its min/max values */
minHashTokenText = IntegerToText(shardMinHashToken);
maxHashTokenText = IntegerToText(shardMaxHashToken);
/*
* Grabbing the shard metadata lock isn't technically necessary since
* we already hold an exclusive lock on the partition table, but we'll
* acquire it for the sake of completeness. As we're adding new active
* placements, the mode must be exclusive.
*/
LockShardDistributionMetadata(shardId, ExclusiveLock);
CreateShardPlacements(shardId, ddlCommandList, workerNodeList,
roundRobinNodeIndex, replicationFactor);
InsertShardRow(distributedTableId, shardId, shardStorageType,
minHashTokenText, maxHashTokenText);
}
if (QueryCancelPending)
{
ereport(WARNING, (errmsg("cancel requests are ignored during shard creation")));
QueryCancelPending = false;
}
RESUME_INTERRUPTS();
PG_RETURN_VOID();
}
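/*
* A worked example (assuming HASH_TOKEN_COUNT covers the full 2^32 range of
* int32 hash values): with shard_count = 4, hashTokenIncrement is
* 4294967296 / 4 = 1073741824, and the shards receive the hash token ranges
* [-2147483648, -1073741825], [-1073741824, -1], [0, 1073741823], and
* [1073741824, 2147483647]. A typical invocation, with illustrative argument
* values, might look like:
*
* SELECT master_create_worker_shards('github_events', 4, 2);
*/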
/*
* CheckHashPartitionedTable looks up the partition information for the given
* tableId and checks if the table is hash partitioned. If not, the function
* throws an error.
*/
static void
CheckHashPartitionedTable(Oid distributedTableId)
{
char partitionType = PartitionMethod(distributedTableId);
if (partitionType != DISTRIBUTE_BY_HASH)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("unsupported table partition type: %c", partitionType)));
}
}
/* Helper function to convert an integer value to a text type */
static text *
IntegerToText(int32 value)
{
text *valueText = NULL;
StringInfo valueString = makeStringInfo();
appendStringInfo(valueString, "%d", value);
valueText = cstring_to_text(valueString->data);
return valueText;
}

View File

@ -0,0 +1,446 @@
/*-------------------------------------------------------------------------
*
* master_delete_protocol.c
*
* Routine for deleting shards in the distributed cluster. This function takes
* in a delete command and deletes a shard if and only if all rows in the shard
* satisfy the conditions in the delete command.
*
* Copyright (c) 2014, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "catalog/pg_class.h"
#include "commands/dbcommands.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_client_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/worker_protocol.h"
#include "optimizer/clauses.h"
#include "optimizer/predtest.h"
#include "optimizer/restrictinfo.h"
#include "optimizer/var.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
/* Local functions forward declarations */
static void CheckTableCount(Query *deleteQuery);
static void CheckDeleteCriteria(Node *deleteCriteria);
static void CheckPartitionColumn(Oid relationId, Node *whereClause);
static List * ShardsMatchingDeleteCriteria(Oid relationId, List *shardList,
Node *deleteCriteria);
static bool ExecuteRemoteCommand(const char *nodeName, uint32 nodePort,
StringInfo queryString);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_apply_delete_command);
/*
* master_apply_delete_command takes in a delete command, finds shards that
* match the criteria defined in the delete command, drops the found shards from
* the worker nodes, and updates the corresponding metadata on the master node.
* This function drops a shard if and only if all rows in the shard satisfy
* the conditions in the delete command. Note that this function only accepts
* conditions on the partition key; if no condition is provided, all shards
* are deleted.
*
* We mark shard placements that we couldn't drop as to be deleted later. If a
* shard satisfies the given conditions, we delete it from the shard metadata
* table even though related shard placements are not deleted.
*/
Datum
master_apply_delete_command(PG_FUNCTION_ARGS)
{
text *queryText = PG_GETARG_TEXT_P(0);
char *queryString = text_to_cstring(queryText);
char *relationName = NULL;
text *relationNameText = NULL;
Oid relationId = InvalidOid;
List *shardIntervalList = NIL;
ListCell *shardIntervalCell = NULL;
List *deletableShardIntervalList = NIL;
List *queryTreeList = NIL;
Query *deleteQuery = NULL;
Node *whereClause = NULL;
Node *deleteCriteria = NULL;
Node *queryTreeNode = NULL;
DeleteStmt *deleteStatement = NULL;
int32 deleteCriteriaShardCount = 0;
LOCKTAG lockTag;
bool sessionLock = false;
bool dontWait = false;
char partitionMethod = 0;
queryTreeNode = ParseTreeNode(queryString);
if (!IsA(queryTreeNode, DeleteStmt))
{
ereport(ERROR, (errmsg("query \"%s\" is not a delete statement",
queryString)));
}
deleteStatement = (DeleteStmt *) queryTreeNode;
relationName = deleteStatement->relation->relname;
relationNameText = cstring_to_text(relationName);
relationId = ResolveRelationId(relationNameText);
CheckDistributedTable(relationId);
queryTreeList = pg_analyze_and_rewrite(queryTreeNode, queryString, NULL, 0);
deleteQuery = (Query *) linitial(queryTreeList);
CheckTableCount(deleteQuery);
/* get where clause and flatten it */
whereClause = (Node *) deleteQuery->jointree->quals;
deleteCriteria = eval_const_expressions(NULL, whereClause);
partitionMethod = PartitionMethod(relationId);
if ((partitionMethod == DISTRIBUTE_BY_HASH) && (deleteCriteria != NULL))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot delete from distributed table"),
errdetail("Delete statements on hash-partitioned tables "
"with where clause is not supported")));
}
CheckDeleteCriteria(deleteCriteria);
CheckPartitionColumn(relationId, deleteCriteria);
/* acquire lock */
SET_LOCKTAG_ADVISORY(lockTag, MyDatabaseId, relationId, 0, 0);
LockAcquire(&lockTag, ExclusiveLock, sessionLock, dontWait);
shardIntervalList = LoadShardIntervalList(relationId);
/* drop all shards if where clause is not present */
if (deleteCriteria == NULL)
{
deletableShardIntervalList = shardIntervalList;
ereport(DEBUG2, (errmsg("dropping all shards for \"%s\"", relationName)));
}
else
{
deletableShardIntervalList = ShardsMatchingDeleteCriteria(relationId,
shardIntervalList,
deleteCriteria);
}
foreach(shardIntervalCell, deletableShardIntervalList)
{
List *shardPlacementList = NIL;
List *droppedPlacementList = NIL;
List *lingeringPlacementList = NIL;
ListCell *shardPlacementCell = NULL;
ListCell *droppedPlacementCell = NULL;
ListCell *lingeringPlacementCell = NULL;
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
uint64 shardId = shardInterval->shardId;
char *quotedShardName = NULL;
/* if shard doesn't have an alias, extend regular table name */
char *shardName = LoadShardAlias(relationId, shardId);
if (shardName == NULL)
{
shardName = get_rel_name(relationId);
AppendShardIdToName(&shardName, shardId);
}
quotedShardName = quote_qualified_identifier(NULL, shardName);
shardPlacementList = ShardPlacementList(shardId);
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell);
char *workerName = shardPlacement->nodeName;
uint32 workerPort = shardPlacement->nodePort;
bool dropSuccessful = false;
StringInfo workerDropQuery = makeStringInfo();
char tableType = get_rel_relkind(relationId);
if (tableType == RELKIND_RELATION)
{
appendStringInfo(workerDropQuery, DROP_REGULAR_TABLE_COMMAND, quotedShardName);
}
else if (tableType == RELKIND_FOREIGN_TABLE)
{
appendStringInfo(workerDropQuery, DROP_FOREIGN_TABLE_COMMAND, quotedShardName);
}
dropSuccessful = ExecuteRemoteCommand(workerName, workerPort, workerDropQuery);
if (dropSuccessful)
{
droppedPlacementList = lappend(droppedPlacementList, shardPlacement);
}
else
{
lingeringPlacementList = lappend(lingeringPlacementList, shardPlacement);
}
}
/* make sure we don't process cancel signals */
HOLD_INTERRUPTS();
foreach(droppedPlacementCell, droppedPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(droppedPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
DeleteShardPlacementRow(shardId, workerName, workerPort);
}
/* mark shard placements that we couldn't drop as to be deleted */
foreach(lingeringPlacementCell, lingeringPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(lingeringPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
uint64 oldShardLength = placement->shardLength;
DeleteShardPlacementRow(shardId, workerName, workerPort);
InsertShardPlacementRow(shardId, FILE_TO_DELETE, oldShardLength,
workerName, workerPort);
ereport(WARNING, (errmsg("could not delete shard \"%s\" on node "
"\"%s:%u\"", shardName, workerName, workerPort),
errdetail("Marking this shard placement for deletion")));
}
DeleteShardRow(shardId);
if (QueryCancelPending)
{
ereport(WARNING, (errmsg("cancel requests are ignored during shard deletion")));
QueryCancelPending = false;
}
RESUME_INTERRUPTS();
}
deleteCriteriaShardCount = list_length(deletableShardIntervalList);
PG_RETURN_INT32(deleteCriteriaShardCount);
}
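/*
* A typical invocation (with an illustrative table and column name) might be:
*
* SELECT master_apply_delete_command(
* 'DELETE FROM github_events WHERE created_at < ''2014-01-01''');
*
* The call drops every shard whose rows all fall under the criteria and
* returns the number of shards dropped.
*/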
/* Checks that delete is only on one table. */
static void
CheckTableCount(Query *deleteQuery)
{
int rangeTableCount = list_length(deleteQuery->rtable);
if (rangeTableCount > 1)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot delete from distributed table"),
errdetail("Delete on multiple tables is not supported")));
}
}
/* Checks that delete criteria only consists of simple operator expressions. */
static void
CheckDeleteCriteria(Node *deleteCriteria)
{
bool simpleOpExpression = true;
if (deleteCriteria == NULL)
{
return;
}
if (is_opclause(deleteCriteria))
{
simpleOpExpression = SimpleOpExpression((Expr *) deleteCriteria);
}
else if (IsA(deleteCriteria, BoolExpr))
{
ListCell *opExpressionCell = NULL;
BoolExpr *deleteCriteriaExpression = (BoolExpr *) deleteCriteria;
List *opExpressionList = deleteCriteriaExpression->args;
foreach(opExpressionCell, opExpressionList)
{
Expr *opExpression = (Expr *) lfirst(opExpressionCell);
if (!SimpleOpExpression(opExpression))
{
simpleOpExpression = false;
break;
}
}
}
else
{
simpleOpExpression = false;
}
if (!simpleOpExpression)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot delete from distributed table"),
errdetail("Delete query has a complex operator expression")));
}
}
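/*
* For example (illustrative column names; SimpleOpExpression is defined
* elsewhere): criteria such as "key > 10 AND key <= 20" pass this check
* because both arguments of the AND are simple operator expressions, whereas
* criteria such as "key % 2 = 0" or "abs(key) < 10" would be rejected as
* complex operator expressions.
*/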
/*
* CheckPartitionColumn checks that the given where clause is based only on the
* partition key of the given relation id.
*/
static void
CheckPartitionColumn(Oid relationId, Node *whereClause)
{
Var *partitionColumn = PartitionKey(relationId);
ListCell *columnCell = NULL;
List *columnList = pull_var_clause_default(whereClause);
foreach(columnCell, columnList)
{
Var *var = (Var *) lfirst(columnCell);
if (var->varattno != partitionColumn->varattno)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot delete from distributed table"),
errdetail("Where clause includes a column other than "
"partition column")));
}
}
}
/*
* ShardsMatchingDeleteCriteria selects shards to be deleted from the shard
* interval list based on the delete criteria, and returns selected shards in
* another list. We add a shard to the list if and only if all rows in the shard
* satisfy the delete criteria. Note that this function does not expect
* deleteCriteria to be NULL.
*/
static List *
ShardsMatchingDeleteCriteria(Oid relationId, List *shardIntervalList,
Node *deleteCriteria)
{
List *dropShardIntervalList = NIL;
List *deleteCriteriaList = NIL;
ListCell *shardIntervalCell = NULL;
/* build the base expression for constraint */
Index rangeTableIndex = 1;
Var *partitionColumn = PartitionColumn(relationId, rangeTableIndex);
Node *baseConstraint = BuildBaseConstraint(partitionColumn);
Assert(deleteCriteria != NULL);
deleteCriteriaList = list_make1(deleteCriteria);
/* walk over shard list and check if shards can be dropped */
foreach(shardIntervalCell, shardIntervalList)
{
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
if (shardInterval->minValueExists && shardInterval->maxValueExists)
{
List *restrictInfoList = NIL;
bool dropShard = false;
BoolExpr *andExpr = NULL;
Expr *lessThanExpr = NULL;
Expr *greaterThanExpr = NULL;
RestrictInfo *lessThanRestrictInfo = NULL;
RestrictInfo *greaterThanRestrictInfo = NULL;
/* set the min/max values in the base constraint */
UpdateConstraint(baseConstraint, shardInterval);
andExpr = (BoolExpr *) baseConstraint;
lessThanExpr = (Expr *) linitial(andExpr->args);
greaterThanExpr = (Expr *) lsecond(andExpr->args);
lessThanRestrictInfo = make_simple_restrictinfo(lessThanExpr);
greaterThanRestrictInfo = make_simple_restrictinfo(greaterThanExpr);
restrictInfoList = lappend(restrictInfoList, lessThanRestrictInfo);
restrictInfoList = lappend(restrictInfoList, greaterThanRestrictInfo);
dropShard = predicate_implied_by(deleteCriteriaList, restrictInfoList);
if (dropShard)
{
dropShardIntervalList = lappend(dropShardIntervalList, shardInterval);
ereport(DEBUG2, (errmsg("delete criteria includes shardId "
UINT64_FORMAT, shardInterval->shardId)));
}
}
}
return dropShardIntervalList;
}
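/*
* For example, a shard whose interval is [0, 100] yields the base constraint
* "partitionColumn >= 0 AND partitionColumn <= 100". A delete criteria of
* "partitionColumn <= 200" is implied by that constraint, so the shard is
* dropped; a criteria of "partitionColumn <= 50" is not implied (rows in
* (50, 100] may violate it), so the shard is kept.
*/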
/*
* ExecuteRemoteCommand executes the given SQL command. This command could be an
* Insert, Update, or Delete statement, or a utility command that returns
* nothing. If the query is successfully executed, the function returns true.
* Otherwise, it returns false.
*/
static bool
ExecuteRemoteCommand(const char *nodeName, uint32 nodePort, StringInfo queryString)
{
char *nodeDatabase = get_database_name(MyDatabaseId);
int32 connectionId = -1;
QueryStatus queryStatus = CLIENT_INVALID_QUERY;
bool querySent = false;
bool queryReady = false;
bool queryDone = false;
connectionId = MultiClientConnect(nodeName, nodePort, nodeDatabase);
if (connectionId == INVALID_CONNECTION_ID)
{
return false;
}
querySent = MultiClientSendQuery(connectionId, queryString->data);
if (!querySent)
{
MultiClientDisconnect(connectionId);
return false;
}
while (!queryReady)
{
ResultStatus resultStatus = MultiClientResultStatus(connectionId);
if (resultStatus == CLIENT_RESULT_READY)
{
queryReady = true;
}
else if (resultStatus == CLIENT_RESULT_BUSY)
{
long sleepIntervalPerCycle = RemoteTaskCheckInterval * 1000L;
pg_usleep(sleepIntervalPerCycle);
}
else
{
MultiClientDisconnect(connectionId);
return false;
}
}
queryStatus = MultiClientQueryStatus(connectionId);
if (queryStatus == CLIENT_QUERY_DONE)
{
queryDone = true;
}
MultiClientDisconnect(connectionId);
return queryDone;
}
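/*
* With the default RemoteTaskCheckInterval of 100 milliseconds, the polling
* loop above sleeps pg_usleep(100 * 1000L) = 100,000 microseconds between
* consecutive status checks.
*/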

View File

@ -0,0 +1,587 @@
/*-------------------------------------------------------------------------
*
* master_metadata_utility.c
* Routines for reading and modifying master node's metadata.
*
* Copyright (c) 2014, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "access/htup_details.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "catalog/pg_type.h"
#include "distributed/citus_nodes.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/pg_dist_shard_placement.h"
#include "distributed/worker_manager.h"
#include "nodes/makefuncs.h"
#include "parser/scansup.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
/* Local functions forward declarations */
static uint64 * AllocateUint64(uint64 value);
/*
* LoadShardIntervalList returns a list of shard intervals for a given
* distributed table. The function returns an empty list if no shards can be
* found for the given relation.
*/
List *
LoadShardIntervalList(Oid relationId)
{
DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(relationId);
List *shardList = NIL;
int i = 0;
for (i = 0; i < cacheEntry->shardIntervalArrayLength; i++)
{
ShardInterval *newShardInterval = NULL;
newShardInterval = (ShardInterval *) palloc0(sizeof(ShardInterval));
CopyShardInterval(&cacheEntry->shardIntervalArray[i], newShardInterval);
shardList = lappend(shardList, newShardInterval);
}
return shardList;
}
/*
* LoadShardList reads the list of shards for the given relationId from
* pg_dist_shard, and returns the list of found shardIds.
*/
List *
LoadShardList(Oid relationId)
{
DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(relationId);
List *shardList = NIL;
int i = 0;
for (i = 0; i < cacheEntry->shardIntervalArrayLength; i++)
{
ShardInterval *currentShardInterval = &cacheEntry->shardIntervalArray[i];
uint64 *shardIdPointer = AllocateUint64(currentShardInterval->shardId);
shardList = lappend(shardList, shardIdPointer);
}
return shardList;
}
/* Allocates eight bytes, and copies the given value's contents into those bytes. */
static uint64 *
AllocateUint64(uint64 value)
{
uint64 *allocatedValue = (uint64 *) palloc0(sizeof(uint64));
Assert(sizeof(uint64) >= 8);
(*allocatedValue) = value;
return allocatedValue;
}
/*
* LoadShardAlias finds the row for the given relation and shardId in pg_dist_shard,
* finds the shard alias in this row if any, and then deep copies this alias.
*/
char *
LoadShardAlias(Oid relationId, uint64 shardId)
{
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
Datum shardAliasDatum = 0;
bool shardAliasNull = false;
char *shardAlias = NULL;
Relation pgDistShard = heap_open(DistShardRelationId(), AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(pgDistShard);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgDistShard,
DistShardShardidIndexId(), true,
NULL, scanKeyCount, scanKey);
/*
* Normally, we should have at most one tuple here as we have a unique index
* on shardId. However, if users want to drop this uniqueness constraint,
* and look up the shardalias based on the relation and shardId pair, we
* still allow that. We don't have any users relying on this feature; thus,
* we may consider removing this check.
*/
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_dist_shard pgDistShardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
if (pgDistShardForm->logicalrelid == relationId)
{
break;
}
heapTuple = systable_getnext(scanDescriptor);
}
/* if no tuple found, error out */
if (!HeapTupleIsValid(heapTuple))
{
ereport(ERROR, (errmsg("could not find valid entry for relationId: %u "
"and shard " UINT64_FORMAT, relationId, shardId)));
}
/* if shard alias exists, deep copy cstring */
shardAliasDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardalias,
tupleDescriptor, &shardAliasNull);
if (!shardAliasNull)
{
shardAlias = TextDatumGetCString(shardAliasDatum);
}
systable_endscan(scanDescriptor);
heap_close(pgDistShard, AccessShareLock);
return shardAlias;
}
/*
* CopyShardInterval copies fields from the specified source ShardInterval
* into the fields of the provided destination ShardInterval.
*/
void
CopyShardInterval(ShardInterval *srcInterval, ShardInterval *destInterval)
{
destInterval->type = srcInterval->type;
destInterval->relationId = srcInterval->relationId;
destInterval->storageType = srcInterval->storageType;
destInterval->valueTypeId = srcInterval->valueTypeId;
destInterval->valueTypeLen = srcInterval->valueTypeLen;
destInterval->valueByVal = srcInterval->valueByVal;
destInterval->minValueExists = srcInterval->minValueExists;
destInterval->maxValueExists = srcInterval->maxValueExists;
destInterval->shardId = srcInterval->shardId;
destInterval->minValue = 0;
if (destInterval->minValueExists)
{
destInterval->minValue = datumCopy(srcInterval->minValue,
srcInterval->valueByVal,
srcInterval->valueTypeLen);
}
destInterval->maxValue = 0;
if (destInterval->maxValueExists)
{
destInterval->maxValue = datumCopy(srcInterval->maxValue,
srcInterval->valueByVal,
srcInterval->valueTypeLen);
}
}
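/*
* Note on datumCopy semantics: for pass-by-value types (e.g. int4) it simply
* returns the datum itself, while for pass-by-reference types (e.g. text) it
* palloc's a copy of the referenced data. The deep copy lets the returned
* interval outlive the cache entry that owned the source datums.
*/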
/*
* ShardLength finds shard placements for the given shardId, extracts the length
* of a finalized shard, and returns the shard's length. This function errors
* out if we cannot find any finalized shard placements for the given shardId.
*/
uint64
ShardLength(uint64 shardId)
{
uint64 shardLength = 0;
List *shardPlacementList = FinalizedShardPlacementList(shardId);
if (shardPlacementList == NIL)
{
ereport(ERROR, (errmsg("could not find length of shard " UINT64_FORMAT, shardId),
errdetail("Could not find any shard placements for the shard.")));
}
else
{
ShardPlacement *shardPlacement = (ShardPlacement *) linitial(shardPlacementList);
shardLength = shardPlacement->shardLength;
}
return shardLength;
}
/*
* FinalizedShardPlacementList finds shard placements for the given shardId from
* system catalogs, chooses placements that are in finalized state, and returns
* these shard placements in a new list.
*/
List *
FinalizedShardPlacementList(uint64 shardId)
{
List *finalizedPlacementList = NIL;
List *shardPlacementList = ShardPlacementList(shardId);
ListCell *shardPlacementCell = NULL;
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell);
if (shardPlacement->shardState == FILE_FINALIZED)
{
finalizedPlacementList = lappend(finalizedPlacementList, shardPlacement);
}
}
return finalizedPlacementList;
}
/*
* ShardPlacementList finds shard placements for the given shardId from system
* catalogs, converts these placements to their in-memory representation, and
* returns the converted shard placements in a new list.
*/
List *
ShardPlacementList(uint64 shardId)
{
List *shardPlacementList = NIL;
Relation pgShardPlacement = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
bool indexOK = true;
HeapTuple heapTuple = NULL;
pgShardPlacement = heap_open(DistShardPlacementRelationId(), AccessShareLock);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_placement_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgShardPlacement,
DistShardPlacementShardidIndexId(), indexOK,
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
TupleDesc tupleDescriptor = RelationGetDescr(pgShardPlacement);
ShardPlacement *placement = TupleToShardPlacement(tupleDescriptor, heapTuple);
shardPlacementList = lappend(shardPlacementList, placement);
heapTuple = systable_getnext(scanDescriptor);
}
systable_endscan(scanDescriptor);
heap_close(pgShardPlacement, AccessShareLock);
/* if no shard placements are found, warn the user */
if (shardPlacementList == NIL)
{
ereport(WARNING, (errmsg("could not find any shard placements for shardId "
UINT64_FORMAT, shardId)));
}
return shardPlacementList;
}
/*
* TupleToShardPlacement takes in a heap tuple from pg_dist_shard_placement, and
* converts this tuple to an equivalent struct in memory. The function assumes
* the caller already has locks on the tuple, and doesn't perform any locking.
*/
ShardPlacement *
TupleToShardPlacement(TupleDesc tupleDescriptor, HeapTuple heapTuple)
{
ShardPlacement *shardPlacement = NULL;
bool isNull = false;
Oid tupleOid = HeapTupleGetOid(heapTuple);
Datum shardId = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_shardid,
tupleDescriptor, &isNull);
Datum shardLength = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_shardlength,
tupleDescriptor, &isNull);
Datum shardState = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_shardstate,
tupleDescriptor, &isNull);
Datum nodeName = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_nodename,
tupleDescriptor, &isNull);
Datum nodePort = heap_getattr(heapTuple, Anum_pg_dist_shard_placement_nodeport,
tupleDescriptor, &isNull);
Assert(!HeapTupleHasNulls(heapTuple));
shardPlacement = CitusMakeNode(ShardPlacement);
shardPlacement->tupleOid = tupleOid;
shardPlacement->shardId = DatumGetInt64(shardId);
shardPlacement->shardLength = DatumGetInt64(shardLength);
shardPlacement->shardState = DatumGetUInt32(shardState);
shardPlacement->nodeName = TextDatumGetCString(nodeName);
shardPlacement->nodePort = DatumGetUInt32(nodePort);
return shardPlacement;
}
/*
* InsertShardRow opens the shard system catalog, and inserts a new row with the
* given values into that system catalog. Note that we allow the user to pass in
* null min/max values in case they are creating an empty shard.
*/
void
InsertShardRow(Oid relationId, uint64 shardId, char storageType,
text *shardMinValue, text *shardMaxValue)
{
Relation pgDistShard = NULL;
TupleDesc tupleDescriptor = NULL;
HeapTuple heapTuple = NULL;
Datum values[Natts_pg_dist_shard];
bool isNulls[Natts_pg_dist_shard];
/* form new shard tuple */
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
values[Anum_pg_dist_shard_logicalrelid - 1] = ObjectIdGetDatum(relationId);
values[Anum_pg_dist_shard_shardid - 1] = Int64GetDatum(shardId);
values[Anum_pg_dist_shard_shardstorage - 1] = CharGetDatum(storageType);
/* check if shard min/max values are null */
if (shardMinValue != NULL && shardMaxValue != NULL)
{
values[Anum_pg_dist_shard_shardminvalue - 1] = PointerGetDatum(shardMinValue);
values[Anum_pg_dist_shard_shardmaxvalue - 1] = PointerGetDatum(shardMaxValue);
/* we always set shard alias to null */
isNulls[Anum_pg_dist_shard_shardalias - 1] = true;
}
else
{
isNulls[Anum_pg_dist_shard_shardminvalue - 1] = true;
isNulls[Anum_pg_dist_shard_shardmaxvalue - 1] = true;
isNulls[Anum_pg_dist_shard_shardalias - 1] = true;
}
/* open shard relation and insert new tuple */
pgDistShard = heap_open(DistShardRelationId(), RowExclusiveLock);
tupleDescriptor = RelationGetDescr(pgDistShard);
heapTuple = heap_form_tuple(tupleDescriptor, values, isNulls);
simple_heap_insert(pgDistShard, heapTuple);
CatalogUpdateIndexes(pgDistShard, heapTuple);
CommandCounterIncrement();
/* close relation and invalidate previous cache entry */
heap_close(pgDistShard, RowExclusiveLock);
CacheInvalidateRelcacheByRelid(relationId);
}
/*
* InsertShardPlacementRow opens the shard placement system catalog, and inserts
* a new row with the given values into that system catalog.
*/
void
InsertShardPlacementRow(uint64 shardId, char shardState, uint64 shardLength,
char *nodeName, uint32 nodePort)
{
Relation pgDistShardPlacement = NULL;
TupleDesc tupleDescriptor = NULL;
HeapTuple heapTuple = NULL;
Datum values[Natts_pg_dist_shard_placement];
bool isNulls[Natts_pg_dist_shard_placement];
/* form new shard placement tuple */
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
values[Anum_pg_dist_shard_placement_shardid - 1] = Int64GetDatum(shardId);
values[Anum_pg_dist_shard_placement_shardstate - 1] = CharGetDatum(shardState);
values[Anum_pg_dist_shard_placement_shardlength - 1] = Int64GetDatum(shardLength);
values[Anum_pg_dist_shard_placement_nodename - 1] = CStringGetTextDatum(nodeName);
values[Anum_pg_dist_shard_placement_nodeport - 1] = UInt32GetDatum(nodePort);
/* open shard placement relation and insert new tuple */
pgDistShardPlacement = heap_open(DistShardPlacementRelationId(), RowExclusiveLock);
tupleDescriptor = RelationGetDescr(pgDistShardPlacement);
heapTuple = heap_form_tuple(tupleDescriptor, values, isNulls);
simple_heap_insert(pgDistShardPlacement, heapTuple);
CatalogUpdateIndexes(pgDistShardPlacement, heapTuple);
CommandCounterIncrement();
/* close relation */
heap_close(pgDistShardPlacement, RowExclusiveLock);
}
/*
* DeleteShardRow opens the shard system catalog, finds the unique row that has
* the given shardId, and deletes this row.
*/
void
DeleteShardRow(uint64 shardId)
{
Relation pgDistShard = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
bool indexOK = true;
HeapTuple heapTuple = NULL;
Form_pg_dist_shard pgDistShardForm = NULL;
Oid distributedRelationId = InvalidOid;
pgDistShard = heap_open(DistShardRelationId(), RowExclusiveLock);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgDistShard,
DistShardShardidIndexId(), indexOK,
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
if (!HeapTupleIsValid(heapTuple))
{
ereport(ERROR, (errmsg("could not find valid entry for shard "
UINT64_FORMAT, shardId)));
}
pgDistShardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
distributedRelationId = pgDistShardForm->logicalrelid;
simple_heap_delete(pgDistShard, &heapTuple->t_self);
CommandCounterIncrement();
systable_endscan(scanDescriptor);
heap_close(pgDistShard, RowExclusiveLock);
/* invalidate previous cache entry */
CacheInvalidateRelcacheByRelid(distributedRelationId);
}
/*
* DeleteShardPlacementRow opens the shard placement system catalog, finds the
* first (unique) row that corresponds to the given shardId and worker node, and
* deletes this row.
*/
void
DeleteShardPlacementRow(uint64 shardId, char *workerName, uint32 workerPort)
{
Relation pgDistShardPlacement = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
bool indexOK = true;
HeapTuple heapTuple = NULL;
bool heapTupleFound = false;
pgDistShardPlacement = heap_open(DistShardPlacementRelationId(), RowExclusiveLock);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_placement_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgDistShardPlacement,
DistShardPlacementShardidIndexId(), indexOK,
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
TupleDesc tupleDescriptor = RelationGetDescr(pgDistShardPlacement);
ShardPlacement *placement = TupleToShardPlacement(tupleDescriptor, heapTuple);
if (strncmp(placement->nodeName, workerName, WORKER_LENGTH) == 0 &&
placement->nodePort == workerPort)
{
heapTupleFound = true;
break;
}
heapTuple = systable_getnext(scanDescriptor);
}
/* if we couldn't find the shard placement to delete, error out */
if (!heapTupleFound)
{
ereport(ERROR, (errmsg("could not find valid entry for shard placement "
UINT64_FORMAT " on node \"%s:%u\"",
shardId, workerName, workerPort)));
}
simple_heap_delete(pgDistShardPlacement, &heapTuple->t_self);
CommandCounterIncrement();
systable_endscan(scanDescriptor);
heap_close(pgDistShardPlacement, RowExclusiveLock);
}
/*
* BuildDistributionKeyFromColumnName builds a simple distribution key consisting
* only of a reference to the column named columnName. Errors out if the
* specified column does not exist or is not suitable to be used as a
* distribution column.
*/
Node *
BuildDistributionKeyFromColumnName(Relation distributedRelation, char *columnName)
{
HeapTuple columnTuple = NULL;
Form_pg_attribute columnForm = NULL;
Var *column = NULL;
char *tableName = RelationGetRelationName(distributedRelation);
/* it'd probably be better to downcase identifiers consistently with SQL case folding */
truncate_identifier(columnName, strlen(columnName), true);
/* lookup column definition */
columnTuple = SearchSysCacheAttName(RelationGetRelid(distributedRelation),
columnName);
if (!HeapTupleIsValid(columnTuple))
{
ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("column \"%s\" of relation \"%s\" does not exist",
columnName, tableName)));
}
columnForm = (Form_pg_attribute) GETSTRUCT(columnTuple);
/* check if the column may be referenced in the distribution key */
if (columnForm->attnum <= 0)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot reference system column \"%s\" in relation \"%s\"",
columnName, tableName)));
}
/* build Var referencing only the chosen distribution column */
column = makeVar(1, columnForm->attnum, columnForm->atttypid,
columnForm->atttypmod, columnForm->attcollation, 0);
ReleaseSysCache(columnTuple);
return (Node *) column;
}
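/*
* For example (illustrative values): if "repo_id" is the third user column of
* the relation and has type int4, the returned Var has varno = 1,
* varattno = 3, vartype = INT4OID, and varlevelsup = 0.
*/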

View File

@ -0,0 +1,756 @@
/*-------------------------------------------------------------------------
*
* master_node_protocol.c
* Routines for requesting information from the master node for creating or
* updating shards.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "access/htup_details.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_index.h"
#include "catalog/pg_type.h"
#include "commands/sequence.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/listutils.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/worker_manager.h"
#include "foreign/foreign.h"
#include "libpq/ip.h"
#include "libpq/libpq-be.h"
#include "nodes/pg_list.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#if (PG_VERSION_NUM >= 90500)
#include "utils/ruleutils.h"
#endif
#include "utils/syscache.h"
#include "utils/tqual.h"
/* Shard related configuration */
int ShardReplicationFactor = 2; /* desired replication factor for shards */
int ShardMaxSize = 1048576; /* maximum size in KB one shard can grow to */
int ShardPlacementPolicy = SHARD_PLACEMENT_ROUND_ROBIN;
static char * hostname_client_addr(void);
static Datum WorkerNodeGetDatum(WorkerNode *workerNode, TupleDesc tupleDescriptor);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_get_table_metadata);
PG_FUNCTION_INFO_V1(master_get_table_ddl_events);
PG_FUNCTION_INFO_V1(master_get_new_shardid);
PG_FUNCTION_INFO_V1(master_get_local_first_candidate_nodes);
PG_FUNCTION_INFO_V1(master_get_round_robin_candidate_nodes);
PG_FUNCTION_INFO_V1(master_get_active_worker_nodes);
/*
* master_get_table_metadata takes in a relation name, and returns partition
* related metadata for the relation. This metadata is grouped and returned in
* a tuple, and is used by the caller when creating new shards. The function
* errors out if the given relation does not exist, or is not partitioned.
*/
Datum
master_get_table_metadata(PG_FUNCTION_ARGS)
{
text *relationName = PG_GETARG_TEXT_P(0);
Oid relationId = ResolveRelationId(relationName);
DistTableCacheEntry *partitionEntry = NULL;
TypeFuncClass resultTypeClass = 0;
Datum partitionKeyExpr = 0;
Datum partitionKey = 0;
Datum metadataDatum = 0;
HeapTuple metadataTuple = NULL;
TupleDesc metadataDescriptor = NULL;
uint64 shardMaxSizeInBytes = 0;
char relationType = 0;
char storageType = 0;
Datum values[TABLE_METADATA_FIELDS];
bool isNulls[TABLE_METADATA_FIELDS];
/* find partition tuple for partitioned relation */
partitionEntry = DistributedTableCacheEntry(relationId);
/* create tuple descriptor for return value */
resultTypeClass = get_call_result_type(fcinfo, NULL, &metadataDescriptor);
if (resultTypeClass != TYPEFUNC_COMPOSITE)
{
ereport(ERROR, (errmsg("return type must be a row type")));
}
/* get decompiled expression tree for partition key */
partitionKeyExpr =
PointerGetDatum(cstring_to_text(partitionEntry->partitionKeyString));
partitionKey = DirectFunctionCall2(pg_get_expr, partitionKeyExpr,
ObjectIdGetDatum(relationId));
/* form heap tuple for table metadata */
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
shardMaxSizeInBytes = (int64) ShardMaxSize * 1024L;
/* get storage type */
relationType = get_rel_relkind(relationId);
if (relationType == RELKIND_RELATION)
{
storageType = SHARD_STORAGE_TABLE;
}
else if (relationType == RELKIND_FOREIGN_TABLE)
{
bool cstoreTable = CStoreTable(relationId);
if (cstoreTable)
{
storageType = SHARD_STORAGE_COLUMNAR;
}
else
{
storageType = SHARD_STORAGE_FOREIGN;
}
}
values[0] = ObjectIdGetDatum(relationId);
values[1] = storageType;
values[2] = partitionEntry->partitionMethod;
values[3] = partitionKey;
values[4] = Int32GetDatum(ShardReplicationFactor);
values[5] = Int64GetDatum(shardMaxSizeInBytes);
values[6] = Int32GetDatum(ShardPlacementPolicy);
metadataTuple = heap_form_tuple(metadataDescriptor, values, isNulls);
metadataDatum = HeapTupleGetDatum(metadataTuple);
PG_RETURN_DATUM(metadataDatum);
}
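/*
* A typical invocation (with an illustrative table name) might be:
*
* SELECT * FROM master_get_table_metadata('github_events');
*
* which returns a single row containing the relation id, storage type,
* partition method, partition key, replication factor, maximum shard size in
* bytes, and shard placement policy.
*/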
/*
* CStoreTable returns true if the given relationId belongs to a foreign cstore
* table, otherwise it returns false.
*/
bool
CStoreTable(Oid relationId)
{
bool cstoreTable = false;
char relationKind = get_rel_relkind(relationId);
if (relationKind == RELKIND_FOREIGN_TABLE)
{
ForeignTable *foreignTable = GetForeignTable(relationId);
ForeignServer *server = GetForeignServer(foreignTable->serverid);
ForeignDataWrapper *foreignDataWrapper = GetForeignDataWrapper(server->fdwid);
if (strncmp(foreignDataWrapper->fdwname, CSTORE_FDW_NAME, NAMEDATALEN) == 0)
{
cstoreTable = true;
}
}
return cstoreTable;
}
/*
* master_get_table_ddl_events takes in a relation name, and returns the set of
* DDL commands needed to reconstruct the relation. The returned DDL commands
* are similar in flavor to the schema definitions that pg_dump returns. The
* function errors out if the given relation does not exist.
*/
Datum
master_get_table_ddl_events(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
ListCell *tableDDLEventCell = NULL;
/*
* On the very first call to this function, we first use the given relation
* name to get to the relation. We then recreate the list of DDL statements
* issued for this relation, and save the first statement's position in the
* function context.
*/
if (SRF_IS_FIRSTCALL())
{
text *relationName = PG_GETARG_TEXT_P(0);
Oid relationId = ResolveRelationId(relationName);
MemoryContext oldContext = NULL;
List *tableDDLEventList = NIL;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
/* allocate DDL statements, and then save position in DDL statements */
tableDDLEventList = GetTableDDLEvents(relationId);
tableDDLEventCell = list_head(tableDDLEventList);
functionContext->user_fctx = tableDDLEventCell;
MemoryContextSwitchTo(oldContext);
}
/*
* On every call to this function, we get the current position in the
* statement list. We then iterate to the next position in the list and
* return the current statement, if we have not yet reached the end of
* list.
*/
functionContext = SRF_PERCALL_SETUP();
tableDDLEventCell = (ListCell *) functionContext->user_fctx;
if (tableDDLEventCell != NULL)
{
char *ddlStatement = (char *) lfirst(tableDDLEventCell);
text *ddlStatementText = cstring_to_text(ddlStatement);
functionContext->user_fctx = lnext(tableDDLEventCell);
SRF_RETURN_NEXT(functionContext, PointerGetDatum(ddlStatementText));
}
else
{
SRF_RETURN_DONE(functionContext);
}
}
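/*
 * Illustrative usage (a minimal sketch; the table name "github_events" is
 * hypothetical): each returned row is one DDL statement needed to recreate
 * the table.
 *
 *   SELECT master_get_table_ddl_events('github_events');
 */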
/*
* master_get_new_shardid allocates and returns a unique shardId for the shard
* to be created. This allocation occurs both in shared memory and in write
* ahead logs; writing to logs avoids the risk of having shardId collisions.
*
* Please note that the caller is still responsible for finalizing shard data
* and the shardId with the master node. Further note that this function relies
* on an internal sequence created in initdb to generate unique identifiers.
*/
Datum
master_get_new_shardid(PG_FUNCTION_ARGS)
{
text *sequenceName = cstring_to_text(SHARDID_SEQUENCE_NAME);
Oid sequenceId = ResolveRelationId(sequenceName);
Datum sequenceIdDatum = ObjectIdGetDatum(sequenceId);
/* generate new and unique shardId from sequence */
Datum shardIdDatum = DirectFunctionCall1(nextval_oid, sequenceIdDatum);
int64 shardId = DatumGetInt64(shardIdDatum);
PG_RETURN_INT64(shardId);
}
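/*
 * Illustrative usage: each call draws the next value from the internal
 * shardId sequence, so consecutive calls return strictly increasing values.
 *
 *   SELECT master_get_new_shardid();
 */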
/*
* master_get_local_first_candidate_nodes returns a set of candidate host names
* and port numbers on which to place new shards. The function makes sure to
* always allocate the first candidate node as the node the caller is connecting
 * from, and allocates additional nodes until the shard replication factor is
* met. The function errors if the caller's remote node name is not found in the
* membership list, or if the number of available nodes falls short of the
* replication factor.
*/
Datum
master_get_local_first_candidate_nodes(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
uint32 desiredNodeCount = 0;
uint32 currentNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
TupleDesc tupleDescriptor = NULL;
uint32 liveNodeCount = 0;
bool hasOid = false;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
functionContext->user_fctx = NIL;
functionContext->max_calls = ShardReplicationFactor;
/* if enough live nodes, return an extra candidate node as backup */
liveNodeCount = WorkerGetLiveNodeCount();
if (liveNodeCount > ShardReplicationFactor)
{
functionContext->max_calls = ShardReplicationFactor + 1;
}
/*
* This tuple descriptor must match the output parameters declared for
* the function in pg_proc.
*/
tupleDescriptor = CreateTemplateTupleDesc(CANDIDATE_NODE_FIELDS, hasOid);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
TEXTOID, -1, 0);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
INT8OID, -1, 0);
functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
desiredNodeCount = functionContext->max_calls;
currentNodeCount = functionContext->call_cntr;
if (currentNodeCount < desiredNodeCount)
{
MemoryContext oldContext = NULL;
List *currentNodeList = NIL;
WorkerNode *candidateNode = NULL;
Datum candidateDatum = 0;
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
currentNodeList = functionContext->user_fctx;
if (currentNodeCount == 0)
{
/* choose first candidate node to be the client's host */
char *remoteHostname = hostname_client_addr();
/* if hostname is localhost.localdomain, change it to localhost */
int nameCompare = strncmp(remoteHostname, "localhost.localdomain",
WORKER_LENGTH);
if (nameCompare == 0)
{
remoteHostname = pstrdup("localhost");
}
candidateNode = WorkerGetNodeWithName(remoteHostname);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could not find worker node for hostname: %s",
remoteHostname)));
}
}
else
{
/* find a candidate node different from those already selected */
candidateNode = WorkerGetCandidateNode(currentNodeList);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could only find %u of %u required nodes",
currentNodeCount, desiredNodeCount)));
}
}
currentNodeList = lappend(currentNodeList, candidateNode);
functionContext->user_fctx = currentNodeList;
MemoryContextSwitchTo(oldContext);
candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, candidateDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}
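/*
 * Illustrative usage (a sketch; the worker name is hypothetical): when invoked
 * over a TCP connection from a host named "worker-101", the first returned row
 * names "worker-101" itself, followed by other live nodes up to the
 * replication factor.
 *
 *   SELECT * FROM master_get_local_first_candidate_nodes();
 */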
/*
* master_get_round_robin_candidate_nodes returns a set of candidate host names
* and port numbers on which to place new shards. The function uses the round
* robin policy to choose the nodes and tries to ensure that there is an even
* distribution of shards across the worker nodes. This function errors out if
* the number of available nodes falls short of the replication factor.
*/
Datum
master_get_round_robin_candidate_nodes(PG_FUNCTION_ARGS)
{
uint64 shardId = PG_GETARG_INT64(0);
FuncCallContext *functionContext = NULL;
uint32 desiredNodeCount = 0;
uint32 currentNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
TupleDesc tupleDescriptor = NULL;
List *workerNodeList = NIL;
TypeFuncClass resultTypeClass = 0;
uint32 workerNodeCount = 0;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
/* get the worker node list and sort it for determinism */
workerNodeList = WorkerNodeList();
workerNodeList = SortList(workerNodeList, CompareWorkerNodes);
functionContext->user_fctx = workerNodeList;
functionContext->max_calls = ShardReplicationFactor;
		/* if we have enough live nodes, return an extra candidate node as backup */
workerNodeCount = (uint32) list_length(workerNodeList);
if (workerNodeCount > ShardReplicationFactor)
{
functionContext->max_calls = ShardReplicationFactor + 1;
}
/* create tuple descriptor for return value */
resultTypeClass = get_call_result_type(fcinfo, NULL, &tupleDescriptor);
if (resultTypeClass != TYPEFUNC_COMPOSITE)
{
ereport(ERROR, (errmsg("return type must be a row type")));
}
functionContext->tuple_desc = tupleDescriptor;
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
desiredNodeCount = functionContext->max_calls;
currentNodeCount = functionContext->call_cntr;
if (currentNodeCount < desiredNodeCount)
{
List *workerNodeList = functionContext->user_fctx;
WorkerNode *candidateNode = NULL;
Datum candidateDatum = 0;
candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
currentNodeCount);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could only find %u of %u required nodes",
currentNodeCount, desiredNodeCount)));
}
candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, candidateDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}
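/*
 * Illustrative usage (hypothetical shardId): with four sorted worker nodes,
 * candidates for shardId 102042 start at list index (102042 % 4) = 2 and
 * proceed round robin from there.
 *
 *   SELECT * FROM master_get_round_robin_candidate_nodes(102042);
 */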
/*
* master_get_active_worker_nodes returns a set of active worker host names and
* port numbers in deterministic order. Currently we assume that all worker
* nodes in pg_worker_list.conf are active.
*/
Datum
master_get_active_worker_nodes(PG_FUNCTION_ARGS)
{
FuncCallContext *functionContext = NULL;
uint32 workerNodeIndex = 0;
uint32 workerNodeCount = 0;
if (SRF_IS_FIRSTCALL())
{
MemoryContext oldContext = NULL;
List *workerNodeList = NIL;
uint32 workerNodeCount = 0;
TupleDesc tupleDescriptor = NULL;
bool hasOid = false;
/* create a function context for cross-call persistence */
functionContext = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
workerNodeList = WorkerNodeList();
workerNodeCount = (uint32) list_length(workerNodeList);
functionContext->user_fctx = workerNodeList;
functionContext->max_calls = workerNodeCount;
/*
* This tuple descriptor must match the output parameters declared for
* the function in pg_proc.
*/
tupleDescriptor = CreateTemplateTupleDesc(WORKER_NODE_FIELDS, hasOid);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
TEXTOID, -1, 0);
TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
INT8OID, -1, 0);
functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);
MemoryContextSwitchTo(oldContext);
}
functionContext = SRF_PERCALL_SETUP();
workerNodeIndex = functionContext->call_cntr;
workerNodeCount = functionContext->max_calls;
if (workerNodeIndex < workerNodeCount)
{
List *workerNodeList = functionContext->user_fctx;
WorkerNode *workerNode = list_nth(workerNodeList, workerNodeIndex);
Datum workerNodeDatum = WorkerNodeGetDatum(workerNode,
functionContext->tuple_desc);
SRF_RETURN_NEXT(functionContext, workerNodeDatum);
}
else
{
SRF_RETURN_DONE(functionContext);
}
}
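/*
 * Illustrative usage: returns one (node_name, node_port) row per worker node
 * read from the membership file.
 *
 *   SELECT * FROM master_get_active_worker_nodes();
 */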
/* Finds the relationId from a potentially qualified relation name. */
Oid
ResolveRelationId(text *relationName)
{
List *relationNameList = NIL;
RangeVar *relation = NULL;
Oid relationId = InvalidOid;
bool failOK = false; /* error if relation cannot be found */
/* resolve relationId from passed in schema and relation name */
relationNameList = textToQualifiedNameList(relationName);
relation = makeRangeVarFromNameList(relationNameList);
relationId = RangeVarGetRelid(relation, NoLock, failOK);
return relationId;
}
/*
* GetTableDDLEvents takes in a relationId, and returns the list of DDL commands
 * needed to reconstruct the relation. These DDL commands are all palloc'd, and
 * include the table's schema definition, optional column storage and statistics
 * definitions, and index and constraint definitions.
*/
List *
GetTableDDLEvents(Oid relationId)
{
List *tableDDLEventList = NIL;
char tableType = 0;
char *tableSchemaDef = NULL;
char *tableColumnOptionsDef = NULL;
char *schemaName = NULL;
Oid schemaId = InvalidOid;
Relation pgIndex = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
/* if foreign table, fetch extension and server definitions */
tableType = get_rel_relkind(relationId);
if (tableType == RELKIND_FOREIGN_TABLE)
{
char *extensionDef = pg_get_extensiondef_string(relationId);
char *serverDef = pg_get_serverdef_string(relationId);
if (extensionDef != NULL)
{
tableDDLEventList = lappend(tableDDLEventList, extensionDef);
}
tableDDLEventList = lappend(tableDDLEventList, serverDef);
}
/* create schema if the table is not in the default namespace (public) */
schemaId = get_rel_namespace(relationId);
schemaName = get_namespace_name(schemaId);
if (strncmp(schemaName, "public", NAMEDATALEN) != 0)
{
StringInfo schemaNameDef = makeStringInfo();
appendStringInfo(schemaNameDef, CREATE_SCHEMA_COMMAND, schemaName);
tableDDLEventList = lappend(tableDDLEventList, schemaNameDef->data);
}
/* fetch table schema and column option definitions */
tableSchemaDef = pg_get_tableschemadef_string(relationId);
tableColumnOptionsDef = pg_get_tablecolumnoptionsdef_string(relationId);
tableDDLEventList = lappend(tableDDLEventList, tableSchemaDef);
if (tableColumnOptionsDef != NULL)
{
tableDDLEventList = lappend(tableDDLEventList, tableColumnOptionsDef);
}
/* open system catalog and scan all indexes that belong to this table */
pgIndex = heap_open(IndexRelationId, AccessShareLock);
ScanKeyInit(&scanKey[0], Anum_pg_index_indrelid,
BTEqualStrategyNumber, F_OIDEQ, relationId);
scanDescriptor = systable_beginscan(pgIndex,
IndexIndrelidIndexId, true, /* indexOK */
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_index indexForm = (Form_pg_index) GETSTRUCT(heapTuple);
Oid indexId = indexForm->indexrelid;
bool isConstraint = false;
char *statementDef = NULL;
/*
* A primary key index is always created by a constraint statement.
* A unique key index is created by a constraint if and only if the
* index has a corresponding constraint entry in pg_depend. Any other
* index form is never associated with a constraint.
*/
if (indexForm->indisprimary)
{
isConstraint = true;
}
else if (indexForm->indisunique)
{
Oid constraintId = get_index_constraint(indexId);
isConstraint = OidIsValid(constraintId);
}
else
{
isConstraint = false;
}
/* get the corresponding constraint or index statement */
if (isConstraint)
{
Oid constraintId = get_index_constraint(indexId);
Assert(constraintId != InvalidOid);
#if (PG_VERSION_NUM >= 90500)
statementDef = pg_get_constraintdef_command(constraintId);
#else
statementDef = pg_get_constraintdef_string(constraintId);
#endif
}
else
{
statementDef = pg_get_indexdef_string(indexId);
}
/* append found constraint or index definition to the list */
tableDDLEventList = lappend(tableDDLEventList, statementDef);
/* if table is clustered on this index, append definition to the list */
if (indexForm->indisclustered)
{
char *clusteredDef = pg_get_indexclusterdef_string(indexId);
Assert(clusteredDef != NULL);
tableDDLEventList = lappend(tableDDLEventList, clusteredDef);
}
heapTuple = systable_getnext(scanDescriptor);
}
/* clean up scan and close system catalog */
systable_endscan(scanDescriptor);
heap_close(pgIndex, AccessShareLock);
return tableDDLEventList;
}
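/*
 * For illustration (a hypothetical table): for a relation created in the
 * public schema as "CREATE TABLE events (id bigint PRIMARY KEY)", the returned
 * list would contain statements along the lines of:
 *
 *   CREATE TABLE events (id bigint NOT NULL)
 *   ALTER TABLE public.events ADD CONSTRAINT events_pkey PRIMARY KEY (id)
 */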
/*
* hostname_client_addr allocates memory for the connecting client's fully
 * qualified hostname, and returns this name. If there is no such connection,
 * or the connection is over a Unix domain socket, the function errors out.
*/
static char *
hostname_client_addr(void)
{
Port *port = MyProcPort;
char *remoteHost = NULL;
int remoteHostLen = NI_MAXHOST;
int flags = NI_NAMEREQD; /* require fully qualified hostname */
int nameFound = 0;
if (port == NULL)
{
ereport(ERROR, (errmsg("cannot find tcp/ip connection to client")));
}
switch (port->raddr.addr.ss_family)
{
case AF_INET:
#ifdef HAVE_IPV6
case AF_INET6:
#endif
break;
default:
ereport(ERROR, (errmsg("invalid address family in connection")));
break;
}
remoteHost = palloc0(remoteHostLen);
nameFound = pg_getnameinfo_all(&port->raddr.addr, port->raddr.salen,
remoteHost, remoteHostLen, NULL, 0, flags);
if (nameFound != 0)
{
ereport(ERROR, (errmsg("could not resolve client hostname: %s",
gai_strerror(nameFound))));
}
return remoteHost;
}
/*
* WorkerNodeGetDatum converts the worker node passed to it into its datum
* representation. To do this, the function first creates the heap tuple from
* the worker node name and port. Then, the function converts the heap tuple
* into a datum and returns it.
*/
static Datum
WorkerNodeGetDatum(WorkerNode *workerNode, TupleDesc tupleDescriptor)
{
Datum values[WORKER_NODE_FIELDS];
bool isNulls[WORKER_NODE_FIELDS];
HeapTuple workerNodeTuple = NULL;
Datum workerNodeDatum = 0;
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
values[0] = CStringGetTextDatum(workerNode->workerName);
values[1] = Int64GetDatum((int64) workerNode->workerPort);
workerNodeTuple = heap_form_tuple(tupleDescriptor, values, isNulls);
workerNodeDatum = HeapTupleGetDatum(workerNodeTuple);
return workerNodeDatum;
}

View File

@ -0,0 +1,264 @@
/*-------------------------------------------------------------------------
*
* master_repair_shards.c
*
* This file contains functions to repair unhealthy shard placements using data
* from healthy ones.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include "miscadmin.h"
#include <string.h>
#include "catalog/pg_class.h"
#include "distributed/connection_cache.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_router_executor.h"
#include "distributed/resource_lock.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_protocol.h"
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/lsyscache.h"
#include "utils/palloc.h"
/* local function forward declarations */
static ShardPlacement * SearchShardPlacementInList(List *shardPlacementList,
text *nodeName, uint32 nodePort);
static List * RecreateTableDDLCommandList(Oid relationId);
static bool CopyDataFromFinalizedPlacement(Oid distributedTableId, int64 shardId,
ShardPlacement *healthyPlacement,
ShardPlacement *placementToRepair);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(master_copy_shard_placement);
/*
* master_copy_shard_placement implements a user-facing UDF to copy data from
* a healthy (source) node to an inactive (target) node. To accomplish this it
* entirely recreates the table structure before copying all data. During this
 * time all modifications to the shard are paused. After a successful repair, the
* inactive placement is marked healthy and modifications may continue. If the
* repair fails at any point, this function throws an error, leaving the node
* in an unhealthy state.
*/
Datum
master_copy_shard_placement(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
text *sourceNodeName = PG_GETARG_TEXT_P(1);
int32 sourceNodePort = PG_GETARG_INT32(2);
text *targetNodeName = PG_GETARG_TEXT_P(3);
int32 targetNodePort = PG_GETARG_INT32(4);
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid distributedTableId = shardInterval->relationId;
List *shardPlacementList = NIL;
ShardPlacement *sourcePlacement = NULL;
ShardPlacement *targetPlacement = NULL;
WorkerNode *targetNode = NULL;
List *ddlCommandList = NIL;
bool dataCopied = false;
char relationKind = '\0';
/*
* By taking an exclusive lock on the shard, we both stop all modifications
* (INSERT, UPDATE, or DELETE) and prevent concurrent repair operations from
* being able to operate on this shard.
*/
LockShardResource(shardId, ExclusiveLock);
/*
* We've stopped data modifications of this shard, but we plan to move
* a placement to the healthy state, so we need to grab a shard metadata
* lock (in exclusive mode) as well.
*/
LockShardDistributionMetadata(shardId, ExclusiveLock);
shardPlacementList = ShardPlacementList(shardId);
sourcePlacement = SearchShardPlacementInList(shardPlacementList, sourceNodeName,
sourceNodePort);
if (sourcePlacement->shardState != FILE_FINALIZED)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("source placement must be in finalized state")));
}
targetPlacement = SearchShardPlacementInList(shardPlacementList, targetNodeName,
targetNodePort);
if (targetPlacement->shardState != FILE_INACTIVE)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("target placement must be in inactive state")));
}
relationKind = get_rel_relkind(distributedTableId);
if (relationKind == RELKIND_FOREIGN_TABLE)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot repair shard"),
errdetail("Repairing shards backed by foreign tables is "
"not supported.")));
}
targetNode = palloc0(sizeof(WorkerNode));
targetNode->inWorkerFile = true;
strlcpy(targetNode->workerName, targetPlacement->nodeName, WORKER_LENGTH);
targetNode->workerPort = targetPlacement->nodePort;
	/* retrieve DDL commands needed to drop and recreate the table */
ddlCommandList = RecreateTableDDLCommandList(distributedTableId);
/* remove existing (unhealthy) placement row; CreateShardPlacements will recreate */
DeleteShardPlacementRow(targetPlacement->shardId, targetPlacement->nodeName,
targetPlacement->nodePort);
/* finally, drop/recreate remote table and add back row (in healthy state) */
CreateShardPlacements(shardId, ddlCommandList, list_make1(targetNode), 0, 1);
HOLD_INTERRUPTS();
dataCopied = CopyDataFromFinalizedPlacement(distributedTableId, shardId,
sourcePlacement, targetPlacement);
if (!dataCopied)
{
ereport(ERROR, (errmsg("could not copy shard data"),
errhint("Consult recent messages in the server logs for "
"details.")));
}
RESUME_INTERRUPTS();
PG_RETURN_VOID();
}
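/*
 * Illustrative usage (hypothetical shardId, hostnames, and ports): repair the
 * inactive placement of shard 102042 on worker-102 by copying from the healthy
 * placement on worker-101.
 *
 *   SELECT master_copy_shard_placement(102042, 'worker-101', 5432,
 *                                      'worker-102', 5432);
 */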
/*
* SearchShardPlacementInList searches a provided list for a shard placement
* with the specified node name and port. This function throws an error if no
* such placement exists in the provided list.
*/
static ShardPlacement *
SearchShardPlacementInList(List *shardPlacementList, text *nodeNameText, uint32 nodePort)
{
ListCell *shardPlacementCell = NULL;
ShardPlacement *matchingPlacement = NULL;
char *nodeName = text_to_cstring(nodeNameText);
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = lfirst(shardPlacementCell);
if (strncmp(nodeName, shardPlacement->nodeName, MAX_NODE_LENGTH) == 0 &&
nodePort == shardPlacement->nodePort)
{
matchingPlacement = shardPlacement;
break;
}
}
if (matchingPlacement == NULL)
{
ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION),
errmsg("could not find placement matching \"%s:%d\"",
nodeName, nodePort),
errhint("Confirm the placement still exists and try again.")));
}
return matchingPlacement;
}
/*
 * RecreateTableDDLCommandList returns a list of DDL statements similar to the one
 * returned by GetTableDDLEvents, except that the list begins with a "DROP TABLE"
 * or "DROP FOREIGN TABLE" statement to facilitate total recreation of a placement.
*/
static List *
RecreateTableDDLCommandList(Oid relationId)
{
char *relationName = get_rel_name(relationId);
StringInfo dropCommand = makeStringInfo();
List *createCommandList = NIL;
List *dropCommandList = NIL;
List *recreateCommandList = NIL;
char relationKind = get_rel_relkind(relationId);
/* build appropriate DROP command based on relation kind */
if (relationKind == RELKIND_RELATION)
{
appendStringInfo(dropCommand, DROP_REGULAR_TABLE_COMMAND,
quote_identifier(relationName));
}
else if (relationKind == RELKIND_FOREIGN_TABLE)
{
appendStringInfo(dropCommand, DROP_FOREIGN_TABLE_COMMAND,
quote_identifier(relationName));
}
else
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("repair target is not a regular or foreign table")));
}
dropCommandList = list_make1(dropCommand->data);
createCommandList = GetTableDDLEvents(relationId);
recreateCommandList = list_concat(dropCommandList, createCommandList);
return recreateCommandList;
}
/*
 * CopyDataFromFinalizedPlacement copies the data for a shard (identified by
* a relation and shard identifier) from a healthy placement to one needing
* repair. The unhealthy placement must already have an empty relation in place
* to receive rows from the healthy placement. This function returns a boolean
* indicating success or failure.
*/
static bool
CopyDataFromFinalizedPlacement(Oid distributedTableId, int64 shardId,
ShardPlacement *healthyPlacement,
ShardPlacement *placementToRepair)
{
char *relationName = get_rel_name(distributedTableId);
const char *shardName = NULL;
StringInfo copyRelationQuery = makeStringInfo();
List *queryResultList = NIL;
bool copySuccessful = false;
AppendShardIdToName(&relationName, shardId);
shardName = quote_identifier(relationName);
appendStringInfo(copyRelationQuery, WORKER_APPEND_TABLE_TO_SHARD,
quote_literal_cstr(shardName), /* table to append */
quote_literal_cstr(shardName), /* remote table name */
quote_literal_cstr(healthyPlacement->nodeName), /* remote host */
healthyPlacement->nodePort); /* remote port */
queryResultList = ExecuteRemoteQuery(placementToRepair->nodeName,
placementToRepair->nodePort, copyRelationQuery);
if (queryResultList != NIL)
{
copySuccessful = true;
}
return copySuccessful;
}

View File

@ -0,0 +1,550 @@
/*-------------------------------------------------------------------------
*
* master_stage_protocol.c
*
* Routines for staging PostgreSQL table data as shards into the distributed
* cluster. These user-defined functions are similar to the psql-side \stage
 * command, but differ from it in that users stage data from tables rather than
 * files, and in that they can also append to existing shards.
*
* Copyright (c) 2013, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "access/htup_details.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_join_order.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/resource_lock.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_protocol.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "utils/rel.h"
#include "utils/tqual.h"
/* Local functions forward declarations */
static bool WorkerCreateShard(char *nodeName, uint32 nodePort,
uint64 shardId, List *ddlCommandList);
static bool WorkerShardStats(char *nodeName, uint32 nodePort, Oid relationId,
char *shardName, uint64 *shardLength,
text **shardMinValue, text **shardMaxValue);
static uint64 WorkerTableSize(char *nodeName, uint32 nodePort, char *tableName);
static StringInfo WorkerPartitionValue(char *nodeName, uint32 nodePort, Oid relationId,
char *shardName, char *selectQuery);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_create_empty_shard);
PG_FUNCTION_INFO_V1(master_append_table_to_shard);
/*
* master_create_empty_shard creates an empty shard for the given distributed
* table. For this, the function first gets a list of candidate nodes, connects
* to these nodes, and issues DDL commands on the nodes to create empty shard
* placements. The function then updates metadata on the master node to make
* this shard (and its placements) visible.
*/
Datum
master_create_empty_shard(PG_FUNCTION_ARGS)
{
text *relationNameText = PG_GETARG_TEXT_P(0);
char *relationName = text_to_cstring(relationNameText);
Datum shardIdDatum = 0;
int64 shardId = INVALID_SHARD_ID;
	List *ddlEventList = NIL;
uint32 attemptableNodeCount = 0;
uint32 liveNodeCount = 0;
uint32 candidateNodeCount = 0;
List *candidateNodeList = NIL;
text *nullMinValue = NULL;
text *nullMaxValue = NULL;
char tableType = 0;
char partitionMethod = 0;
Oid relationId = ResolveRelationId(relationNameText);
CheckDistributedTable(relationId);
tableType = get_rel_relkind(relationId);
if (tableType != RELKIND_RELATION)
{
ereport(ERROR, (errmsg("relation \"%s\" is not a regular table", relationName)));
}
partitionMethod = PartitionMethod(relationId);
if (partitionMethod == DISTRIBUTE_BY_HASH)
{
ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table",
relationName),
errdetail("We currently don't support creating shards "
"on hash-partitioned tables")));
}
/* generate new and unique shardId from sequence */
shardIdDatum = master_get_new_shardid(NULL);
shardId = DatumGetInt64(shardIdDatum);
/* get table DDL commands to replay on the worker node */
ddlEventList = GetTableDDLEvents(relationId);
/* if enough live nodes, add an extra candidate node as backup */
attemptableNodeCount = ShardReplicationFactor;
liveNodeCount = WorkerGetLiveNodeCount();
if (liveNodeCount > ShardReplicationFactor)
{
attemptableNodeCount = ShardReplicationFactor + 1;
}
/* first retrieve a list of random nodes for shard placements */
while (candidateNodeCount < attemptableNodeCount)
{
WorkerNode *candidateNode = WorkerGetCandidateNode(candidateNodeList);
if (candidateNode == NULL)
{
ereport(ERROR, (errmsg("could only find %u of %u possible nodes",
candidateNodeCount, attemptableNodeCount)));
}
candidateNodeList = lappend(candidateNodeList, candidateNode);
candidateNodeCount++;
}
CreateShardPlacements(shardId, ddlEventList, candidateNodeList, 0,
ShardReplicationFactor);
InsertShardRow(relationId, shardId, SHARD_STORAGE_TABLE, nullMinValue, nullMaxValue);
PG_RETURN_INT64(shardId);
}
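/*
 * Illustrative usage (hypothetical table name): create an empty shard for an
 * append-partitioned table; the newly assigned shardId is returned.
 *
 *   SELECT master_create_empty_shard('github_events');
 */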
/*
* master_append_table_to_shard appends the given table's contents to the given
* shard, and updates shard metadata on the master node. If the function fails
* to append table data to all shard placements, it doesn't update any metadata
* and errors out. Else if the function fails to append table data to some of
* the shard placements, it marks those placements as invalid. These invalid
* placements will get cleaned up during shard rebalancing.
*/
Datum
master_append_table_to_shard(PG_FUNCTION_ARGS)
{
uint64 shardId = PG_GETARG_INT64(0);
text *sourceTableNameText = PG_GETARG_TEXT_P(1);
text *sourceNodeNameText = PG_GETARG_TEXT_P(2);
uint32 sourceNodePort = PG_GETARG_UINT32(3);
char *sourceTableName = text_to_cstring(sourceTableNameText);
char *sourceNodeName = text_to_cstring(sourceNodeNameText);
char *shardName = NULL;
List *shardPlacementList = NIL;
List *succeededPlacementList = NIL;
List *failedPlacementList = NIL;
ListCell *shardPlacementCell = NULL;
ListCell *succeededPlacementCell = NULL;
ListCell *failedPlacementCell = NULL;
bool statsOK = false;
uint64 newShardLength = 0;
uint64 shardMaxSizeInBytes = 0;
float4 shardFillLevel = 0.0;
text *newMinValue = NULL;
text *newMaxValue = NULL;
char partitionMethod = 0;
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid relationId = shardInterval->relationId;
char storageType = shardInterval->storageType;
if (storageType != SHARD_STORAGE_TABLE)
{
ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId),
errdetail("The underlying shard is not a regular table")));
}
partitionMethod = PartitionMethod(relationId);
if (partitionMethod == DISTRIBUTE_BY_HASH)
{
ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId),
errdetail("We currently don't support appending to shards "
"in hash-partitioned tables")));
}
/*
* We lock on the shardId, but do not unlock. When the function returns, and
* the transaction for this function commits, this lock will automatically
* be released. This ensures appends to a shard happen in a serial manner.
*/
LockShardResource(shardId, AccessExclusiveLock);
/* if shard doesn't have an alias, extend regular table name */
shardName = LoadShardAlias(relationId, shardId);
if (shardName == NULL)
{
shardName = get_rel_name(relationId);
AppendShardIdToName(&shardName, shardId);
}
shardPlacementList = FinalizedShardPlacementList(shardId);
if (shardPlacementList == NIL)
{
ereport(ERROR, (errmsg("could not find any shard placements for shardId "
UINT64_FORMAT, shardId),
errhint("Try running master_create_empty_shard() first")));
}
/* issue command to append table to each shard placement */
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell);
char *workerName = shardPlacement->nodeName;
uint32 workerPort = shardPlacement->nodePort;
List *queryResultList = NIL;
StringInfo workerAppendQuery = makeStringInfo();
appendStringInfo(workerAppendQuery, WORKER_APPEND_TABLE_TO_SHARD,
quote_literal_cstr(shardName),
quote_literal_cstr(sourceTableName),
quote_literal_cstr(sourceNodeName), sourceNodePort);
queryResultList = ExecuteRemoteQuery(workerName, workerPort, workerAppendQuery);
if (queryResultList != NIL)
{
succeededPlacementList = lappend(succeededPlacementList, shardPlacement);
}
else
{
failedPlacementList = lappend(failedPlacementList, shardPlacement);
}
}
/* before updating metadata, check that we appended to at least one shard */
if (succeededPlacementList == NIL)
{
ereport(ERROR, (errmsg("could not append table to any shard placement")));
}
/* make sure we don't process cancel signals */
HOLD_INTERRUPTS();
/* mark shard placements that we couldn't append to as inactive */
foreach(failedPlacementCell, failedPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(failedPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
uint64 oldShardLength = placement->shardLength;
DeleteShardPlacementRow(shardId, workerName, workerPort);
InsertShardPlacementRow(shardId, FILE_INACTIVE, oldShardLength,
workerName, workerPort);
ereport(WARNING, (errmsg("could not append table to shard \"%s\" on node "
"\"%s:%u\"", shardName, workerName, workerPort),
errdetail("Marking this shard placement as inactive")));
}
RESUME_INTERRUPTS();
/* get appended shard's statistics from a shard placement */
foreach(succeededPlacementCell, succeededPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(succeededPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
statsOK = WorkerShardStats(workerName, workerPort, relationId, shardName,
&newShardLength, &newMinValue, &newMaxValue);
if (statsOK)
{
break;
}
}
/*
* If for some reason we appended data to a shard, but failed to retrieve
 * statistics, we just WARN here to avoid losing shard-state updates. Note
* that this means we will return 0 as the shard fill-factor, and this shard
* also won't be pruned as the statistics will be empty. If the failure was
* transient, a subsequent append call will fetch the correct statistics.
*/
if (!statsOK)
{
ereport(WARNING, (errmsg("could not get statistics for shard placement"),
errdetail("Setting shard statistics to NULL")));
}
/* make sure we don't process cancel signals */
HOLD_INTERRUPTS();
/* update metadata for each shard placement we appended to */
succeededPlacementCell = NULL;
foreach(succeededPlacementCell, succeededPlacementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(succeededPlacementCell);
char *workerName = placement->nodeName;
uint32 workerPort = placement->nodePort;
DeleteShardPlacementRow(shardId, workerName, workerPort);
InsertShardPlacementRow(shardId, FILE_FINALIZED, newShardLength,
workerName, workerPort);
}
DeleteShardRow(shardId);
InsertShardRow(relationId, shardId, storageType, newMinValue, newMaxValue);
if (QueryCancelPending)
{
ereport(WARNING, (errmsg("cancel requests are ignored during table appends")));
QueryCancelPending = false;
}
RESUME_INTERRUPTS();
/* calculate ratio of current shard size compared to shard max size */
shardMaxSizeInBytes = (int64) ShardMaxSize * 1024L;
shardFillLevel = ((float4) newShardLength / (float4) shardMaxSizeInBytes);
PG_RETURN_FLOAT4(shardFillLevel);
}
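/*
 * Illustrative usage (hypothetical names and shardId): append the rows of the
 * staging table "github_events_staging", located on worker-101, to shard
 * 102042; the return value is the shard's fill level relative to the
 * configured maximum shard size.
 *
 *   SELECT master_append_table_to_shard(102042, 'github_events_staging',
 *                                       'worker-101', 5432);
 */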
/*
* CheckDistributedTable checks if the given relationId corresponds to a
* distributed table. If it does not, the function errors out.
*/
void
CheckDistributedTable(Oid relationId)
{
char *relationName = get_rel_name(relationId);
/* check that the relationId belongs to a table */
char tableType = get_rel_relkind(relationId);
if (!(tableType == RELKIND_RELATION || tableType == RELKIND_FOREIGN_TABLE))
{
ereport(ERROR, (errmsg("relation \"%s\" is not a table", relationName)));
}
if (!IsDistributedTable(relationId))
{
ereport(ERROR, (errmsg("relation \"%s\" is not a distributed table",
relationName)));
}
}
/*
* CreateShardPlacements attempts to create a certain number of placements
* (provided by the replicationFactor argument) on the provided list of worker
* nodes. Beginning at the provided start index, DDL commands are attempted on
* worker nodes (via WorkerCreateShards). If there are more worker nodes than
* required for replication, one remote failure is tolerated. If the provided
* replication factor is not attained, an error is raised (placements remain on
* nodes if some DDL commands had been successful).
*/
void
CreateShardPlacements(int64 shardId, List *ddlEventList, List *workerNodeList,
int workerStartIndex, int replicationFactor)
{
int attemptCount = replicationFactor;
int workerNodeCount = list_length(workerNodeList);
int placementsCreated = 0;
int attemptNumber = 0;
/* if we have enough nodes, add an extra placement attempt for backup */
if (workerNodeCount > replicationFactor)
{
attemptCount++;
}
for (attemptNumber = 0; attemptNumber < attemptCount; attemptNumber++)
{
int workerNodeIndex = (workerStartIndex + attemptNumber) % workerNodeCount;
WorkerNode *workerNode = (WorkerNode *) list_nth(workerNodeList, workerNodeIndex);
char *nodeName = workerNode->workerName;
uint32 nodePort = workerNode->workerPort;
bool created = WorkerCreateShard(nodeName, nodePort, shardId, ddlEventList);
if (created)
{
const RelayFileState shardState = FILE_FINALIZED;
const uint64 shardSize = 0;
InsertShardPlacementRow(shardId, shardState, shardSize, nodeName, nodePort);
placementsCreated++;
}
else
{
ereport(WARNING, (errmsg("could not create shard on \"%s:%u\"",
nodeName, nodePort)));
}
if (placementsCreated >= replicationFactor)
{
break;
}
}
/* check if we created enough shard replicas */
if (placementsCreated < replicationFactor)
{
ereport(ERROR, (errmsg("could only create %u of %u of required shard replicas",
placementsCreated, replicationFactor)));
}
}
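/*
 * Worked example (hypothetical values): with a replication factor of 2 and
 * three worker nodes, up to 3 placement attempts are made; even if the first
 * attempt fails, successful placements on the next two nodes still satisfy
 * the replication factor.
 */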
/*
* WorkerCreateShard applies DDL commands for the given shardId to create the
* shard on the worker node. Note that this function opens a new connection for
 * each DDL command, and could leave the shard in a half-initialized state.
*/
static bool
WorkerCreateShard(char *nodeName, uint32 nodePort,
uint64 shardId, List *ddlCommandList)
{
bool shardCreated = true;
ListCell *ddlCommandCell = NULL;
foreach(ddlCommandCell, ddlCommandList)
{
char *ddlCommand = (char *) lfirst(ddlCommandCell);
char *escapedDDLCommand = quote_literal_cstr(ddlCommand);
List *queryResultList = NIL;
StringInfo applyDDLCommand = makeStringInfo();
appendStringInfo(applyDDLCommand, WORKER_APPLY_SHARD_DDL_COMMAND,
shardId, escapedDDLCommand);
queryResultList = ExecuteRemoteQuery(nodeName, nodePort, applyDDLCommand);
if (queryResultList == NIL)
{
shardCreated = false;
break;
}
}
return shardCreated;
}
/*
* WorkerShardStats queries the worker node, and retrieves shard statistics that
* we assume have changed after new table data have been appended to the shard.
*/
static bool
WorkerShardStats(char *nodeName, uint32 nodePort, Oid relationId, char *shardName,
uint64 *shardLength, text **shardMinValue, text **shardMaxValue)
{
bool shardStatsOK = true;
PG_TRY();
{
uint64 tableSize = WorkerTableSize(nodeName, nodePort, shardName);
StringInfo minValue = WorkerPartitionValue(nodeName, nodePort, relationId,
shardName, SHARD_MIN_VALUE_QUERY);
StringInfo maxValue = WorkerPartitionValue(nodeName, nodePort, relationId,
shardName, SHARD_MAX_VALUE_QUERY);
(*shardLength) = tableSize;
(*shardMinValue) = cstring_to_text_with_len(minValue->data, minValue->len);
(*shardMaxValue) = cstring_to_text_with_len(maxValue->data, maxValue->len);
}
PG_CATCH();
{
shardStatsOK = false;
}
PG_END_TRY();
return shardStatsOK;
}
/*
* WorkerTableSize queries the worker node to extract the disk space used by the
* given relation. The function assumes the relation represents a regular table.
*/
static uint64
WorkerTableSize(char *nodeName, uint32 nodePort, char *tableName)
{
uint64 tableSize = 0;
List *queryResultList = NIL;
StringInfo tableSizeString = NULL;
char *tableSizeStringEnd = NULL;
StringInfo tableSizeQuery = makeStringInfo();
appendStringInfo(tableSizeQuery, SHARD_TABLE_SIZE_QUERY, tableName);
queryResultList = ExecuteRemoteQuery(nodeName, nodePort, tableSizeQuery);
if (queryResultList == NIL)
{
ereport(ERROR, (errmsg("could not receive table size from node "
"\"%s:%u\"", nodeName, nodePort)));
}
tableSizeString = (StringInfo) linitial(queryResultList);
errno = 0;
tableSize = strtoull(tableSizeString->data, &tableSizeStringEnd, 0);
if (errno != 0 || (*tableSizeStringEnd) != '\0')
{
ereport(ERROR, (errmsg("could not extract table size for table \"%s\"",
tableName)));
}
return tableSize;
}
/*
 * WorkerPartitionValue helps in extracting the partition column's min or max value
* from the given shard. For this, the function resolves the given distributed
* relation's partition column, connects to the worker node, and runs a select
* query on the given shard.
*/
static StringInfo
WorkerPartitionValue(char *nodeName, uint32 nodePort, Oid relationId,
char *shardName, char *selectQuery)
{
StringInfo partitionValue = NULL;
List *queryResultList = NIL;
uint32 unusedTableId = 1;
Var *partitionColumn = PartitionColumn(relationId, unusedTableId);
char *partitionColumnName = get_attname(relationId, partitionColumn->varattno);
StringInfo partitionValueQuery = makeStringInfo();
appendStringInfo(partitionValueQuery, selectQuery, partitionColumnName, shardName);
/*
* Note that the following call omits the partition column value's size, and
* simply casts the results to a (char *). If the user partitioned the table
* on a binary byte array, this approach fails and should be fixed.
*/
queryResultList = ExecuteRemoteQuery(nodeName, nodePort, partitionValueQuery);
if (queryResultList == NIL)
{
ereport(ERROR, (errmsg("could not receive shard min/max values from node "
"\"%s:%u\"", nodeName, nodePort)));
}
partitionValue = (StringInfo) linitial(queryResultList);
return partitionValue;
}

View File

@ -0,0 +1,27 @@
# ------------------------------------------
# Citus Database Worker Node Membership List
# ------------------------------------------
#
# This file contains the list of worker node names; these names are used both for
# initializing the worker nodes, and later for communicating with them. Records
# in this file are in the following format:
#
# HOSTNAME [PORT] [RACK]
#
# (The uppercase items must be replaced by actual values.)
#
# HOSTNAME specifies the DNS resolvable host name for the worker node. In test
# environments, localhost may be used to loopback to the current node.
#
# PORT specifies the port number to connect to at the specified host. This value
# is optional; in its absence, the port configuration value is used as the
# default.
#
# RACK specifies the host's network location for the purposes of performing rack
# aware data distribution. This value is optional; in its absence, a generic
# value is used as the default.
# Put your actual configuration here
# ----------------------------------
#
# HOSTNAME [PORT] [RACK]
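#
# For example (the hostnames below are hypothetical), two workers listening on
# the default port in two separate racks would be listed on lines of the form:
#
#   worker-101.example.com 5432 rack-a
#   worker-102.example.com 5432 rack-b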

View File

@ -0,0 +1,807 @@
/*-------------------------------------------------------------------------
*
* worker_node_manager.c
 *   Routines for reading worker nodes from the membership file, and allocating
* candidate nodes for shard placement.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "commands/dbcommands.h"
#include "distributed/worker_manager.h"
#include "distributed/multi_client_executor.h"
#include "libpq/hba.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "utils/guc.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
/* Config variables managed via guc.c */
char *WorkerListFileName; /* location of pg_worker_list.conf */
int MaxWorkerNodesTracked = 2048; /* determines worker node hash table size */
static HTAB *WorkerNodesHash = NULL; /* worker node hash in shared memory */
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
/* Local functions forward declarations */
static bool OddNumber(uint32 number);
static WorkerNode * FindRandomNodeNotInList(HTAB *WorkerNodesHash,
List *currentNodeList);
static bool ListMember(List *currentList, WorkerNode *workerNode);
static Size WorkerNodeShmemSize(void);
static void WorkerNodeShmemAndWorkerListInit(void);
static uint32 WorkerNodeHashCode(const void *key, Size keySize);
static int WorkerNodeCompare(const void *lhsKey, const void *rhsKey, Size keySize);
static List * ParseWorkerNodeFile(const char *workerNodeFilename);
static void ResetWorkerNodesHash(HTAB *WorkerNodesHash);
static bool WorkerNodeResponsive(const char *workerName, uint32 workerPort);
/* ------------------------------------------------------------
* Worker node selection functions follow
* ------------------------------------------------------------
*/
/*
* WorkerGetCandidateNode takes in a list of worker nodes, and then allocates a
* new worker node. The allocation is performed according to the following
* policy: if the list is empty, a random node is allocated; if the list has one
* node (or an odd number of nodes), the new node is allocated on a different
* rack than the first node; and if the list has two nodes (or an even number of
* nodes), the new node is allocated on the same rack as the first node, but is
* different from all the nodes in the list. This node allocation policy ensures
* that shard locality is maintained within a rack, but no single rack failure
* can result in data loss.
*
* Note that the function returns null if the worker membership list does not
* contain enough nodes to allocate a new worker node.
*/
WorkerNode *
WorkerGetCandidateNode(List *currentNodeList)
{
WorkerNode *workerNode = NULL;
bool wantSameRack = false;
uint32 tryCount = WORKER_RACK_TRIES;
uint32 tryIndex = 0;
/*
* We check if the shard has already been placed on all nodes known to us.
* This check is rather defensive, and has the drawback of performing a full
* scan over the worker node hash for determining the number of live nodes.
*/
uint32 currentNodeCount = list_length(currentNodeList);
uint32 liveNodeCount = WorkerGetLiveNodeCount();
if (currentNodeCount >= liveNodeCount)
{
return NULL;
}
/* if current node list is empty, randomly pick one node and return */
if (currentNodeCount == 0)
{
workerNode = FindRandomNodeNotInList(WorkerNodesHash, NIL);
return workerNode;
}
/*
* If the current list has an odd number of nodes (1, 3, 5, etc), we want to
* place the shard on a different rack than the first node's rack.
* Otherwise, we want to place the shard on the same rack as the first node.
*/
if (OddNumber(currentNodeCount))
{
wantSameRack = false;
}
else
{
wantSameRack = true;
}
/*
* We try to find a worker node that fits our rack-aware placement strategy.
* If after a predefined number of tries, we still cannot find such a node,
* we simply give up and return the last worker node we found.
*/
for (tryIndex = 0; tryIndex < tryCount; tryIndex++)
{
WorkerNode *firstNode = (WorkerNode *) linitial(currentNodeList);
char *firstRack = firstNode->workerRack;
char *workerRack = NULL;
bool sameRack = false;
workerNode = FindRandomNodeNotInList(WorkerNodesHash, currentNodeList);
workerRack = workerNode->workerRack;
sameRack = (strncmp(workerRack, firstRack, WORKER_LENGTH) == 0);
if ((sameRack && wantSameRack) || (!sameRack && !wantSameRack))
{
break;
}
}
return workerNode;
}
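/*
 * For illustration (hypothetical racks): with a replication factor of three
 * and a first placement on a node in rack-a, the policy above prefers a
 * different rack (say rack-b) for the second placement and rack-a again for
 * the third, so no single rack holds every replica.
 */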
/*
* WorkerGetRoundRobinCandidateNode takes in a list of worker nodes and returns
* a candidate worker node from that list. To select this node, this function
* uses the round-robin policy. An ideal round-robin implementation requires
* keeping shared state for shard placements; and we instead approximate our
* implementation by relying on the ever-increasing shardId. So, the first
* worker node selected will be the node at the (shardId MOD worker node count)
* index and the remaining candidate nodes will be the next nodes in the list.
*
* Note that the function returns null if the worker membership list does not
* contain enough nodes to place all replicas.
*/
WorkerNode *
WorkerGetRoundRobinCandidateNode(List *workerNodeList, uint64 shardId,
uint32 placementIndex)
{
uint32 workerNodeCount = list_length(workerNodeList);
WorkerNode *candidateNode = NULL;
if (placementIndex < workerNodeCount)
{
uint32 candidateNodeIndex = (shardId + placementIndex) % workerNodeCount;
candidateNode = (WorkerNode *) list_nth(workerNodeList, candidateNodeIndex);
}
return candidateNode;
}
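/*
 * Worked example (hypothetical values): with four worker nodes and shardId 7,
 * placement indexes 0, 1, and 2 map to list positions (7 + 0) % 4 = 3,
 * (7 + 1) % 4 = 0, and (7 + 2) % 4 = 1, respectively.
 */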
/*
* WorkerGetNodeWithName finds and returns a node from the membership list that
* has the given hostname. The function returns null if no such node exists.
*/
WorkerNode *
WorkerGetNodeWithName(const char *hostname)
{
WorkerNode *workerNode = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
while (workerNode != NULL)
{
if (workerNode->inWorkerFile)
{
int nameCompare = strncmp(workerNode->workerName, hostname, WORKER_LENGTH);
if (nameCompare == 0)
{
hash_seq_term(&status);
break;
}
}
workerNode = (WorkerNode *) hash_seq_search(&status);
}
return workerNode;
}
/* Returns the number of live nodes in the cluster. */
uint32
WorkerGetLiveNodeCount(void)
{
WorkerNode *workerNode = NULL;
uint32 liveWorkerCount = 0;
HASH_SEQ_STATUS status;
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
while (workerNode != NULL)
{
if (workerNode->inWorkerFile)
{
liveWorkerCount++;
}
workerNode = (WorkerNode *) hash_seq_search(&status);
}
return liveWorkerCount;
}
/* Inserts the live worker nodes to a list, and returns the list. */
List *
WorkerNodeList(void)
{
List *workerNodeList = NIL;
WorkerNode *workerNode = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
while (workerNode != NULL)
{
if (workerNode->inWorkerFile)
{
workerNodeList = lappend(workerNodeList, workerNode);
}
workerNode = (WorkerNode *) hash_seq_search(&status);
}
return workerNodeList;
}
/*
* WorkerNodeActive looks up a worker node with the given name and port number
* in the current membership list. If such a worker node exists, the function
* returns true.
*/
bool
WorkerNodeActive(const char *nodeName, uint32 nodePort)
{
bool workerNodeActive = false;
bool handleFound = false;
WorkerNode *workerNode = NULL;
void *hashKey = NULL;
WorkerNode *searchedNode = (WorkerNode *) palloc0(sizeof(WorkerNode));
strlcpy(searchedNode->workerName, nodeName, WORKER_LENGTH);
searchedNode->workerPort = nodePort;
hashKey = (void *) searchedNode;
workerNode = (WorkerNode *) hash_search(WorkerNodesHash, hashKey,
HASH_FIND, &handleFound);
if (workerNode != NULL)
{
if (workerNode->inWorkerFile)
{
workerNodeActive = true;
}
}
return workerNodeActive;
}
/* Returns true if given number is odd; returns false otherwise. */
static bool
OddNumber(uint32 number)
{
bool oddNumber = ((number % 2) == 1);
return oddNumber;
}
/*
* FindRandomNodeNotInList finds a random node from the shared hash that is not
* a member of the current node list. The caller is responsible for making the
* necessary node count checks to ensure that such a node exists.
*
* Note that this function has a selection bias towards nodes whose positions in
* the shared hash are sequentially adjacent to the positions of nodes that are
* in the current node list. This bias follows from our decision to first pick a
* random node in the hash, and if that node is a member of the current list, to
 * simply iterate to the next node in the hash. Overall, this approach trades
 * some selection bias for simplicity of design and bounded execution time.
*/
static WorkerNode *
FindRandomNodeNotInList(HTAB *WorkerNodesHash, List *currentNodeList)
{
WorkerNode *workerNode = NULL;
HASH_SEQ_STATUS status;
uint32 workerNodeCount = 0;
uint32 currentNodeCount = 0;
bool lookForWorkerNode = true;
uint32 workerPosition = 0;
uint32 workerIndex = 0;
workerNodeCount = hash_get_num_entries(WorkerNodesHash);
currentNodeCount = list_length(currentNodeList);
Assert(workerNodeCount > currentNodeCount);
/*
* We determine a random position within the worker hash between [1, N],
* assuming that the number of elements in the hash is N. We then get to
* this random position by iterating over the worker hash. Please note that
* the random seed has already been set by the postmaster when starting up.
*/
workerPosition = (random() % workerNodeCount) + 1;
hash_seq_init(&status, WorkerNodesHash);
for (workerIndex = 0; workerIndex < workerPosition; workerIndex++)
{
workerNode = (WorkerNode *) hash_seq_search(&status);
}
while (lookForWorkerNode)
{
bool listMember = ListMember(currentNodeList, workerNode);
if (workerNode->inWorkerFile && !listMember)
{
lookForWorkerNode = false;
}
else
{
/* iterate to the next worker node in the hash */
workerNode = (WorkerNode *) hash_seq_search(&status);
/* reached end of hash; start from the beginning */
if (workerNode == NULL)
{
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
}
}
}
/* we stopped scanning before completion; therefore clean up scan */
hash_seq_term(&status);
return workerNode;
}
/* Checks if given worker node is a member of the current list. */
static bool
ListMember(List *currentList, WorkerNode *workerNode)
{
bool listMember = false;
Size keySize = WORKER_LENGTH + sizeof(uint32);
ListCell *currentCell = NULL;
foreach(currentCell, currentList)
{
WorkerNode *currentNode = (WorkerNode *) lfirst(currentCell);
if (WorkerNodeCompare(workerNode, currentNode, keySize) == 0)
{
listMember = true;
}
}
return listMember;
}
/* ------------------------------------------------------------
* Worker node shared hash functions follow
* ------------------------------------------------------------
*/
/* Requests, at startup, the shared resources needed for worker node management. */
void
WorkerNodeRegister(void)
{
RequestAddinShmemSpace(WorkerNodeShmemSize());
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = WorkerNodeShmemAndWorkerListInit;
}
/* Estimates the shared memory size used for managing worker nodes. */
static Size
WorkerNodeShmemSize(void)
{
Size size = 0;
Size hashSize = 0;
hashSize = hash_estimate_size(MaxWorkerNodesTracked, sizeof(WorkerNode));
size = add_size(size, hashSize);
return size;
}
/* Initializes the shared memory used for managing worker nodes. */
static void
WorkerNodeShmemAndWorkerListInit(void)
{
HASHCTL info;
int hashFlags = 0;
long maxTableSize = 0;
long initTableSize = 0;
maxTableSize = (long) MaxWorkerNodesTracked;
initTableSize = maxTableSize / 8;
/*
* Allocate the control structure for the hash table that maps worker node
* name and port numbers (char[]:uint32) to general node membership and
* health information.
*/
memset(&info, 0, sizeof(info));
info.keysize = WORKER_LENGTH + sizeof(uint32);
info.entrysize = sizeof(WorkerNode);
info.hash = WorkerNodeHashCode;
info.match = WorkerNodeCompare;
hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
WorkerNodesHash = ShmemInitHash("Worker Node Hash",
initTableSize, maxTableSize,
&info, hashFlags);
/*
 * Load the initial contents of the worker node hash table from the
* configuration file.
*/
LoadWorkerNodeList(WorkerListFileName);
if (prev_shmem_startup_hook != NULL)
{
prev_shmem_startup_hook();
}
}
/*
* WorkerNodeHashCode computes the hash code for a worker node from the node's
* host name and port number. Nodes that only differ by their rack locations
* hash to the same value.
*/
static uint32
WorkerNodeHashCode(const void *key, Size keySize)
{
const WorkerNode *worker = (const WorkerNode *) key;
const char *workerName = worker->workerName;
const uint32 *workerPort = &(worker->workerPort);
/* standard hash function outlined in Effective Java, Item 8 */
uint32 result = 17;
result = 37 * result + string_hash(workerName, WORKER_LENGTH);
result = 37 * result + tag_hash(workerPort, sizeof(uint32));
return result;
}
/*
* CompareWorkerNodes compares two pointers to worker nodes using the exact
* same logic employed by WorkerNodeCompare.
*/
int
CompareWorkerNodes(const void *leftElement, const void *rightElement)
{
const void *leftWorker = *((const void **) leftElement);
const void *rightWorker = *((const void **) rightElement);
int compare = 0;
Size ignoredKeySize = 0;
compare = WorkerNodeCompare(leftWorker, rightWorker, ignoredKeySize);
return compare;
}
/*
* WorkerNodeCompare compares two worker nodes by their host name and port
* number. Two nodes that only differ by their rack locations are considered to
* be equal to each other.
*/
static int
WorkerNodeCompare(const void *lhsKey, const void *rhsKey, Size keySize)
{
const WorkerNode *workerLhs = (const WorkerNode *) lhsKey;
const WorkerNode *workerRhs = (const WorkerNode *) rhsKey;
int nameCompare = 0;
int portCompare = 0;
nameCompare = strncmp(workerLhs->workerName, workerRhs->workerName, WORKER_LENGTH);
if (nameCompare != 0)
{
return nameCompare;
}
portCompare = workerLhs->workerPort - workerRhs->workerPort;
return portCompare;
}
/*
 * LoadWorkerNodeList reads and parses the given membership file, and loads worker
 * nodes from this membership file into the shared hash. The function relies on
 * hba.c's tokenization method for parsing, and therefore the membership file
 * has the same syntax as other configuration files such as pg_hba.conf.
*
* Note that this function allows for reloading membership configuration files
* at runtime. When that happens, old worker nodes that do not appear in the
* file are marked as stale, but are still kept in the shared hash.
*/
void
LoadWorkerNodeList(const char *workerFilename)
{
List *workerList = NIL;
ListCell *workerCell = NULL;
uint32 workerCount = 0;
workerList = ParseWorkerNodeFile(workerFilename);
workerCount = list_length(workerList);
if (workerCount > MaxWorkerNodesTracked)
{
ereport(FATAL, (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("worker node count: %u exceeds max allowed value: %d",
workerCount, MaxWorkerNodesTracked)));
}
else
{
ereport(INFO, (errmsg("reading nodes from worker file: %s", workerFilename)));
}
/* before reading file's lines, reset worker node hash */
ResetWorkerNodesHash(WorkerNodesHash);
/* parse file lines */
foreach(workerCell, workerList)
{
WorkerNode *workerNode = NULL;
WorkerNode *parsedNode = lfirst(workerCell);
void *hashKey = NULL;
bool handleFound = false;
/*
* Search for the parsed worker node in the hash, and then insert parsed
* values. When searching, we make the hashKey point to the beginning of
* the parsed node; we previously set the key length and key comparison
* function to include both the node name and the port number.
*/
hashKey = (void *) parsedNode;
workerNode = (WorkerNode *) hash_search(WorkerNodesHash, hashKey,
HASH_ENTER, &handleFound);
if (handleFound)
{
/* display notification if worker node's rack changed */
char *oldWorkerRack = workerNode->workerRack;
char *newWorkerRack = parsedNode->workerRack;
if (strncmp(oldWorkerRack, newWorkerRack, WORKER_LENGTH) != 0)
{
ereport(INFO, (errmsg("worker node: \"%s:%u\" changed rack location",
workerNode->workerName, workerNode->workerPort)));
}
/* display warning if worker node already appeared in this file */
if (workerNode->inWorkerFile)
{
ereport(WARNING, (errmsg("multiple lines for worker node: \"%s:%u\"",
workerNode->workerName,
workerNode->workerPort)));
}
}
strlcpy(workerNode->workerName, parsedNode->workerName, WORKER_LENGTH);
strlcpy(workerNode->workerRack, parsedNode->workerRack, WORKER_LENGTH);
workerNode->workerPort = parsedNode->workerPort;
workerNode->inWorkerFile = parsedNode->inWorkerFile;
pfree(parsedNode);
}
}
/*
* ParseWorkerNodeFile opens the given membership file, and parses the node
* name, node port, and rack fields from each of its lines.
*/
static List *
ParseWorkerNodeFile(const char *workerNodeFilename)
{
FILE *workerFileStream = NULL;
List *workerNodeList = NIL;
char workerNodeLine[MAXPGPATH];
char *workerFilePath = make_absolute_path(workerNodeFilename);
char *workerPatternTemplate = "%%%u[^# \t]%%*[ \t]%%%u[^# \t]%%*[ \t]%%%u[^# \t]";
char workerLinePattern[1024];
const int workerNameIndex = 0;
const int workerPortIndex = 1;
memset(workerLinePattern, '\0', sizeof(workerLinePattern));
workerFileStream = AllocateFile(workerFilePath, PG_BINARY_R);
if (workerFileStream == NULL)
{
if (errno == ENOENT)
{
ereport(DEBUG1, (errmsg("worker list file located at \"%s\" is not present",
workerFilePath)));
}
else
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not open worker list file \"%s\": %m",
workerFilePath)));
}
return NIL;
}
/* build pattern to contain node name length limit */
snprintf(workerLinePattern, sizeof(workerLinePattern), workerPatternTemplate,
WORKER_LENGTH, MAX_PORT_LENGTH, WORKER_LENGTH);
while (fgets(workerNodeLine, sizeof(workerNodeLine), workerFileStream) != NULL)
{
const int workerLineLength = strnlen(workerNodeLine, MAXPGPATH);
WorkerNode *workerNode = NULL;
char *linePointer = NULL;
int32 nodePort = PostPortNumber; /* default port number */
int fieldCount = 0;
bool lineIsInvalid = false;
char nodeName[WORKER_LENGTH + 1];
char nodeRack[WORKER_LENGTH + 1];
char nodePortString[MAX_PORT_LENGTH + 1];
memset(nodeName, '\0', sizeof(nodeName));
strlcpy(nodeRack, WORKER_DEFAULT_RACK, sizeof(nodeRack));
memset(nodePortString, '\0', sizeof(nodePortString));
if (workerLineLength == MAXPGPATH - 1)
{
ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("worker node list file line exceeds the maximum "
"length of %d", MAXPGPATH)));
}
/* trim trailing newlines preserved by fgets, if any */
linePointer = workerNodeLine + workerLineLength - 1;
while (linePointer >= workerNodeLine &&
(*linePointer == '\n' || *linePointer == '\r'))
{
*linePointer-- = '\0';
}
/* skip leading whitespace */
for (linePointer = workerNodeLine; *linePointer; linePointer++)
{
if (!isspace((unsigned char) *linePointer))
{
break;
}
}
/* if the entire line is whitespace or a comment, skip it */
if (*linePointer == '\0' || *linePointer == '#')
{
continue;
}
/* parse line; node name is required, but port and rack are optional */
fieldCount = sscanf(linePointer, workerLinePattern,
nodeName, nodePortString, nodeRack);
/* adjust field count for zero based indexes */
fieldCount--;
/* raise error if no fields were assigned */
if (fieldCount < workerNameIndex)
{
lineIsInvalid = true;
}
/* no special treatment for nodeName: already parsed by sscanf */
/* if a second token was specified, convert to integer port */
if (fieldCount >= workerPortIndex)
{
char *nodePortEnd = NULL;
errno = 0;
nodePort = strtol(nodePortString, &nodePortEnd, 10);
if (errno != 0 || (*nodePortEnd) != '\0' || nodePort <= 0)
{
lineIsInvalid = true;
}
}
if (lineIsInvalid)
{
ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("could not parse worker node line: %s",
workerNodeLine),
errhint("Lines in the worker node file must contain a valid "
"node name and, optionally, a positive port number. "
"Comments begin with a '#' character and extend to "
"the end of their line.")));
}
/* allocate worker node structure and set fields */
workerNode = (WorkerNode *) palloc0(sizeof(WorkerNode));
strlcpy(workerNode->workerName, nodeName, WORKER_LENGTH + 1);
strlcpy(workerNode->workerRack, nodeRack, WORKER_LENGTH + 1);
workerNode->workerPort = nodePort;
workerNode->inWorkerFile = true;
workerNodeList = lappend(workerNodeList, workerNode);
}
FreeFile(workerFileStream);
free(workerFilePath);
return workerNodeList;
}
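For reference, a hypothetical membership file this parser accepts: the node name is mandatory, the port defaults to PostPortNumber when omitted, the rack defaults to WORKER_DEFAULT_RACK, and '#' starts a comment that runs to the end of the line.

# hostname [port] [rack]
worker-101                 # default port and rack
worker-102  5433           # explicit port, default rack
worker-103  5433  rack-2   # explicit port and rack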
/* Marks all worker nodes in the shared hash as stale. */
static void
ResetWorkerNodesHash(HTAB *WorkerNodesHash)
{
WorkerNode *workerNode = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, WorkerNodesHash);
workerNode = (WorkerNode *) hash_seq_search(&status);
while (workerNode != NULL)
{
workerNode->inWorkerFile = false;
workerNode = (WorkerNode *) hash_seq_search(&status);
}
}
/* ResponsiveWorkerNodeList returns a list of all responsive worker nodes */
List *
ResponsiveWorkerNodeList(void)
{
List *responsiveWorkerNodeList = NULL;
ListCell *workerNodeCell = NULL;
List *workerNodeList = WorkerNodeList();
foreach(workerNodeCell, workerNodeList)
{
bool workerNodeResponsive = false;
WorkerNode *workerNode = lfirst(workerNodeCell);
workerNodeResponsive = WorkerNodeResponsive(workerNode->workerName,
workerNode->workerPort);
if (workerNodeResponsive)
{
responsiveWorkerNodeList = lappend(responsiveWorkerNodeList, workerNode);
}
}
return responsiveWorkerNodeList;
}
/*
* WorkerNodeResponsive returns true if the given worker node is responsive.
* Otherwise, it returns false.
*
* This function is based on worker_node_responsive function present in the
* shard rebalancer.
*/
static bool
WorkerNodeResponsive(const char *workerName, uint32 workerPort)
{
bool workerNodeResponsive = false;
const char *databaseName = get_database_name(MyDatabaseId);
int connectionId = MultiClientConnect(workerName, workerPort, databaseName);
if (connectionId != INVALID_CONNECTION_ID)
{
MultiClientDisconnect(connectionId);
workerNodeResponsive = true;
}
return workerNodeResponsive;
}

View File

@ -0,0 +1,649 @@
/*-------------------------------------------------------------------------
*
* modify_planner.c
*
* This file contains functions to plan distributed table modifications.
*
* Copyright (c) 2014-2016, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include <stddef.h>
#if (PG_VERSION_NUM >= 90500 && PG_VERSION_NUM < 90600)
#include "access/stratnum.h"
#else
#include "access/skey.h"
#endif
#include "distributed/citus_nodes.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/modify_planner.h" /* IWYU pragma: keep */
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/listutils.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/relay_utility.h"
#include "distributed/resource_lock.h"
#include "executor/execdesc.h"
#include "lib/stringinfo.h"
#if (PG_VERSION_NUM >= 90500)
#include "nodes/makefuncs.h"
#endif
#include "nodes/nodeFuncs.h"
#include "nodes/nodes.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "optimizer/clauses.h"
#include "parser/parsetree.h"
#include "storage/lock.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/relcache.h"
/* planner functions forward declarations */
static void ErrorIfQueryNotSupported(Query *queryTree);
static Task * DistributedModifyTask(Query *query);
#if (PG_VERSION_NUM >= 90500)
static OnConflictExpr * RebuildOnConflict(Oid relationId,
OnConflictExpr *originalOnConflict);
#endif
static Job * DistributedModifyJob(Query *query, Task *modifyTask);
static List * QueryRestrictList(Query *query);
static ShardInterval * DistributedModifyShardInterval(Query *query);
static Oid ExtractFirstDistributedTableId(Query *query);
static Const * ExtractPartitionValue(Query *query, Var *partitionColumn);
/*
* MultiModifyPlanCreate creates the distributed plan for execution of a
* distributed table modification. It expects that the provided MultiTreeRoot
* is actually a Query object, which it uses directly to produce a MultiPlan.
*/
MultiPlan *
MultiModifyPlanCreate(Query *query)
{
Task *modifyTask = NULL;
Job *modifyJob = NULL;
MultiPlan *multiPlan = NULL;
ErrorIfQueryNotSupported(query);
modifyTask = DistributedModifyTask(query);
modifyJob = DistributedModifyJob(query, modifyTask);
multiPlan = CitusMakeNode(MultiPlan);
multiPlan->workerJob = modifyJob;
multiPlan->masterQuery = NULL;
multiPlan->masterTableName = NULL;
return multiPlan;
}
/*
* ErrorIfQueryNotSupported checks if the query contains unsupported features,
* and errors out if it does.
*/
static void
ErrorIfQueryNotSupported(Query *queryTree)
{
Oid distributedTableId = ExtractFirstDistributedTableId(queryTree);
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
char partitionMethod = PartitionMethod(distributedTableId);
List *rangeTableList = NIL;
ListCell *rangeTableCell = NULL;
bool hasValuesScan = false;
uint32 queryTableCount = 0;
bool hasNonConstTargetEntryExprs = false;
bool hasNonConstQualExprs = false;
bool specifiesPartitionValue = false;
#if (PG_VERSION_NUM >= 90500)
ListCell *setTargetCell = NULL;
List *onConflictSet = NIL;
Node *arbiterWhere = NULL;
Node *onConflictWhere = NULL;
#endif
CmdType commandType = queryTree->commandType;
Assert(commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE);
if (!(partitionMethod == DISTRIBUTE_BY_HASH ||
partitionMethod == DISTRIBUTE_BY_RANGE))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Only hash- or range-partitioned tables may be the "
"target of distributed modifications")));
}
/*
* Reject subqueries which are in SELECT or WHERE clause.
* Queries which include subqueries in FROM clauses are rejected below.
*/
if (queryTree->hasSubLinks == true)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Subqueries are not supported in distributed"
" modifications.")));
}
/* reject queries which include CommonTableExpr */
if (queryTree->cteList != NIL)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Common table expressions are not supported in"
" distributed modifications.")));
}
/* extract range table entries */
ExtractRangeTableEntryWalker((Node *) queryTree, &rangeTableList);
foreach(rangeTableCell, rangeTableList)
{
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);
if (rangeTableEntry->rtekind == RTE_RELATION)
{
queryTableCount++;
}
else if (rangeTableEntry->rtekind == RTE_VALUES)
{
hasValuesScan = true;
}
else
{
/*
* Error out for rangeTableEntries that we do not support.
* We do not explicitly specify "in FROM clause" in the error detail
* for the features that we do not support at all (SUBQUERY, JOIN).
* We do not need to check for RTE_CTE because all common table expressions
* are rejected above with queryTree->cteList check.
*/
char *rangeTableEntryErrorDetail = NULL;
if (rangeTableEntry->rtekind == RTE_SUBQUERY)
{
rangeTableEntryErrorDetail = "Subqueries are not supported in"
" distributed modifications.";
}
else if (rangeTableEntry->rtekind == RTE_JOIN)
{
rangeTableEntryErrorDetail = "Joins are not supported in distributed"
" modifications.";
}
else if (rangeTableEntry->rtekind == RTE_FUNCTION)
{
rangeTableEntryErrorDetail = "Functions must not appear in the FROM"
" clause of a distributed modifications.";
}
else
{
rangeTableEntryErrorDetail = "Unrecognized range table entry.";
}
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modifications"),
errdetail("%s", rangeTableEntryErrorDetail)));
}
}
/*
* Reject queries which involve joins. Note that UPSERTs are an exception:
* a query like "INSERT INTO table_name ... ON CONFLICT (col) DO UPDATE SET
* other_col = ''" contains two range table entries, and we have to allow it.
*/
if (commandType != CMD_INSERT && queryTableCount != 1)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Joins are not supported in distributed "
"modifications.")));
}
/* reject queries which involve multi-row inserts */
if (hasValuesScan)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("Multi-row INSERTs to distributed tables are not "
"supported.")));
}
/* reject queries with a returning list */
if (list_length(queryTree->returningList) > 0)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot perform distributed planning for the given"
" modification"),
errdetail("RETURNING clauses are not supported in distributed "
"modifications.")));
}
if (commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE)
{
FromExpr *joinTree = NULL;
ListCell *targetEntryCell = NULL;
foreach(targetEntryCell, queryTree->targetList)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell);
/* skip resjunk entries: UPDATE adds some for ctid, etc. */
if (targetEntry->resjunk)
{
continue;
}
if (!IsA(targetEntry->expr, Const))
{
hasNonConstTargetEntryExprs = true;
}
if (commandType == CMD_UPDATE &&
targetEntry->resno == partitionColumn->varattno)
{
specifiesPartitionValue = true;
}
}
joinTree = queryTree->jointree;
if (joinTree != NULL && contain_mutable_functions(joinTree->quals))
{
hasNonConstQualExprs = true;
}
}
#if (PG_VERSION_NUM >= 90500)
if (commandType == CMD_INSERT && queryTree->onConflict != NULL)
{
onConflictSet = queryTree->onConflict->onConflictSet;
arbiterWhere = queryTree->onConflict->arbiterWhere;
onConflictWhere = queryTree->onConflict->onConflictWhere;
}
/*
* onConflictSet is expanded via expand_targetlist() by the standard planner.
* This ends up adding all the columns to the onConflictSet even if the user
* does not explicitly state the columns in the query.
*
* The following loop simply allows "DO UPDATE SET part_col = table.part_col"
* types of elements in the target list, which are added by expand_targetlist().
* Any other attempt to update partition column value is forbidden.
*/
foreach(setTargetCell, onConflictSet)
{
TargetEntry *setTargetEntry = (TargetEntry *) lfirst(setTargetCell);
if (setTargetEntry->resno == partitionColumn->varattno)
{
Expr *setExpr = setTargetEntry->expr;
if (IsA(setExpr, Var) &&
((Var *) setExpr)->varattno == partitionColumn->varattno)
{
specifiesPartitionValue = false;
}
else
{
specifiesPartitionValue = true;
}
}
else
{
/*
* Similarly, allow "DO UPDATE SET col_1 = table.col_1" types of
* target list elements. Note that, the following check allows
* "DO UPDATE SET col_1 = table.col_2", which is not harmful.
*/
if (IsA(setTargetEntry->expr, Var))
{
continue;
}
else if (contain_mutable_functions((Node *) setTargetEntry->expr))
{
hasNonConstTargetEntryExprs = true;
}
}
}
/* error if either arbiter or on conflict WHERE contains a mutable function */
if (contain_mutable_functions((Node *) arbiterWhere) ||
contain_mutable_functions((Node *) onConflictWhere))
{
hasNonConstQualExprs = true;
}
#endif
if (hasNonConstTargetEntryExprs || hasNonConstQualExprs)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot plan sharded modification containing values "
"which are not constants or constant expressions")));
}
if (specifiesPartitionValue)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("modifying the partition value of rows is not allowed")));
}
}
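To make the checks above concrete, consider a few hypothetical statements against a table t distributed by column key (all names invented for illustration):

/*
 * UPDATE t SET value = 5 WHERE key = 10;         -- passes: constant target
 *                                                   entry, constant qual
 * UPDATE t SET key = 5 WHERE key = 10;           -- rejected: modifies the
 *                                                   partition column value
 * UPDATE t SET value = random() WHERE key = 10;  -- rejected: non-constant
 *                                                   target entry expression
 * INSERT INTO t VALUES (1, 'a'), (2, 'b');       -- rejected: multi-row
 *                                                   INSERT (VALUES scan)
 * DELETE FROM t USING u WHERE t.key = u.key;     -- rejected: join
 */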
/*
* DistributedModifyTask builds a Task to represent a modification performed by
* the provided query against the provided shard interval. This task contains
* shard-extended deparsed SQL to be run during execution.
*/
static Task *
DistributedModifyTask(Query *query)
{
ShardInterval *shardInterval = DistributedModifyShardInterval(query);
uint64 shardId = shardInterval->shardId;
FromExpr *joinTree = NULL;
StringInfo queryString = makeStringInfo();
Task *modifyTask = NULL;
bool upsertQuery = false;
/* grab shared metadata lock to stop concurrent placement additions */
LockShardDistributionMetadata(shardId, ShareLock);
/*
* Convert the qualifiers to an explicitly and'd clause, which is needed
* before we deparse the query. This applies to SELECT, UPDATE and
* DELETE statements.
*/
joinTree = query->jointree;
if ((joinTree != NULL) && (joinTree->quals != NULL))
{
Node *whereClause = joinTree->quals;
if (IsA(whereClause, List))
{
joinTree->quals = (Node *) make_ands_explicit((List *) whereClause);
}
}
#if (PG_VERSION_NUM >= 90500)
if (query->onConflict != NULL)
{
RangeTblEntry *rangeTableEntry = NULL;
Oid relationId = shardInterval->relationId;
/* set the flag */
upsertQuery = true;
/* setting an alias simplifies deparsing of UPSERTs */
rangeTableEntry = linitial(query->rtable);
if (rangeTableEntry->alias == NULL)
{
Alias *alias = makeAlias(UPSERT_ALIAS, NIL);
rangeTableEntry->alias = alias;
}
/* some fields in the onConflict expression need to be updated for deparsing */
query->onConflict = RebuildOnConflict(relationId, query->onConflict);
}
#else
/* always set to false for PG_VERSION_NUM < 90500 */
upsertQuery = false;
#endif
deparse_shard_query(query, shardInterval->relationId, shardId, queryString);
ereport(DEBUG4, (errmsg("distributed statement: %s", queryString->data)));
modifyTask = CitusMakeNode(Task);
modifyTask->jobId = INVALID_JOB_ID;
modifyTask->taskId = INVALID_TASK_ID;
modifyTask->taskType = MODIFY_TASK;
modifyTask->queryString = queryString->data;
modifyTask->anchorShardId = shardId;
modifyTask->dependedTaskList = NIL;
modifyTask->upsertQuery = upsertQuery;
return modifyTask;
}
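To illustrate the shard extension performed here, a hypothetical example (table name and shard identifier invented; the deparsed form follows the name_shardId convention used throughout this commit):

/*
 * input query:   INSERT INTO github_events (event_id) VALUES (1);
 * anchor shard:  102008
 * task query:    INSERT INTO github_events_102008 (event_id) VALUES (1);
 */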
#if (PG_VERSION_NUM >= 90500)
/*
* RebuildOnConflict rebuilds OnConflictExpr for correct deparsing. The function
* makes WHERE clause elements explicit and filters dropped columns
* from the target list.
*/
static OnConflictExpr *
RebuildOnConflict(Oid relationId, OnConflictExpr *originalOnConflict)
{
OnConflictExpr *updatedOnConflict = copyObject(originalOnConflict);
Node *onConflictWhere = updatedOnConflict->onConflictWhere;
List *onConflictSet = updatedOnConflict->onConflictSet;
TupleDesc distributedRelationDesc = NULL;
ListCell *targetEntryCell = NULL;
List *filteredOnConflictSet = NIL;
Form_pg_attribute *tableAttributes = NULL;
Relation distributedRelation = RelationIdGetRelation(relationId);
/* Convert onConflictWhere qualifiers to an explicitly and'd clause */
updatedOnConflict->onConflictWhere =
(Node *) make_ands_explicit((List *) onConflictWhere);
/*
* Here we handle dropped columns on the distributed table. onConflictSet
* includes the table attributes even if they are dropped, since it is
* expanded via expand_targetlist() by the standard planner.
*/
/* get the relation tuple descriptor and table attributes */
distributedRelationDesc = RelationGetDescr(distributedRelation);
tableAttributes = distributedRelationDesc->attrs;
foreach(targetEntryCell, onConflictSet)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell);
FormData_pg_attribute *tableAttribute = tableAttributes[targetEntry->resno - 1];
/* skip dropped columns */
if (tableAttribute->attisdropped)
{
continue;
}
/* we only want to deparse non-dropped columns */
filteredOnConflictSet = lappend(filteredOnConflictSet, targetEntry);
}
/* close distributedRelation to prevent leaks */
RelationClose(distributedRelation);
/* set onConflictSet again with the filtered list */
updatedOnConflict->onConflictSet = filteredOnConflictSet;
return updatedOnConflict;
}
#endif
/*
* DistributedModifyJob creates a Job for the specified query to execute the
* provided modification task. Modification task placements are produced using
* the "first-replica" algorithm, except modifications run against all matching
* placements rather than just the first successful one.
*/
Job *
DistributedModifyJob(Query *query, Task *modifyTask)
{
Job *modifyJob = NULL;
List *taskList = FirstReplicaAssignTaskList(list_make1(modifyTask));
modifyJob = CitusMakeNode(Job);
modifyJob->dependedJobList = NIL;
modifyJob->jobId = INVALID_JOB_ID;
modifyJob->subqueryPushdown = false;
modifyJob->jobQuery = query;
modifyJob->taskList = taskList;
return modifyJob;
}
/*
* DistributedModifyShardInterval determines the single shard targeted by a
* provided distributed modification command. If no matching shards exist, or
* if the modification targets more than one one shard, this function raises
* an error.
*/
static ShardInterval *
DistributedModifyShardInterval(Query *query)
{
List *restrictClauseList = NIL;
List *prunedShardList = NIL;
Index tableId = 1;
Oid distributedTableId = ExtractFirstDistributedTableId(query);
List *shardIntervalList = NIL;
/* error out if no shards exist for the table */
shardIntervalList = LoadShardIntervalList(distributedTableId);
if (shardIntervalList == NIL)
{
char *relationName = get_rel_name(distributedTableId);
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("could not find any shards for modification"),
errdetail("No shards exist for distributed table \"%s\".",
relationName),
errhint("Run master_create_worker_shards to create shards "
"and try again.")));
}
restrictClauseList = QueryRestrictList(query);
prunedShardList = PruneShardList(distributedTableId, tableId, restrictClauseList,
shardIntervalList);
if (list_length(prunedShardList) != 1)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("distributed modifications must target exactly one "
"shard")));
}
return (ShardInterval *) linitial(prunedShardList);
}
/*
* QueryRestrictList returns the restriction clauses for the query. For UPDATE
* and DELETE statements these are the where-clause expressions. For INSERT
* statements we build an equality clause based on the partition-column and its
* supplied insert value.
*/
static List *
QueryRestrictList(Query *query)
{
List *queryRestrictList = NIL;
CmdType commandType = query->commandType;
if (commandType == CMD_INSERT)
{
/* build equality expression based on partition column value for row */
Oid distributedTableId = ExtractFirstDistributedTableId(query);
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
Const *partitionValue = ExtractPartitionValue(query, partitionColumn);
OpExpr *equalityExpr = MakeOpExpression(partitionColumn, BTEqualStrategyNumber);
Node *rightOp = get_rightop((Expr *) equalityExpr);
Const *rightConst = (Const *) rightOp;
Assert(IsA(rightOp, Const));
rightConst->constvalue = partitionValue->constvalue;
rightConst->constisnull = partitionValue->constisnull;
rightConst->constbyval = partitionValue->constbyval;
queryRestrictList = list_make1(equalityExpr);
}
else if (commandType == CMD_UPDATE || commandType == CMD_DELETE)
{
queryRestrictList = WhereClauseList(query->jointree);
}
return queryRestrictList;
}
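A worked illustration of the two branches above, with a hypothetical table t distributed by column key:

/*
 * INSERT INTO t (key, value) VALUES (5, 'x');
 *     restriction list: { key = 5 }          -- equality synthesized from
 *                                               the partition column value
 *
 * DELETE FROM t WHERE key = 5 AND value = 'x';
 *     restriction list: { key = 5, value = 'x' }  -- the WHERE clause as-is
 */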
/*
* ExtractFirstDistributedTableId takes a given query, and finds the relationId
* for the first distributed table in that query. If the function cannot find a
* distributed table, it returns InvalidOid.
*/
static Oid
ExtractFirstDistributedTableId(Query *query)
{
List *rangeTableList = NIL;
ListCell *rangeTableCell = NULL;
Oid distributedTableId = InvalidOid;
/* extract range table entries */
ExtractRangeTableEntryWalker((Node *) query, &rangeTableList);
foreach(rangeTableCell, rangeTableList)
{
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);
if (IsDistributedTable(rangeTableEntry->relid))
{
distributedTableId = rangeTableEntry->relid;
break;
}
}
return distributedTableId;
}
/*
* ExtractPartitionValue extracts the partition column value from the target
* of a modification command. If a partition value is missing altogether or is
* NULL, this function throws an error.
*/
static Const *
ExtractPartitionValue(Query *query, Var *partitionColumn)
{
Const *partitionValue = NULL;
TargetEntry *targetEntry = get_tle_by_resno(query->targetList,
partitionColumn->varattno);
if (targetEntry != NULL)
{
Assert(IsA(targetEntry->expr, Const));
partitionValue = (Const *) targetEntry->expr;
}
if (partitionValue == NULL || partitionValue->constisnull)
{
ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("cannot plan INSERT using row with NULL value "
"in partition column")));
}
return partitionValue;
}

View File

@ -0,0 +1,108 @@
/*-------------------------------------------------------------------------
*
* multi_explain.c
* CitusDB explain support.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "commands/prepare.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/multi_explain.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_physical_planner.h"
#include "nodes/print.h"
#include "optimizer/planner.h"
#include "tcop/tcopprot.h"
/* Config variables that enable printing distributed query plans */
bool ExplainMultiLogicalPlan = false;
bool ExplainMultiPhysicalPlan = false;
/*
* MultiExplainOneQuery takes the given query, and checks if the query is local
* or distributed. If the query is local, the function runs the standard explain
* logic. If the query is distributed, the function looks up configuration and
* prints out the distributed logical and physical plans as appropriate.
*/
void
MultiExplainOneQuery(Query *query, IntoClause *into, ExplainState *es,
const char *queryString, ParamListInfo params)
{
MultiTreeRoot *multiTree = NULL;
MultiPlan *multiPlan = NULL;
Query *queryCopy = NULL;
CmdType commandType = query->commandType;
/* if local query, run the standard explain and return */
bool localQuery = !NeedsDistributedPlanning(query);
if (localQuery)
{
PlannedStmt *plan = NULL;
instr_time planstart;
instr_time planduration;
INSTR_TIME_SET_CURRENT(planstart);
/* plan the query */
plan = pg_plan_query(query, 0, params);
INSTR_TIME_SET_CURRENT(planduration);
INSTR_TIME_SUBTRACT(planduration, planstart);
/* run it (if needed) and produce output */
ExplainOnePlan(plan, into, es, queryString, params, &planduration);
return;
}
/* error out early if the query is a modification */
if (commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot show execution plan for distributed modification"),
errdetail("EXPLAIN commands are unsupported for distributed "
"modifications.")));
}
/* call standard planner to modify the query structure before multi planning */
standard_planner(query, 0, params);
queryCopy = copyObject(query);
/* create the logical and physical plan */
multiTree = MultiLogicalPlanCreate(queryCopy);
MultiLogicalPlanOptimize(multiTree);
multiPlan = MultiPhysicalPlanCreate(multiTree);
if (ExplainMultiLogicalPlan)
{
char *logicalPlanString = CitusNodeToString(multiTree);
char *formattedPlanString = pretty_format_node_dump(logicalPlanString);
appendStringInfo(es->str, "logical plan:\n");
appendStringInfo(es->str, "%s\n", formattedPlanString);
}
if (ExplainMultiPhysicalPlan)
{
char *physicalPlanString = CitusNodeToString(multiPlan);
char *formattedPlanString = pretty_format_node_dump(physicalPlanString);
appendStringInfo(es->str, "physical plan:\n");
appendStringInfo(es->str, "%s\n", formattedPlanString);
}
/* if explain printing isn't enabled, print error only after planning */
if (!ExplainMultiLogicalPlan && !ExplainMultiPhysicalPlan)
{
appendStringInfo(es->str, "explain statements for distributed queries ");
appendStringInfo(es->str, "are currently unsupported\n");
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,378 @@
/*-------------------------------------------------------------------------
*
* multi_master_planner.c
* Routines for building create table and select into table statements on the
* master node.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "distributed/multi_master_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/worker_protocol.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/planmain.h"
#include "optimizer/tlist.h"
#include "optimizer/var.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/syscache.h"
/*
* MasterTargetList uses the given worker target list's expressions, and creates
* a target list for the master node. This master target list keeps the
* temporary table's columns on the master node.
*/
static List *
MasterTargetList(List *workerTargetList)
{
List *masterTargetList = NIL;
const Index tableId = 1;
AttrNumber columnId = 1;
ListCell *workerTargetCell = NULL;
foreach(workerTargetCell, workerTargetList)
{
TargetEntry *workerTargetEntry = (TargetEntry *) lfirst(workerTargetCell);
TargetEntry *masterTargetEntry = copyObject(workerTargetEntry);
Var *masterColumn = makeVarFromTargetEntry(tableId, workerTargetEntry);
masterColumn->varattno = columnId;
masterColumn->varoattno = columnId;
columnId++;
/*
* The master target entry has two pieces to it. The first piece is the
* target entry's expression, which we set to the newly created column.
* The second piece is sort and group clauses that we implicitly copy
* from the worker target entry. Note that any changes to worker target
* entry's sort and group clauses will *break* us here.
*/
masterTargetEntry->expr = (Expr *) masterColumn;
masterTargetList = lappend(masterTargetList, masterTargetEntry);
}
return masterTargetList;
}
/*
* BuildCreateStatement builds and returns the executable create statement for
* creating a temporary table on the master node. This
* function obtains the needed column type information from the target list.
*/
static CreateStmt *
BuildCreateStatement(char *masterTableName, List *masterTargetList,
List *masterColumnNameList)
{
CreateStmt *createStatement = NULL;
RangeVar *relation = NULL;
char *relationName = NULL;
List *columnTypeList = NIL;
List *columnDefinitionList = NIL;
ListCell *masterTargetCell = NULL;
/* build rangevar object for temporary table */
relationName = masterTableName;
relation = makeRangeVar(NULL, relationName, -1);
relation->relpersistence = RELPERSISTENCE_TEMP;
/* build the list of column types as cstrings */
foreach(masterTargetCell, masterTargetList)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(masterTargetCell);
Var *column = (Var *) targetEntry->expr;
Oid columnTypeId = exprType((Node *) column);
int32 columnTypeMod = exprTypmod((Node *) column);
char *columnTypeName = format_type_with_typemod(columnTypeId, columnTypeMod);
columnTypeList = lappend(columnTypeList, columnTypeName);
}
/* build the column definition list */
columnDefinitionList = ColumnDefinitionList(masterColumnNameList, columnTypeList);
/* build the create statement */
createStatement = CreateStatement(relation, columnDefinitionList);
return createStatement;
}
/*
* BuildAggregatePlan creates and returns an aggregate plan. This aggregate plan
* builds aggregation and grouping operators (if any) that are to be executed on
* the master node.
*/
static Agg *
BuildAggregatePlan(Query *masterQuery, Plan *subPlan)
{
Agg *aggregatePlan = NULL;
AggStrategy aggregateStrategy = AGG_PLAIN;
AggClauseCosts aggregateCosts;
AttrNumber *groupColumnIdArray = NULL;
List *aggregateTargetList = NIL;
List *groupColumnList = NIL;
List *columnList = NIL;
ListCell *columnCell = NULL;
Oid *groupColumnOpArray = NULL;
uint32 groupColumnCount = 0;
const long rowEstimate = 10;
/* assert that we need to build an aggregate plan */
Assert(masterQuery->hasAggs || masterQuery->groupClause);
aggregateTargetList = masterQuery->targetList;
count_agg_clauses(NULL, (Node *) aggregateTargetList, &aggregateCosts);
/*
* For upper level plans above the sequential scan, the planner expects the
* table id (varno) to be set to OUTER_VAR.
*/
columnList = pull_var_clause_default((Node *) aggregateTargetList);
foreach(columnCell, columnList)
{
Var *column = (Var *) lfirst(columnCell);
column->varno = OUTER_VAR;
}
groupColumnList = masterQuery->groupClause;
groupColumnCount = list_length(groupColumnList);
/* if we have grouping, then initialize appropriate information */
if (groupColumnCount > 0)
{
if (!grouping_is_hashable(groupColumnList))
{
ereport(ERROR, (errmsg("grouped column list cannot be hashed")));
}
/* switch to hashed aggregate strategy to allow grouping */
aggregateStrategy = AGG_HASHED;
/* get column indexes that are being grouped */
groupColumnIdArray = extract_grouping_cols(groupColumnList, subPlan->targetlist);
groupColumnOpArray = extract_grouping_ops(groupColumnList);
}
/* finally create the plan */
#if (PG_VERSION_NUM >= 90500)
aggregatePlan = make_agg(NULL, aggregateTargetList, NIL, aggregateStrategy,
&aggregateCosts, groupColumnCount, groupColumnIdArray,
groupColumnOpArray, NIL, rowEstimate, subPlan);
#else
aggregatePlan = make_agg(NULL, aggregateTargetList, NIL, aggregateStrategy,
&aggregateCosts, groupColumnCount, groupColumnIdArray,
groupColumnOpArray, rowEstimate, subPlan);
#endif
return aggregatePlan;
}
/*
* BuildSelectStatement builds the final select statement to run on the master
* node, before returning results to the user. The function first builds a scan
* statement for all results fetched to the master, and layers aggregation, sort
* and limit plans on top of the scan statement if necessary.
*/
static PlannedStmt *
BuildSelectStatement(Query *masterQuery, char *masterTableName,
List *masterTargetList)
{
PlannedStmt *selectStatement = NULL;
RangeTblEntry *rangeTableEntry = NULL;
RangeTblEntry *queryRangeTableEntry = NULL;
SeqScan *sequentialScan = NULL;
Agg *aggregationPlan = NULL;
Plan *topLevelPlan = NULL;
/* (1) make PlannedStmt and set basic information */
selectStatement = makeNode(PlannedStmt);
selectStatement->canSetTag = true;
selectStatement->relationOids = NIL; /* to be filled in exec_Start */
selectStatement->commandType = CMD_SELECT;
/* prepare the range table entry for our temporary table */
Assert(list_length(masterQuery->rtable) == 1);
queryRangeTableEntry = (RangeTblEntry *) linitial(masterQuery->rtable);
rangeTableEntry = copyObject(queryRangeTableEntry);
rangeTableEntry->rtekind = RTE_RELATION;
rangeTableEntry->eref = makeAlias(masterTableName, NIL);
rangeTableEntry->relid = 0; /* to be filled in exec_Start */
rangeTableEntry->inh = false;
rangeTableEntry->inFromCl = true;
/* set the single element range table list */
selectStatement->rtable = list_make1(rangeTableEntry);
/* (2) build and initialize sequential scan node */
sequentialScan = makeNode(SeqScan);
sequentialScan->scanrelid = 1; /* always one */
/* (3) add an aggregation plan if needed */
if (masterQuery->hasAggs || masterQuery->groupClause)
{
sequentialScan->plan.targetlist = masterTargetList;
aggregationPlan = BuildAggregatePlan(masterQuery, (Plan *) sequentialScan);
topLevelPlan = (Plan *) aggregationPlan;
}
else
{
/* otherwise set the final projections on the scan plan directly */
sequentialScan->plan.targetlist = masterQuery->targetList;
topLevelPlan = (Plan *) sequentialScan;
}
/* (4) add a sorting plan if needed */
if (masterQuery->sortClause)
{
List *sortClauseList = masterQuery->sortClause;
Sort *sortPlan = make_sort_from_sortclauses(NULL, sortClauseList, topLevelPlan);
topLevelPlan = (Plan *) sortPlan;
}
/* (5) add a limit plan if needed */
if (masterQuery->limitCount)
{
Node *limitCount = masterQuery->limitCount;
Node *limitOffset = masterQuery->limitOffset;
int64 offsetEstimate = 0;
int64 countEstimate = 0;
Limit *limitPlan = make_limit(topLevelPlan, limitOffset, limitCount,
offsetEstimate, countEstimate);
topLevelPlan = (Plan *) limitPlan;
}
/* (6) finally set our top level plan in the plan tree */
selectStatement->planTree = topLevelPlan;
return selectStatement;
}
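The nodes stack bottom-up, so for a hypothetical master query with grouping, sorting, and a limit, the resulting plan would look roughly like this:

/*
 * SELECT key, count(*) FROM <masterTableName> GROUP BY key ORDER BY 2 LIMIT 5;
 *
 * Limit
 *   -> Sort
 *        -> HashAggregate
 *             -> Seq Scan on <masterTableName>
 */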
/*
* ValueToStringList walks over the given list of string value types, converts
* value types to cstrings, and adds these cstrings into a new list.
*/
static List *
ValueToStringList(List *valueList)
{
List *stringList = NIL;
ListCell *valueCell = NULL;
foreach(valueCell, valueList)
{
Value *value = (Value *) lfirst(valueCell);
char *stringValue = strVal(value);
stringList = lappend(stringList, stringValue);
}
return stringList;
}
/*
* MasterNodeCreateStatement takes in a multi plan, and constructs a statement
* to create a temporary table on the master node for final result
* aggregation.
*/
CreateStmt *
MasterNodeCreateStatement(MultiPlan *multiPlan)
{
Query *masterQuery = multiPlan->masterQuery;
Job *workerJob = multiPlan->workerJob;
List *workerTargetList = workerJob->jobQuery->targetList;
List *rangeTableList = masterQuery->rtable;
char *tableName = multiPlan->masterTableName;
CreateStmt *createStatement = NULL;
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) linitial(rangeTableList);
List *columnNameValueList = rangeTableEntry->eref->colnames;
List *columnNameList = ValueToStringList(columnNameValueList);
List *targetList = MasterTargetList(workerTargetList);
createStatement = BuildCreateStatement(tableName, targetList, columnNameList);
return createStatement;
}
/*
* MasterNodeSelectPlan takes in a distributed plan, finds the master node query
* structure in that plan, and builds the final select plan to execute on the
* master node. Note that this select plan is executed after result files are
* retrieved from worker nodes and are merged into a temporary table.
*/
PlannedStmt *
MasterNodeSelectPlan(MultiPlan *multiPlan)
{
Query *masterQuery = multiPlan->masterQuery;
char *tableName = multiPlan->masterTableName;
PlannedStmt *masterSelectPlan = NULL;
Job *workerJob = multiPlan->workerJob;
List *workerTargetList = workerJob->jobQuery->targetList;
List *masterTargetList = MasterTargetList(workerTargetList);
masterSelectPlan = BuildSelectStatement(masterQuery, tableName, masterTargetList);
return masterSelectPlan;
}
/*
* MasterNodeCopyStatementList takes in a multi plan, and constructs
* statements that copy over worker task results to a temporary table on the
* master node.
*/
List *
MasterNodeCopyStatementList(MultiPlan *multiPlan)
{
Job *workerJob = multiPlan->workerJob;
List *workerTaskList = workerJob->taskList;
char *tableName = multiPlan->masterTableName;
List *copyStatementList = NIL;
ListCell *workerTaskCell = NULL;
foreach(workerTaskCell, workerTaskList)
{
Task *workerTask = (Task *) lfirst(workerTaskCell);
StringInfo jobDirectoryName = JobDirectoryName(workerTask->jobId);
StringInfo taskFilename = TaskFilename(jobDirectoryName, workerTask->taskId);
RangeVar *relation = makeRangeVar(NULL, tableName, -1);
CopyStmt *copyStatement = makeNode(CopyStmt);
copyStatement->relation = relation;
copyStatement->is_from = true;
copyStatement->filename = taskFilename->data;
if (BinaryMasterCopyFormat)
{
DefElem *copyOption = makeDefElem("format", (Node *) makeString("binary"));
copyStatement->options = list_make1(copyOption);
}
else
{
copyStatement->options = NIL;
}
copyStatementList = lappend(copyStatementList, copyStatement);
}
return copyStatementList;
}
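Each generated statement is equivalent to the form below (paths and names depend on JobDirectoryName and TaskFilename; shown schematically):

/*
 * COPY <masterTableName> FROM '<jobDirectory>/<taskFilename>' WITH (format 'binary')
 *
 * The format option is attached only when BinaryMasterCopyFormat is set;
 * otherwise the statement uses COPY's default text format.
 */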

File diff suppressed because it is too large

View File

@ -0,0 +1,294 @@
/*-------------------------------------------------------------------------
*
* multi_planner.c
* General CitusDB planner code.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <limits.h>
#include "catalog/pg_type.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/citus_nodes.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/modify_planner.h"
#include "executor/executor.h"
#include "optimizer/planner.h"
#include "utils/memutils.h"
/* local function forward declarations */
static void CheckNodeIsDumpable(Node *node);
static MultiPlan * CreatePhysicalPlan(Query *parse);
static char * GetMultiPlanString(PlannedStmt *result);
static PlannedStmt * MultiQueryContainerNode(PlannedStmt *result, MultiPlan *multiPlan);
/* Distributed planner hook */
PlannedStmt *
multi_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
{
PlannedStmt *result = NULL;
/*
* First call into standard planner. This is required because the CitusDB
* planner relies on parse tree transformations made by postgres' planner.
*/
result = standard_planner(parse, cursorOptions, boundParams);
if (NeedsDistributedPlanning(parse))
{
MemoryContext oldcontext = NULL;
MultiPlan *physicalPlan = NULL;
/* Switch to top level message context */
oldcontext = MemoryContextSwitchTo(MessageContext);
physicalPlan = CreatePhysicalPlan(parse);
/* store required data into the planned statement */
result = MultiQueryContainerNode(result, physicalPlan);
/* Now switch back to original context */
MemoryContextSwitchTo(oldcontext);
}
return result;
}
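This hook only takes effect once it is installed; a minimal sketch of the conventional registration, assuming the actual _PG_init of this extension lives elsewhere and may differ:

void
_PG_init(void)
{
	/*
	 * Install the distributed planner; multi_planner itself falls back to
	 * standard_planner for queries needing no distributed planning.
	 */
	planner_hook = multi_planner;
}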
/*
* CreatePhysicalPlan encapsulates the logic needed to transform a particular
* query into a physical plan. For modifications, queries immediately enter
* the physical planning stage, since they are essentially "routed" to remote
* target shards. SELECT queries go through the full logical plan/optimize/
* physical plan process needed to produce distributed query plans.
*/
static MultiPlan *
CreatePhysicalPlan(Query *parse)
{
Query *parseCopy = copyObject(parse);
MultiPlan *physicalPlan = NULL;
CmdType commandType = parse->commandType;
if (commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE)
{
/* modifications go directly from a query to a physical plan */
physicalPlan = MultiModifyPlanCreate(parse);
}
else
{
/* Create and optimize logical plan */
MultiTreeRoot *logicalPlan = MultiLogicalPlanCreate(parseCopy);
MultiLogicalPlanOptimize(logicalPlan);
/*
* This check is here to make it likely that all node types used in
* CitusDB are dumpable. Explain can dump logical and physical plans
* using the extended outfuncs infrastructure, but it's infeasible to
* test most plans. MultiQueryContainerNode always serializes the
* physical plan, so there's no need to check that separately.
*/
CheckNodeIsDumpable((Node *) logicalPlan);
/* Create the physical plan */
physicalPlan = MultiPhysicalPlanCreate(logicalPlan);
}
return physicalPlan;
}
/*
* GetMultiPlan returns the associated MultiPlan for a PlannedStmt if the
* statement requires distributed execution, NULL otherwise.
*/
MultiPlan *
GetMultiPlan(PlannedStmt *result)
{
char *serializedMultiPlan = NULL;
MultiPlan *multiPlan = NULL;
serializedMultiPlan = GetMultiPlanString(result);
multiPlan = (MultiPlan *) CitusStringToNode(serializedMultiPlan);
Assert(CitusIsA(multiPlan, MultiPlan));
return multiPlan;
}
/* Does the passed in statement require distributed execution? */
bool
HasCitusToplevelNode(PlannedStmt *result)
{
/*
* Can't be a distributed query if the extension hasn't been loaded
* yet. Directly return false, part of the required infrastructure for
* further checks might not be present.
*/
if (!CitusDBHasBeenLoaded())
{
return false;
}
if (GetMultiPlanString(result) == NULL)
{
return false;
}
else
{
return true;
}
}
/*
* MultiQueryContainerNode creates the top-level planTree node for a
* distributed statement. That top-level node a) is recognizable by the
* executor hooks, allowing them to redirect execution, and b) contains the
* parameters required for distributed execution.
*
* The exact representation of the top-level node is an implementation detail
* which should not be referred to outside this file, as it's likely to become
* version dependent. Use GetMultiPlan() and HasCitusToplevelNode() to access.
*
* Internally the data is stored as arguments to a 'citusdb_extradata_container'
* function, which has to be removed from the really executed plan tree before
* query execution.
*/
static PlannedStmt *
MultiQueryContainerNode(PlannedStmt *result, MultiPlan *multiPlan)
{
FunctionScan *fauxFunctionScan = NULL;
RangeTblFunction *fauxFunction = NULL;
FuncExpr *fauxFuncExpr = NULL;
Const *multiPlanData = NULL;
char *serializedPlan = NULL;
/* pass multiPlan serialized as a constant function argument */
serializedPlan = CitusNodeToString(multiPlan);
multiPlanData = makeNode(Const);
multiPlanData->consttype = CSTRINGOID;
multiPlanData->constlen = strlen(serializedPlan);
multiPlanData->constvalue = CStringGetDatum(serializedPlan);
multiPlanData->constbyval = false;
multiPlanData->location = -1;
fauxFuncExpr = makeNode(FuncExpr);
fauxFuncExpr->funcid = CitusExtraDataContainerFuncId();
fauxFuncExpr->funcretset = true;
fauxFuncExpr->location = -1;
fauxFuncExpr->args = list_make1(multiPlanData);
fauxFunction = makeNode(RangeTblFunction);
fauxFunction->funcexpr = (Node *) fauxFuncExpr;
fauxFunctionScan = makeNode(FunctionScan);
fauxFunctionScan->functions = lappend(fauxFunctionScan->functions, fauxFunction);
/*
* Add set returning function to target list if the original (postgres
* created) plan doesn't support backward scans; doing so prevents
* backward scans from being supported by the new plantree as well. This is
* ugly as hell, but until we can rely on custom scans (which can signal
* this via CUSTOMPATH_SUPPORT_BACKWARD_SCAN), there's not really a pretty
* method to achieve this.
*
* FIXME: This should really be done on the master select plan.
*/
if (!ExecSupportsBackwardScan(result->planTree))
{
FuncExpr *funcExpr = makeNode(FuncExpr);
funcExpr->funcretset = true;
fauxFunctionScan->scan.plan.targetlist =
lappend(fauxFunctionScan->scan.plan.targetlist,
funcExpr);
}
result->planTree = (Plan *) fauxFunctionScan;
return result;
}
/*
* GetMultiPlanString returns either NULL, if the plan is not a distributed
* one, or the string representing the distributed plan.
*/
static char *
GetMultiPlanString(PlannedStmt *result)
{
FunctionScan *fauxFunctionScan = NULL;
RangeTblFunction *fauxFunction = NULL;
FuncExpr *fauxFuncExpr = NULL;
Const *multiPlanData = NULL;
if (!IsA(result->planTree, FunctionScan))
{
return NULL;
}
fauxFunctionScan = (FunctionScan *) result->planTree;
if (list_length(fauxFunctionScan->functions) != 1)
{
return NULL;
}
fauxFunction = linitial(fauxFunctionScan->functions);
if (!IsA(fauxFunction->funcexpr, FuncExpr))
{
return NULL;
}
fauxFuncExpr = (FuncExpr *) fauxFunction->funcexpr;
if (fauxFuncExpr->funcid != CitusExtraDataContainerFuncId())
{
return NULL;
}
if (list_length(fauxFuncExpr->args) != 1)
{
ereport(ERROR, (errmsg("unexpected number of function arguments to "
"citusdb_extradata_container")));
}
multiPlanData = (Const *) linitial(fauxFuncExpr->args);
Assert(IsA(multiPlanData, Const));
Assert(multiPlanData->consttype == CSTRINGOID);
return DatumGetCString(multiPlanData->constvalue);
}
/*
* CheckNodeIsDumpable checks that the passed node can be dumped using
* CitusNodeToString(). As this check is expensive, it's only active when
* assertions are enabled.
*/
static void
CheckNodeIsDumpable(Node *node)
{
#ifdef USE_ASSERT_CHECKING
char *out = CitusNodeToString(node);
pfree(out);
#endif
}

View File

@ -0,0 +1,511 @@
/*-------------------------------------------------------------------------
*
* relay_event_utility.c
*
* Routines for handling DDL statements that relate to relay files. These
* routines extend relation, index and constraint names in utility commands.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/skey.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_constraint.h"
#include "commands/defrem.h"
#include "distributed/relay_utility.h"
#include "nodes/parsenodes.h"
#include "parser/parse_utilcmd.h"
#include "storage/lock.h"
#include "tcop/utility.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/tqual.h"
/* Local functions forward declarations */
static bool TypeAddIndexConstraint(const AlterTableCmd *command);
static bool TypeDropIndexConstraint(const AlterTableCmd *command,
const RangeVar *relation, uint64 shardId);
static void AppendShardIdToConstraintName(AlterTableCmd *command, uint64 shardId);
/*
* RelayEventExtendNames extends relation names in the given parse tree for
* certain utility commands. The function more specifically extends table,
* sequence, and index names in the parse tree by appending the given shardId;
* thereby avoiding name collisions in the database among sharded tables. This
* function has the side effect of extending relation names in the parse tree.
*/
void
RelayEventExtendNames(Node *parseTree, uint64 shardId)
{
/* we don't extend names in extension or schema commands */
NodeTag nodeType = nodeTag(parseTree);
if (nodeType == T_CreateExtensionStmt || nodeType == T_CreateSchemaStmt)
{
return;
}
switch (nodeType)
{
case T_AlterSeqStmt:
{
AlterSeqStmt *alterSeqStmt = (AlterSeqStmt *) parseTree;
char **sequenceName = &(alterSeqStmt->sequence->relname);
AppendShardIdToName(sequenceName, shardId);
break;
}
case T_AlterTableStmt:
{
/*
* We append shardId to the very end of table, sequence and index
* names to avoid name collisions. We usually do not touch
* constraint names, except for cases where they refer to index
* names. In those cases, we also append to constraint names.
*/
AlterTableStmt *alterTableStmt = (AlterTableStmt *) parseTree;
char **relationName = &(alterTableStmt->relation->relname);
RangeVar *relation = alterTableStmt->relation; /* for constraints */
List *commandList = alterTableStmt->cmds;
ListCell *commandCell = NULL;
/* first append shardId to base relation name */
AppendShardIdToName(relationName, shardId);
foreach(commandCell, commandList)
{
AlterTableCmd *command = (AlterTableCmd *) lfirst(commandCell);
if (TypeAddIndexConstraint(command) ||
TypeDropIndexConstraint(command, relation, shardId))
{
AppendShardIdToConstraintName(command, shardId);
}
else if (command->subtype == AT_ClusterOn)
{
char **indexName = &(command->name);
AppendShardIdToName(indexName, shardId);
}
}
break;
}
case T_ClusterStmt:
{
ClusterStmt *clusterStmt = (ClusterStmt *) parseTree;
char **relationName = NULL;
/* we do not support clustering the entire database */
if (clusterStmt->relation == NULL)
{
ereport(ERROR, (errmsg("cannot extend name for multi-relation cluster")));
}
relationName = &(clusterStmt->relation->relname);
AppendShardIdToName(relationName, shardId);
if (clusterStmt->indexname != NULL)
{
char **indexName = &(clusterStmt->indexname);
AppendShardIdToName(indexName, shardId);
}
break;
}
case T_CreateSeqStmt:
{
CreateSeqStmt *createSeqStmt = (CreateSeqStmt *) parseTree;
char **sequenceName = &(createSeqStmt->sequence->relname);
AppendShardIdToName(sequenceName, shardId);
break;
}
case T_CreateForeignServerStmt:
{
CreateForeignServerStmt *serverStmt = (CreateForeignServerStmt *) parseTree;
char **serverName = &(serverStmt->servername);
AppendShardIdToName(serverName, shardId);
break;
}
case T_CreateForeignTableStmt:
{
CreateForeignTableStmt *createStmt = (CreateForeignTableStmt *) parseTree;
char **serverName = &(createStmt->servername);
AppendShardIdToName(serverName, shardId);
/*
* Since CreateForeignTableStmt inherits from CreateStmt and any change
* performed on CreateStmt should be done here too, we simply *fall
* through* to avoid code repetition.
*/
}
case T_CreateStmt:
{
CreateStmt *createStmt = (CreateStmt *) parseTree;
char **relationName = &(createStmt->relation->relname);
AppendShardIdToName(relationName, shardId);
break;
}
case T_DropStmt:
{
DropStmt *dropStmt = (DropStmt *) parseTree;
ObjectType objectType = dropStmt->removeType;
if (objectType == OBJECT_TABLE || objectType == OBJECT_SEQUENCE ||
objectType == OBJECT_INDEX || objectType == OBJECT_FOREIGN_TABLE ||
objectType == OBJECT_FOREIGN_SERVER)
{
List *relationNameList = NULL;
int relationNameListLength = 0;
Value *relationNameValue = NULL;
char **relationName = NULL;
uint32 dropCount = list_length(dropStmt->objects);
if (dropCount > 1)
{
ereport(ERROR,
(errmsg("cannot extend name for multiple drop objects")));
}
/*
* We now need to extend a single relation, sequence or index
* name. To be able to do this extension, we need to extract the
* names' addresses from the value objects they are stored in.
* Otherwise, the repalloc called in AppendShardIdToName() will
* not have the correct memory address for the name.
*/
relationNameList = (List *) linitial(dropStmt->objects);
relationNameListLength = list_length(relationNameList);
switch (relationNameListLength)
{
case 1:
relationNameValue = linitial(relationNameList);
break;
case 2:
relationNameValue = lsecond(relationNameList);
break;
case 3:
relationNameValue = lthird(relationNameList);
break;
default:
ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
errmsg("improper relation name: \"%s\"",
NameListToString(relationNameList))));
break;
}
relationName = &(relationNameValue->val.str);
AppendShardIdToName(relationName, shardId);
}
else
{
ereport(WARNING, (errmsg("unsafe object type in drop statement"),
errdetail("Object type: %u", (uint32) objectType)));
}
break;
}
case T_IndexStmt:
{
IndexStmt *indexStmt = (IndexStmt *) parseTree;
char **relationName = &(indexStmt->relation->relname);
char **indexName = &(indexStmt->idxname);
/*
* Concurrent index statements cannot run within a transaction block.
* Therefore, we do not support them.
*/
if (indexStmt->concurrent)
{
ereport(ERROR, (errmsg("cannot extend name for concurrent index")));
}
/*
* In the regular DDL execution code path (for non-sharded tables),
* if the index statement results from a table creation command, the
* indexName may be null. For sharded tables however, we intercept
* that code path and explicitly set the index name. Therefore, the
* index name in here cannot be null.
*/
if ((*indexName) == NULL)
{
ereport(ERROR, (errmsg("cannot extend name for null index name")));
}
AppendShardIdToName(relationName, shardId);
AppendShardIdToName(indexName, shardId);
break;
}
case T_ReindexStmt:
{
ReindexStmt *reindexStmt = (ReindexStmt *) parseTree;
#if (PG_VERSION_NUM >= 90500)
ReindexObjectType objectType = reindexStmt->kind;
if (objectType == REINDEX_OBJECT_TABLE || objectType == REINDEX_OBJECT_INDEX)
{
char **objectName = &(reindexStmt->relation->relname);
AppendShardIdToName(objectName, shardId);
}
else if (objectType == REINDEX_OBJECT_DATABASE)
{
ereport(ERROR, (errmsg("cannot extend name for multi-relation reindex")));
}
#else
ObjectType objectType = reindexStmt->kind;
if (objectType == OBJECT_TABLE || objectType == OBJECT_INDEX)
{
char **objectName = &(reindexStmt->relation->relname);
AppendShardIdToName(objectName, shardId);
}
else if (objectType == OBJECT_DATABASE)
{
ereport(ERROR, (errmsg("cannot extend name for multi-relation reindex")));
}
#endif
else
{
ereport(ERROR, (errmsg("invalid object type in reindex statement"),
errdetail("Object type: %u", (uint32) objectType)));
}
break;
}
case T_RenameStmt:
{
RenameStmt *renameStmt = (RenameStmt *) parseTree;
ObjectType objectType = renameStmt->renameType;
if (objectType == OBJECT_TABLE || objectType == OBJECT_SEQUENCE ||
objectType == OBJECT_INDEX)
{
char **oldRelationName = &(renameStmt->relation->relname);
char **newRelationName = &(renameStmt->newname);
AppendShardIdToName(oldRelationName, shardId);
AppendShardIdToName(newRelationName, shardId);
}
else if (objectType == OBJECT_COLUMN || objectType == OBJECT_TRIGGER)
{
char **relationName = &(renameStmt->relation->relname);
AppendShardIdToName(relationName, shardId);
}
else
{
ereport(WARNING, (errmsg("unsafe object type in rename statement"),
errdetail("Object type: %u", (uint32) objectType)));
}
break;
}
case T_TruncateStmt:
{
/*
* We currently do not support truncate statements. This is
* primarily because truncates allow implicit modifications to
* sequences through table column dependencies. As we have not
* determined our dependency model for sequences, we error here.
*/
ereport(ERROR, (errmsg("cannot extend name for truncate statement")));
break;
}
default:
{
ereport(WARNING, (errmsg("unsafe statement type in name extension"),
errdetail("Statement type: %u", (uint32) nodeType)));
break;
}
}
}
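/*
 * Name-extension sketch (table name and shard id are illustrative, assuming
 * SHARD_NAME_SEPARATOR is '_'): a statement such as
 *
 *     CREATE INDEX orders_idx ON orders (o_custkey);
 *
 * leaves the switch above with both names extended, i.e.
 *
 *     CREATE INDEX orders_idx_102008 ON orders_102008 (o_custkey);
 */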
/*
* TypeAddIndexConstraint checks if the alter table command adds a constraint
* and if that constraint also results in an index creation.
*/
static bool
TypeAddIndexConstraint(const AlterTableCmd *command)
{
if (command->subtype == AT_AddConstraint)
{
if (IsA(command->def, Constraint))
{
Constraint *constraint = (Constraint *) command->def;
if (constraint->contype == CONSTR_PRIMARY ||
constraint->contype == CONSTR_UNIQUE)
{
return true;
}
}
}
return false;
}
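/*
 * Sketch of the commands TypeAddIndexConstraint matches (names illustrative):
 *
 *     ALTER TABLE orders ADD CONSTRAINT orders_pkey PRIMARY KEY (o_orderkey);
 *     ALTER TABLE orders ADD CONSTRAINT orders_uniq UNIQUE (o_custkey);
 *
 * Both implicitly create an index, so both return true; a CHECK or FOREIGN
 * KEY constraint would return false.
 */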
/*
* TypeDropIndexConstraint checks if the alter table command drops a constraint
* and if that constraint also results in an index drop. Note that drop
* constraints do not have access to constraint type information; this is in
* contrast with add constraint commands. This function therefore performs
* additional system catalog lookups to determine if the drop constraint is
* associated with an index.
*/
static bool
TypeDropIndexConstraint(const AlterTableCmd *command,
const RangeVar *relation, uint64 shardId)
{
Relation pgConstraint = NULL;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
char *searchedConstraintName = NULL;
bool indexConstraint = false;
Oid relationId = InvalidOid;
bool failOK = true;
if (command->subtype != AT_DropConstraint)
{
return false;
}
/*
* At this stage, our only option is performing a relationId lookup. We
* first find the relationId, and then scan the pg_constraints system
* catalog using this relationId. Finally, we check if the passed in
* constraint is for a primary key or unique index.
*/
relationId = RangeVarGetRelid(relation, NoLock, failOK);
if (!OidIsValid(relationId))
{
/* overlook this error, it should be signaled later in the pipeline */
return false;
}
searchedConstraintName = pnstrdup(command->name, NAMEDATALEN);
AppendShardIdToName(&searchedConstraintName, shardId);
pgConstraint = heap_open(ConstraintRelationId, AccessShareLock);
ScanKeyInit(&scanKey[0], Anum_pg_constraint_conrelid,
BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relationId));
scanDescriptor = systable_beginscan(pgConstraint,
ConstraintRelidIndexId, true, /* indexOK */
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_constraint constraintForm = (Form_pg_constraint) GETSTRUCT(heapTuple);
char *constraintName = NameStr(constraintForm->conname);
if (strncmp(constraintName, searchedConstraintName, NAMEDATALEN) == 0)
{
/* we found the constraint, now check if it is for an index */
if (constraintForm->contype == CONSTRAINT_PRIMARY ||
constraintForm->contype == CONSTRAINT_UNIQUE)
{
indexConstraint = true;
}
break;
}
heapTuple = systable_getnext(scanDescriptor);
}
systable_endscan(scanDescriptor);
heap_close(pgConstraint, AccessShareLock);
pfree(searchedConstraintName);
return indexConstraint;
}
/*
* AppendShardIdToConstraintName extends given constraint name with given
* shardId. Note that we only extend constraint names if they correspond to
* indexes, and the caller should verify that index correspondence before
* calling this function.
*/
static void
AppendShardIdToConstraintName(AlterTableCmd *command, uint64 shardId)
{
if (command->subtype == AT_AddConstraint)
{
Constraint *constraint = (Constraint *) command->def;
char **constraintName = &(constraint->conname);
AppendShardIdToName(constraintName, shardId);
}
else if (command->subtype == AT_DropConstraint)
{
char **constraintName = &(command->name);
AppendShardIdToName(constraintName, shardId);
}
}
/*
* AppendShardIdToName appends shardId to the given name. The function takes in
* the name's address in order to reallocate memory for the name in the same
* memory context the name was originally created in.
*/
void
AppendShardIdToName(char **name, uint64 shardId)
{
char extendedName[NAMEDATALEN];
uint32 extendedNameLength = 0;
snprintf(extendedName, NAMEDATALEN, "%s%c" UINT64_FORMAT,
(*name), SHARD_NAME_SEPARATOR, shardId);
/*
* Parser should have already checked that the table name has enough space
* reserved for appending shardIds. Nonetheless, we perform an additional
* check here to verify that the appended name does not overflow.
*/
extendedNameLength = strlen(extendedName) + 1;
if (extendedNameLength >= NAMEDATALEN)
{
ereport(ERROR, (errmsg("shard name too long to extend: \"%s\"", (*name))));
}
(*name) = (char *) repalloc((*name), extendedNameLength);
snprintf((*name), extendedNameLength, "%s", extendedName);
}
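/*
 * A minimal usage sketch (assumes SHARD_NAME_SEPARATOR is '_' and that the
 * name was allocated with palloc, as required by the repalloc above):
 *
 *     char *name = pstrdup("orders");
 *     AppendShardIdToName(&name, 102008);
 *     // name now reads "orders_102008"
 */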

View File

@ -0,0 +1,535 @@
/*-------------------------------------------------------------------------
*
* shared_library_init.c
* Initialize CitusDB extension
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <limits.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "fmgr.h"
#include "miscadmin.h"
#include "commands/explain.h"
#include "executor/executor.h"
#include "distributed/master_protocol.h"
#include "distributed/modify_planner.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_explain.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_planner.h"
#include "distributed/multi_router_executor.h"
#include "distributed/multi_server_executor.h"
#include "distributed/multi_utility.h"
#include "distributed/task_tracker.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_protocol.h"
#include "postmaster/postmaster.h"
#include "optimizer/planner.h"
#include "utils/guc.h"
#include "utils/guc_tables.h"
/* marks shared object as one loadable by the postgres version compiled against */
PG_MODULE_MAGIC;
void _PG_init(void);
static void CreateRequiredDirectories(void);
static void RegisterCitusConfigVariables(void);
static void NormalizeWorkerListPath(void);
/* GUC enum definitions */
static const struct config_enum_entry task_assignment_policy_options[] = {
{"greedy", TASK_ASSIGNMENT_GREEDY, false},
{"first-replica", TASK_ASSIGNMENT_FIRST_REPLICA, false},
{"round-robin", TASK_ASSIGNMENT_ROUND_ROBIN, false},
{NULL, 0, false}
};
static const struct config_enum_entry task_executor_type_options[] = {
{"real-time", MULTI_EXECUTOR_REAL_TIME, false},
{"task-tracker", MULTI_EXECUTOR_TASK_TRACKER, false},
{"router", MULTI_EXECUTOR_ROUTER, false},
{NULL, 0, false}
};
static const struct config_enum_entry shard_placement_policy_options[] = {
{"local-node-first", SHARD_PLACEMENT_LOCAL_NODE_FIRST, false},
{"round-robin", SHARD_PLACEMENT_ROUND_ROBIN, false},
{NULL, 0, false}
};
/* shared library initialization function */
void
_PG_init(void)
{
if (!process_shared_preload_libraries_in_progress)
{
ereport(ERROR, (errmsg("CitusDB can only be loaded via shared_preload_libraries"),
errhint("Add citusdb to shared_preload_libraries.")));
}
/*
* Perform checks before registering any hooks, to avoid erroring out in a
* partial state.
*
* In many cases (e.g. planner and utility hook, to run inside
* pg_stat_statements et al.) we have to be loaded before other hooks
* (thus as the innermost/last running hook) to be able to do our
* duties. For simplicity, insist that all hooks are previously unused.
*/
if (planner_hook != NULL ||
ExplainOneQuery_hook != NULL ||
ExecutorStart_hook != NULL ||
ExecutorRun_hook != NULL ||
ExecutorFinish_hook != NULL ||
ExecutorEnd_hook != NULL ||
ProcessUtility_hook != NULL)
{
ereport(ERROR, (errmsg("CitusDB has to be loaded first"),
errhint("Place citusdb at the beginning of "
"shared_preload_libraries.")));
}
/*
* Extend the database directory structure before continuing with
* initialization - one of the later steps might require them to exist.
*/
CreateRequiredDirectories();
/*
* Register CitusDB configuration variables. Do so before intercepting
* hooks or calling initialization functions, in case we want to do the
* latter in a configuration dependent manner.
*/
RegisterCitusConfigVariables();
/* intercept planner */
planner_hook = multi_planner;
/* intercept explain */
ExplainOneQuery_hook = MultiExplainOneQuery;
/* intercept executor */
ExecutorStart_hook = multi_ExecutorStart;
ExecutorRun_hook = multi_ExecutorRun;
ExecutorFinish_hook = multi_ExecutorFinish;
ExecutorEnd_hook = multi_ExecutorEnd;
/* register utility hook */
ProcessUtility_hook = multi_ProcessUtility;
/* arrange for the task tracker to be started once the server is up */
TaskTrackerRegister();
/* initialize worker node manager */
WorkerNodeRegister();
}
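/*
 * Loading sketch: the checks above require citusdb to come first in the
 * preload list, e.g. in postgresql.conf (the second entry is illustrative):
 *
 *     shared_preload_libraries = 'citusdb,pg_stat_statements'
 */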
/*
* CreateRequiredDirectories creates the directories required for CitusDB to
* function.
*
* These used to be created by initdb, but that's not possible anymore.
*/
static void
CreateRequiredDirectories(void)
{
int dirNo = 0;
const char *subdirs[] = {
"pg_foreign_file",
"pg_foreign_file/cached",
"base/pgsql_job_cache"
};
for (dirNo = 0; dirNo < lengthof(subdirs); dirNo++)
{
int ret = mkdir(subdirs[dirNo], S_IRWXU);
if (ret != 0 && errno != EEXIST)
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not create directory \"%s\": %m",
subdirs[dirNo])));
}
}
}
/* Register CitusDB configuration variables. */
static void
RegisterCitusConfigVariables(void)
{
DefineCustomStringVariable(
"citusdb.worker_list_file",
gettext_noop("Sets the server's \"worker_list\" configuration file."),
NULL,
&WorkerListFileName,
NULL,
PGC_POSTMASTER,
GUC_SUPERUSER_ONLY,
NULL, NULL, NULL);
NormalizeWorkerListPath();
DefineCustomBoolVariable(
"citusdb.binary_master_copy_format",
gettext_noop("Use the binary master copy format."),
gettext_noop("When enabled, data is copied from workers to the master "
"in PostgreSQL's binary serialization format."),
&BinaryMasterCopyFormat,
false,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.binary_worker_copy_format",
gettext_noop("Use the binary worker copy format."),
gettext_noop("When enabled, data is copied from workers to workers "
"in PostgreSQL's binary serialization format when "
"joining large tables."),
&BinaryWorkerCopyFormat,
false,
PGC_SIGHUP,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.expire_cached_shards",
gettext_noop("Enables shard cache expiration if a shard's size on disk has changed. "),
gettext_noop("When appending to an existing shard, old data may still be cached on "
"other workers. This configuration entry activates automatic "
"expiration, but should not be used with manual updates to shards."),
&ExpireCachedShards,
false,
PGC_SIGHUP,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.subquery_pushdown",
gettext_noop("Enables supported subquery pushdown to workers."),
NULL,
&SubqueryPushdown,
false,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.log_multi_join_order",
gettext_noop("Logs the distributed join order to the server log."),
gettext_noop("We use this private configuration entry as a debugging aid. "
"If enabled, we print the distributed join order."),
&LogMultiJoinOrder,
false,
PGC_USERSET,
GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.explain_multi_logical_plan",
gettext_noop("Enables Explain to print out distributed logical plans."),
gettext_noop("We use this private configuration entry as a debugging aid. "
"If enabled, the Explain command prints out the optimized "
"logical plan for distributed queries."),
&ExplainMultiLogicalPlan,
false,
PGC_USERSET,
GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.explain_multi_physical_plan",
gettext_noop("Enables Explain to print out distributed physical plans."),
gettext_noop("We use this private configuration entry as a debugging aid. "
"If enabled, the Explain command prints out the physical "
"plan for distributed queries."),
&ExplainMultiPhysicalPlan,
false,
PGC_USERSET,
GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citusdb.all_modifications_commutative",
gettext_noop("Bypasses commutativity checks when enabled"),
NULL,
&AllModificationsCommutative,
false,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.shard_replication_factor",
gettext_noop("Sets the replication factor for shards."),
gettext_noop("Shards are replicated across nodes according to this "
"replication factor. Note that shards read this "
"configuration value at sharded table creation time, "
"and later reuse the initially read value."),
&ShardReplicationFactor,
2, 1, 100,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.shard_max_size",
gettext_noop("Sets the maximum size a shard will grow before it gets split."),
gettext_noop("Shards store table and file data. When the source "
"file's size for one shard exceeds this configuration "
"value, the database ensures that either a new shard "
"gets created, or the current one gets split. Note that "
"shards read this configuration value at sharded table "
"creation time, and later reuse the initially read value."),
&ShardMaxSize,
1048576, 256, INT_MAX, /* max allowed size not set to MAX_KILOBYTES on purpose */
PGC_USERSET,
GUC_UNIT_KB,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.max_worker_nodes_tracked",
gettext_noop("Sets the maximum number of worker nodes that are tracked."),
gettext_noop("Worker nodes' network locations, their membership and "
"health status are tracked in a shared hash table on "
"the master node. This configuration value limits the "
"size of the hash table, and consequently the maximum "
"number of worker nodes that can be tracked."),
&MaxWorkerNodesTracked,
2048, 8, INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.remote_task_check_interval",
gettext_noop("Sets the frequency at which we check job statuses."),
gettext_noop("The master node assigns tasks to workers nodes, and "
"then regularly checks with them about each task's "
"progress. This configuration value sets the time "
"interval between two consequent checks."),
&RemoteTaskCheckInterval,
10, 1, REMOTE_NODE_CONNECT_TIMEOUT,
PGC_USERSET,
GUC_UNIT_MS,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.task_tracker_delay",
gettext_noop("Task tracker sleep time between task management rounds."),
gettext_noop("The task tracker process wakes up regularly, walks over "
"all tasks assigned to it, and schedules and executes these "
"tasks. Then, the task tracker sleeps for a time period "
"before walking over these tasks again. This configuration "
"value determines the length of that sleeping period."),
&TaskTrackerDelay,
200, 10, 100000,
PGC_SIGHUP,
GUC_UNIT_MS,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.max_assign_task_batch_size",
gettext_noop("Sets the maximum number of tasks to assign per round."),
gettext_noop("The master node synchronously assigns tasks to workers in "
"batches. Bigger batches allow for faster task assignment, "
"but it may take longer for all workers to get tasks "
"if the number of workers is large. This configuration "
"value controls the maximum batch size."),
&MaxAssignTaskBatchSize,
64, 1, INT_MAX,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.max_tracked_tasks_per_node",
gettext_noop("Sets the maximum number of tracked tasks per node."),
gettext_noop("The task tracker processes keeps all assigned tasks in "
"a shared hash table, and schedules and executes these "
"tasks as appropriate. This configuration value limits "
"the size of the hash table, and therefore the maximum "
"number of tasks that can be tracked at any given time."),
&MaxTrackedTasksPerNode,
1024, 8, INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.max_running_tasks_per_node",
gettext_noop("Sets the maximum number of tasks to run concurrently per node."),
gettext_noop("The task tracker process schedules and executes the tasks "
"assigned to it as appropriate. This configuration value "
"sets the maximum number of tasks to execute concurrently "
"on one node at any given time."),
&MaxRunningTasksPerNode,
8, 1, INT_MAX,
PGC_SIGHUP,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.partition_buffer_size",
gettext_noop("Sets the buffer size to use for partition operations."),
gettext_noop("Worker nodes allow for table data to be repartitioned "
"into multiple text files, much like Hadoop's Map "
"command. This configuration value sets the buffer size "
"to use per partition operation. After the buffer fills "
"up, we flush the repartitioned data into text files."),
&PartitionBufferSize,
8192, 0, (INT_MAX / 1024), /* result stored in int variable */
PGC_USERSET,
GUC_UNIT_KB,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.large_table_shard_count",
gettext_noop("The shard count threshold over which a table is considered large."),
gettext_noop("A distributed table is considered to be large if it has "
"more shards than the value specified here. This largeness "
"criteria is then used in picking a table join order during "
"distributed query planning."),
&LargeTableShardCount,
4, 1, 10000,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable(
"citusdb.limit_clause_row_fetch_count",
gettext_noop("Number of rows to fetch per task for limit clause optimization."),
gettext_noop("Select queries get partitioned and executed as smaller "
"tasks. In some cases, select queries with limit clauses "
"may need to fetch all rows from each task to generate "
"results. In those cases, and where an approximation would "
"produce meaningful results, this configuration value sets "
"the number of rows to fetch from each task."),
&LimitClauseRowFetchCount,
-1, -1, INT_MAX,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomRealVariable(
"citusdb.count_distinct_error_rate",
gettext_noop("Desired error rate when calculating count(distinct) "
"approximates using the postgresql-hll extension. "
"0.0 disables approximations for count(distinct); 1.0 "
"provides no guarantees about the accuracy of results."),
NULL,
&CountDistinctErrorRate,
0.0, 0.0, 1.0,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomEnumVariable(
"citusdb.task_assignment_policy",
gettext_noop("Sets the policy to use when assigning tasks to worker nodes."),
gettext_noop("The master node assigns tasks to worker nodes based on shard "
"locations. This configuration value specifies the policy to "
"use when making these assignments. The greedy policy aims to "
"evenly distribute tasks across worker nodes, first-replica just "
"assigns tasks in the order shard placements were created, "
"and the round-robin policy assigns tasks to worker nodes in "
"a round-robin fashion."),
&TaskAssignmentPolicy,
TASK_ASSIGNMENT_GREEDY,
task_assignment_policy_options,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomEnumVariable(
"citusdb.task_executor_type",
gettext_noop("Sets the executor type to be used for distributed queries."),
gettext_noop("The master node chooses between three different executor types "
"when executing a distributed query. The router executor is "
"optimal for simple key-value lookups on a single shard. The "
"real-time executor is optimal for queries that involve "
"aggregations and/or co-located joins on multiple shards. The "
"task-tracker executor is optimal for long-running, complex "
"queries that touch thousands of shards and/or that involve "
"table repartitioning."),
&TaskExecutorType,
MULTI_EXECUTOR_REAL_TIME,
task_executor_type_options,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomEnumVariable(
"citusdb.shard_placement_policy",
gettext_noop("Sets the policy to use when choosing nodes for shard placement."),
gettext_noop("The master node chooses which worker nodes to place new shards "
"on. This configuration value specifies the policy to use when "
"selecting these nodes. The local-node-first policy places the "
"first replica on the client node and chooses others randomly. "
"The round-robin policy aims to distribute shards evenly across "
"the cluster by selecting nodes in a round-robin fashion."),
&ShardPlacementPolicy,
SHARD_PLACEMENT_ROUND_ROBIN, shard_placement_policy_options,
PGC_USERSET,
0,
NULL, NULL, NULL);
/* warn about config items in the citusdb namespace that are not registered above */
EmitWarningsOnPlaceholders("citusdb");
/* Also warn about citus namespace, as that's a very likely misspelling */
EmitWarningsOnPlaceholders("citus");
}
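/*
 * Usage sketch: the USERSET variables registered above can be changed per
 * session, for example:
 *
 *     SET citusdb.task_executor_type TO 'task-tracker';
 *     SET citusdb.count_distinct_error_rate TO 0.02;
 */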
/*
* NormalizeWorkerListPath converts the path configured via
* citusdb.worker_list_file into an absolute path, falling back to the default
* value if necessary. The previous value of the config variable is
* overwritten with the normalized value.
*
* NB: This has to be called before ChangeToDataDir() is called as otherwise
* the relative paths won't make much sense to the user anymore.
*/
static void
NormalizeWorkerListPath(void)
{
char *absoluteFileName = NULL;
if (WorkerListFileName != NULL)
{
absoluteFileName = make_absolute_path(WorkerListFileName);
}
else if (DataDir != NULL)
{
absoluteFileName = malloc(strlen(DataDir) + strlen(WORKER_LIST_FILENAME) + 2);
if (absoluteFileName == NULL)
{
ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
sprintf(absoluteFileName, "%s/%s", DataDir, WORKER_LIST_FILENAME);
}
else
{
ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("%s does not know where to find the \"worker_list_file\" "
"configuration file.\n"
"This can be specified as \"citusdb.worker_list_file\" in "
"\"%s\", or by the -D invocation option, or by the PGDATA "
"environment variable.\n", progname, ConfigFileName)));
}
SetConfigOption("citusdb.worker_list_file", absoluteFileName, PGC_POSTMASTER, PGC_S_OVERRIDE);
free(absoluteFileName);
}

View File

@ -0,0 +1,170 @@
/*-------------------------------------------------------------------------
*
* test/src/connection_cache.c
*
* This file contains functions to exercise CitusDB's connection hash
* functionality for purposes of unit testing.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include "libpq-int.h"
#include <stddef.h>
#include <string.h>
#include "catalog/pg_type.h"
#include "distributed/connection_cache.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "utils/lsyscache.h"
/* local function forward declarations */
static Datum ExtractIntegerDatum(char *input);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(initialize_remote_temp_table);
PG_FUNCTION_INFO_V1(count_remote_temp_table_rows);
PG_FUNCTION_INFO_V1(get_and_purge_connection);
PG_FUNCTION_INFO_V1(set_connection_status_bad);
/*
* initialize_remote_temp_table connects to a specified host on a specified
* port and creates a temporary table with 100 rows. Because the table is
* temporary, it will be visible if a connection is reused but not if a new
* connection is opened to the node.
*/
Datum
initialize_remote_temp_table(PG_FUNCTION_ARGS)
{
char *nodeName = PG_GETARG_CSTRING(0);
int32 nodePort = PG_GETARG_INT32(1);
PGresult *result = NULL;
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
PG_RETURN_BOOL(false);
}
result = PQexec(connection, POPULATE_TEMP_TABLE);
if (PQresultStatus(result) != PGRES_COMMAND_OK)
{
ReportRemoteError(connection, result);
}
PQclear(result);
PG_RETURN_BOOL(true);
}
/*
* count_remote_temp_table_rows just returns the integer count of rows in the
* table created by initialize_remote_temp_table. If no such table exists, this
* function emits a warning and returns -1.
*/
Datum
count_remote_temp_table_rows(PG_FUNCTION_ARGS)
{
char *nodeName = PG_GETARG_CSTRING(0);
int32 nodePort = PG_GETARG_INT32(1);
Datum count = Int32GetDatum(-1);
PGresult *result = NULL;
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
PG_RETURN_DATUM(count);
}
result = PQexec(connection, COUNT_TEMP_TABLE);
if (PQresultStatus(result) != PGRES_TUPLES_OK)
{
ReportRemoteError(connection, result);
}
else
{
char *countText = PQgetvalue(result, 0, 0);
count = ExtractIntegerDatum(countText);
}
PQclear(result);
PG_RETURN_DATUM(count);
}
/*
* get_and_purge_connection first gets a connection using the provided hostname
* and port before immediately passing that connection to PurgeConnection.
* Simply a wrapper around PurgeConnection that uses hostname/port rather than
* PGconn.
*/
Datum
get_and_purge_connection(PG_FUNCTION_ARGS)
{
char *nodeName = PG_GETARG_CSTRING(0);
int32 nodePort = PG_GETARG_INT32(1);
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
PG_RETURN_BOOL(false);
}
PurgeConnection(connection);
PG_RETURN_BOOL(true);
}
/*
* set_connection_status_bad does not remove the given connection from the connection hash.
* It only sets its status to CONNECTION_BAD. On success, it returns true.
*/
Datum
set_connection_status_bad(PG_FUNCTION_ARGS)
{
char *nodeName = PG_GETARG_CSTRING(0);
int32 nodePort = PG_GETARG_INT32(1);
PGconn *connection = GetConnection(nodeName, nodePort);
if (connection == NULL)
{
PG_RETURN_BOOL(false);
}
/* set the connection status */
connection->status = CONNECTION_BAD;
PG_RETURN_BOOL(true);
}
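/*
 * A typical test sequence for these UDFs (host and port illustrative):
 *
 *     SELECT initialize_remote_temp_table('localhost', 5432);   -- true
 *     SELECT count_remote_temp_table_rows('localhost', 5432);   -- 100
 *     SELECT get_and_purge_connection('localhost', 5432);       -- true
 *     SELECT count_remote_temp_table_rows('localhost', 5432);   -- -1, the new
 *                                                               -- connection no
 *                                                               -- longer sees the
 *                                                               -- temp table
 */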
/*
* ExtractIntegerDatum transforms an integer in textual form into a Datum.
*/
static Datum
ExtractIntegerDatum(char *input)
{
Oid typIoFunc = InvalidOid;
Oid typIoParam = InvalidOid;
Datum intDatum = 0;
FmgrInfo fmgrInfo;
memset(&fmgrInfo, 0, sizeof(fmgrInfo));
getTypeInputInfo(INT4OID, &typIoFunc, &typIoParam);
fmgr_info(typIoFunc, &fmgrInfo);
intDatum = InputFunctionCall(&fmgrInfo, input, typIoParam, -1);
return intDatum;
}
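/*
 * Usage sketch:
 *
 *     Datum countDatum = ExtractIntegerDatum("42");
 *     int32 count = DatumGetInt32(countDatum);   // count == 42
 */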

View File

@ -0,0 +1,69 @@
/*-------------------------------------------------------------------------
*
* test/src/create_shards.c
*
* This file contains functions to exercise shard creation functionality
* within CitusDB.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <string.h>
#include "distributed/listutils.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
/* local function forward declarations */
static int CompareStrings(const void *leftElement, const void *rightElement);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(sort_names);
/*
* sort_names accepts three strings, places them in a list, then calls SortList
* to test its sort functionality. Returns a string containing the sorted lines.
*/
Datum
sort_names(PG_FUNCTION_ARGS)
{
char *first = PG_GETARG_CSTRING(0);
char *second = PG_GETARG_CSTRING(1);
char *third = PG_GETARG_CSTRING(2);
List *nameList = SortList(list_make3(first, second, third),
(int (*)(const void *, const void *))(&CompareStrings));
StringInfo sortedNames = makeStringInfo();
ListCell *nameCell = NULL;
foreach(nameCell, nameList)
{
char *name = lfirst(nameCell);
appendStringInfo(sortedNames, "%s\n", name);
}
PG_RETURN_CSTRING(sortedNames->data);
}
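/*
 * Usage sketch:
 *
 *     SELECT sort_names('b', 'a', 'c');   -- returns "a\nb\nc\n"
 */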
/*
* A simple wrapper around strcmp suitable for use with SortList or qsort.
*/
static int
CompareStrings(const void *leftElement, const void *rightElement)
{
const char *leftString = *((const char **) leftElement);
const char *rightString = *((const char **) rightElement);
return strcmp(leftString, rightString);
}

View File

@ -0,0 +1,365 @@
/*-------------------------------------------------------------------------
*
* test/src/distribution_metadata.c
*
* This file contains functions to exercise distributed table metadata
* functionality within CitusDB.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <stddef.h>
#include <stdint.h>
#include "access/heapam.h"
#include "catalog/pg_type.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/master_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_join_order.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/resource_lock.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "storage/lock.h"
#include "utils/array.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/builtins.h"
#include "utils/palloc.h"
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(load_shard_id_array);
PG_FUNCTION_INFO_V1(load_shard_interval_array);
PG_FUNCTION_INFO_V1(load_shard_placement_array);
PG_FUNCTION_INFO_V1(partition_column_id);
PG_FUNCTION_INFO_V1(partition_type);
PG_FUNCTION_INFO_V1(is_distributed_table);
PG_FUNCTION_INFO_V1(column_name_to_column);
PG_FUNCTION_INFO_V1(column_name_to_column_id);
PG_FUNCTION_INFO_V1(create_monolithic_shard_row);
PG_FUNCTION_INFO_V1(create_healthy_local_shard_placement_row);
PG_FUNCTION_INFO_V1(delete_shard_placement_row);
PG_FUNCTION_INFO_V1(update_shard_placement_row_state);
PG_FUNCTION_INFO_V1(acquire_shared_shard_lock);
/*
* load_shard_id_array returns the shard identifiers for a particular
* distributed table as a bigint array. If the table is not distributed
* yet, the function errors out.
*/
Datum
load_shard_id_array(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
ArrayType *shardIdArrayType = NULL;
ListCell *shardCell = NULL;
int shardIdIndex = 0;
Oid shardIdTypeId = INT8OID;
int shardIdCount = -1;
Datum *shardIdDatumArray = NULL;
List *shardList = LoadShardIntervalList(distributedTableId);
shardIdCount = list_length(shardList);
shardIdDatumArray = palloc0(shardIdCount * sizeof(Datum));
foreach(shardCell, shardList)
{
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardCell);
Datum shardIdDatum = Int64GetDatum(shardInterval->shardId);
shardIdDatumArray[shardIdIndex] = shardIdDatum;
shardIdIndex++;
}
shardIdArrayType = DatumArrayToArrayType(shardIdDatumArray, shardIdCount,
shardIdTypeId);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
/*
* load_shard_interval_array loads a shard interval using a provided identifier
* and returns a two-element array consisting of min/max values contained in
* that shard interval. If no such interval can be found, this function raises
* an error instead.
*/
Datum
load_shard_interval_array(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
Oid expectedType PG_USED_FOR_ASSERTS_ONLY = get_fn_expr_argtype(fcinfo->flinfo, 1);
ShardInterval *shardInterval = LoadShardInterval(shardId);
Datum shardIntervalArray[] = { shardInterval->minValue, shardInterval->maxValue };
ArrayType *shardIntervalArrayType = NULL;
Assert(expectedType == shardInterval->valueTypeId);
shardIntervalArrayType = DatumArrayToArrayType(shardIntervalArray, 2,
shardInterval->valueTypeId);
PG_RETURN_ARRAYTYPE_P(shardIntervalArrayType);
}
/*
* load_shard_placement_array loads a shard interval using the provided ID
* and returns an array of strings containing the node name and port for each
* placement of the specified shard interval. If the second argument is true,
* only finalized placements are returned; otherwise, all are. If no such shard
* interval can be found, this function raises an error instead.
*/
Datum
load_shard_placement_array(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
bool onlyFinalized = PG_GETARG_BOOL(1);
ArrayType *placementArrayType = NULL;
List *placementList = NIL;
ListCell *placementCell = NULL;
int placementCount = -1;
int placementIndex = 0;
Datum *placementDatumArray = NULL;
Oid placementTypeId = TEXTOID;
StringInfo placementInfo = makeStringInfo();
if (onlyFinalized)
{
placementList = FinalizedShardPlacementList(shardId);
}
else
{
placementList = ShardPlacementList(shardId);
}
placementCount = list_length(placementList);
placementDatumArray = palloc0(placementCount * sizeof(Datum));
foreach(placementCell, placementList)
{
ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell);
appendStringInfo(placementInfo, "%s:%d", placement->nodeName,
placement->nodePort);
placementDatumArray[placementIndex] = CStringGetTextDatum(placementInfo->data);
placementIndex++;
resetStringInfo(placementInfo);
}
placementArrayType = DatumArrayToArrayType(placementDatumArray, placementCount,
placementTypeId);
PG_RETURN_ARRAYTYPE_P(placementArrayType);
}
/*
* partition_column_id simply finds a distributed table using the provided Oid
* and returns the column_id of its partition column. If the specified table is
* not distributed, this function raises an error instead.
*/
Datum
partition_column_id(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
PG_RETURN_INT16((int16) partitionColumn->varattno);
}
/*
* partition_type simply finds a distributed table using the provided Oid and
* returns the type of partitioning in use by that table. If the specified
* table is not distributed, this function raises an error instead.
*/
Datum
partition_type(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
char partitionType = PartitionMethod(distributedTableId);
PG_RETURN_CHAR(partitionType);
}
/*
* is_distributed_table simply returns whether a given table is distributed. No
* errors, just a boolean.
*/
Datum
is_distributed_table(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
bool isDistributedTable = IsDistributedTable(distributedTableId);
PG_RETURN_BOOL(isDistributedTable);
}
/*
* column_name_to_column is an internal UDF to obtain a textual representation
* of a particular column node (Var), given a relation identifier and column
* name. There is no requirement that the table be distributed; this function
* simply returns the textual representation of a Var representing a column.
* This function will raise an ERROR if no such column can be found or if the
* provided name refers to a system column.
*/
Datum
column_name_to_column(PG_FUNCTION_ARGS)
{
Oid relationId = PG_GETARG_OID(0);
text *columnText = PG_GETARG_TEXT_P(1);
Relation relation = NULL;
char *columnName = text_to_cstring(columnText);
Var *column = NULL;
char *columnNodeString = NULL;
text *columnNodeText = NULL;
relation = relation_open(relationId, AccessExclusiveLock);
column = (Var *) BuildDistributionKeyFromColumnName(relation, columnName);
columnNodeString = nodeToString(column);
columnNodeText = cstring_to_text(columnNodeString);
relation_close(relation, NoLock);
PG_RETURN_TEXT_P(columnNodeText);
}
/*
* column_name_to_column_id takes a relation identifier and a name of a column
* in that relation and returns the index of that column in the relation. If
* the provided name is a system column or no column at all, this function will
* throw an error instead.
*/
Datum
column_name_to_column_id(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
char *columnName = PG_GETARG_CSTRING(1);
Relation relation = NULL;
Var *column = NULL;
relation = relation_open(distributedTableId, AccessExclusiveLock);
column = (Var *) BuildDistributionKeyFromColumnName(relation, columnName);
relation_close(relation, NoLock);
PG_RETURN_INT16((int16) column->varattno);
}
/*
* create_monolithic_shard_row creates a single shard covering all possible
* hash values for a given table and inserts a row representing that shard
* into the backing store. It returns the primary key of the new row.
*/
Datum
create_monolithic_shard_row(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
StringInfo minInfo = makeStringInfo();
StringInfo maxInfo = makeStringInfo();
Datum newShardIdDatum = master_get_new_shardid(NULL);
int64 newShardId = DatumGetInt64(newShardIdDatum);
text *maxInfoText = NULL;
text *minInfoText = NULL;
appendStringInfo(minInfo, "%d", INT32_MIN);
appendStringInfo(maxInfo, "%d", INT32_MAX);
minInfoText = cstring_to_text(minInfo->data);
maxInfoText = cstring_to_text(maxInfo->data);
InsertShardRow(distributedTableId, newShardId, SHARD_STORAGE_TABLE, minInfoText,
maxInfoText);
PG_RETURN_INT64(newShardId);
}
/*
* create_healthy_local_shard_placement_row inserts a row representing a
* finalized placement for localhost (on the default port) into the backing
* store.
*/
Datum
create_healthy_local_shard_placement_row(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
int64 shardLength = 0;
InsertShardPlacementRow(shardId, FILE_FINALIZED, shardLength, "localhost", 5432);
PG_RETURN_VOID();
}
/*
* delete_shard_placement_row removes the placement of the specified shard on
* the given host and port.
*/
Datum
delete_shard_placement_row(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
text *hostName = PG_GETARG_TEXT_P(1);
int64 hostPort = PG_GETARG_INT64(2);
bool successful = true;
char *hostNameString = text_to_cstring(hostName);
DeleteShardPlacementRow(shardId, hostNameString, hostPort);
PG_RETURN_BOOL(successful);
}
/*
* update_shard_placement_row_state sets the state of the placement of the
* specified shard on the given host and port.
*/
Datum
update_shard_placement_row_state(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
text *hostName = PG_GETARG_TEXT_P(1);
int64 hostPort = PG_GETARG_INT64(2);
RelayFileState shardState = (RelayFileState) PG_GETARG_INT32(3);
bool successful = true;
char *hostNameString = text_to_cstring(hostName);
uint64 shardLength = 0;
DeleteShardPlacementRow(shardId, hostNameString, hostPort);
InsertShardPlacementRow(shardId, shardState, shardLength, hostNameString, hostPort);
PG_RETURN_BOOL(successful);
}
/*
* acquire_shared_shard_lock grabs a shared lock for the specified shard.
*/
Datum
acquire_shared_shard_lock(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
LockShardResource(shardId, ShareLock);
PG_RETURN_VOID();
}
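/*
 * A typical metadata test sequence (table name and resulting shard id are
 * illustrative):
 *
 *     SELECT create_monolithic_shard_row('events'::regclass);        -- e.g. 102010
 *     SELECT create_healthy_local_shard_placement_row(102010);
 *     SELECT load_shard_placement_array(102010, true);               -- {localhost:5432}
 *     SELECT delete_shard_placement_row(102010, 'localhost', 5432);
 */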

View File

@ -0,0 +1,168 @@
/*-------------------------------------------------------------------------
*
* test/src/fake_fdw.c
*
* This file contains a barebones FDW implementation, suitable for use in
* test code. Inspired by Andrew Dunstan's blackhole_fdw.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <stddef.h>
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "executor/tuptable.h"
#include "foreign/fdwapi.h"
#include "nodes/execnodes.h"
#include "nodes/nodes.h"
#include "nodes/pg_list.h"
#include "nodes/plannodes.h"
#include "nodes/relation.h"
#include "optimizer/pathnode.h"
#include "optimizer/planmain.h"
#include "optimizer/restrictinfo.h"
#include "utils/palloc.h"
/* local function forward declarations */
static void FakeGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel,
Oid foreigntableid);
static void FakeGetForeignPaths(PlannerInfo *root, RelOptInfo *baserel,
Oid foreigntableid);
#if (PG_VERSION_NUM >= 90300 && PG_VERSION_NUM < 90500)
static ForeignScan * FakeGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel,
Oid foreigntableid, ForeignPath *best_path,
List *tlist, List *scan_clauses);
#else
static ForeignScan * FakeGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel,
Oid foreigntableid, ForeignPath *best_path,
List *tlist, List *scan_clauses,
Plan *outer_plan);
#endif
static void FakeBeginForeignScan(ForeignScanState *node, int eflags);
static TupleTableSlot * FakeIterateForeignScan(ForeignScanState *node);
static void FakeReScanForeignScan(ForeignScanState *node);
static void FakeEndForeignScan(ForeignScanState *node);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(fake_fdw_handler);
/*
* fake_fdw_handler populates an FdwRoutine with pointers to the functions
* implemented within this file.
*/
Datum
fake_fdw_handler(PG_FUNCTION_ARGS)
{
FdwRoutine *fdwroutine = makeNode(FdwRoutine);
fdwroutine->GetForeignRelSize = FakeGetForeignRelSize;
fdwroutine->GetForeignPaths = FakeGetForeignPaths;
fdwroutine->GetForeignPlan = FakeGetForeignPlan;
fdwroutine->BeginForeignScan = FakeBeginForeignScan;
fdwroutine->IterateForeignScan = FakeIterateForeignScan;
fdwroutine->ReScanForeignScan = FakeReScanForeignScan;
fdwroutine->EndForeignScan = FakeEndForeignScan;
PG_RETURN_POINTER(fdwroutine);
}
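/*
 * Wiring sketch (the 'citusdb' module path is an assumption; object names
 * are illustrative):
 *
 *     CREATE FUNCTION fake_fdw_handler() RETURNS fdw_handler
 *         AS 'citusdb' LANGUAGE C STRICT;
 *     CREATE FOREIGN DATA WRAPPER fake_fdw HANDLER fake_fdw_handler;
 *     CREATE SERVER fake_server FOREIGN DATA WRAPPER fake_fdw;
 */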
/*
* FakeGetForeignRelSize populates baserel with a fake relation size.
*/
static void
FakeGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid)
{
baserel->rows = 0;
baserel->fdw_private = (void *) palloc0(1);
}
/*
* FakeGetForeignPaths adds a single fake foreign path to baserel.
*/
static void
FakeGetForeignPaths(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid)
{
Cost startup_cost = 0;
Cost total_cost = startup_cost + baserel->rows;
#if (PG_VERSION_NUM >= 90300 && PG_VERSION_NUM < 90500)
add_path(baserel, (Path *) create_foreignscan_path(root, baserel, baserel->rows,
startup_cost, total_cost, NIL,
NULL, NIL));
#else
add_path(baserel, (Path *) create_foreignscan_path(root, baserel, baserel->rows,
startup_cost, total_cost, NIL,
NULL, NULL, NIL));
#endif
}
/*
* FakeGetForeignPlan builds a fake foreign plan.
*/
#if (PG_VERSION_NUM >= 90300 && PG_VERSION_NUM < 90500)
static ForeignScan *
FakeGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid,
ForeignPath *best_path, List *tlist, List *scan_clauses)
#else
static ForeignScan *
FakeGetForeignPlan(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid,
ForeignPath *best_path, List *tlist, List *scan_clauses,
Plan *outer_plan)
#endif
{
Index scan_relid = baserel->relid;
scan_clauses = extract_actual_clauses(scan_clauses, false);
/* make_foreignscan has a different signature in 9.3 and 9.4 than in 9.5 */
#if (PG_VERSION_NUM >= 90300 && PG_VERSION_NUM < 90500)
return make_foreignscan(tlist, scan_clauses, scan_relid, NIL, NIL);
#else
return make_foreignscan(tlist, scan_clauses, scan_relid, NIL, NIL, NIL, NIL,
outer_plan);
#endif
}
/*
* FakeBeginForeignScan begins the fake plan (i.e. does nothing).
*/
static void
FakeBeginForeignScan(ForeignScanState *node, int eflags) { }
/*
* FakeIterateForeignScan continues the fake plan (i.e. does nothing).
*/
static TupleTableSlot *
FakeIterateForeignScan(ForeignScanState *node)
{
TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
ExecClearTuple(slot);
return slot;
}
/*
* FakeReScanForeignScan restarts the fake plan (i.e. does nothing).
*/
static void
FakeReScanForeignScan(ForeignScanState *node) { }
/*
* FakeEndForeignScan ends the fake plan (i.e. does nothing).
*/
static void
FakeEndForeignScan(ForeignScanState *node) { }

View File

@ -0,0 +1,67 @@
/*-------------------------------------------------------------------------
*
* test/src/generate_ddl_commands.c
*
* This file contains functions to exercise DDL generation functionality
* within CitusDB.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <stddef.h>
#include "catalog/pg_type.h"
#include "distributed/master_protocol.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "lib/stringinfo.h"
#include "nodes/makefuncs.h"
#include "nodes/nodes.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#include "nodes/value.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/palloc.h"
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(table_ddl_command_array);
/*
* table_ddl_command_array returns an array of strings, each of which is a DDL
* command required to recreate a table (specified by OID).
*/
Datum
table_ddl_command_array(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
ArrayType *ddlCommandArrayType = NULL;
List *ddlCommandList = GetTableDDLEvents(distributedTableId);
int ddlCommandCount = list_length(ddlCommandList);
Datum *ddlCommandDatumArray = palloc0(ddlCommandCount * sizeof(Datum));
ListCell *ddlCommandCell = NULL;
int ddlCommandIndex = 0;
Oid ddlCommandTypeId = TEXTOID;
foreach(ddlCommandCell, ddlCommandList)
{
char *ddlCommand = (char *) lfirst(ddlCommandCell);
Datum ddlCommandDatum = CStringGetTextDatum(ddlCommand);
ddlCommandDatumArray[ddlCommandIndex] = ddlCommandDatum;
ddlCommandIndex++;
}
ddlCommandArrayType = DatumArrayToArrayType(ddlCommandDatumArray, ddlCommandCount,
ddlCommandTypeId);
PG_RETURN_ARRAYTYPE_P(ddlCommandArrayType);
}

View File

@ -0,0 +1,214 @@
/*-------------------------------------------------------------------------
*
* test/src/prune_shard_list.c
*
* This file contains functions to exercise shard pruning functionality
* within CitusDB.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "fmgr.h"
#include <string.h>
#if (PG_VERSION_NUM >= 90500 && PG_VERSION_NUM < 90600)
#include "access/stratnum.h"
#else
#include "access/skey.h"
#endif
#include "catalog/pg_type.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/resource_lock.h"
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "nodes/nodes.h"
#include "optimizer/clauses.h"
#include "utils/array.h"
#include "utils/palloc.h"
/* local function forward declarations */
static Expr * MakeTextPartitionExpression(Oid distributedTableId, text *value);
static ArrayType * PrunedShardIdsForTable(Oid distributedTableId, List *whereClauseList);
/* declarations for dynamic loading */
PG_FUNCTION_INFO_V1(prune_using_no_values);
PG_FUNCTION_INFO_V1(prune_using_single_value);
PG_FUNCTION_INFO_V1(prune_using_either_value);
PG_FUNCTION_INFO_V1(prune_using_both_values);
PG_FUNCTION_INFO_V1(debug_equality_expression);
/*
* prune_using_no_values returns the shards for the specified distributed table
* after pruning using an empty clause list.
*/
Datum
prune_using_no_values(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
List *whereClauseList = NIL;
ArrayType *shardIdArrayType = PrunedShardIdsForTable(distributedTableId,
whereClauseList);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
/*
* prune_using_single_value returns the shards for the specified distributed
* table after pruning using a single value provided by the caller.
*/
Datum
prune_using_single_value(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
text *value = (PG_ARGISNULL(1)) ? NULL : PG_GETARG_TEXT_P(1);
Expr *equalityExpr = MakeTextPartitionExpression(distributedTableId, value);
List *whereClauseList = list_make1(equalityExpr);
ArrayType *shardIdArrayType = PrunedShardIdsForTable(distributedTableId,
whereClauseList);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
/*
* prune_using_either_value returns the shards for the specified distributed
* table after pruning using either of two values provided by the caller (OR).
*/
Datum
prune_using_either_value(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
text *firstValue = PG_GETARG_TEXT_P(1);
text *secondValue = PG_GETARG_TEXT_P(2);
Expr *firstQual = MakeTextPartitionExpression(distributedTableId, firstValue);
Expr *secondQual = MakeTextPartitionExpression(distributedTableId, secondValue);
Expr *orClause = make_orclause(list_make2(firstQual, secondQual));
List *whereClauseList = list_make1(orClause);
ArrayType *shardIdArrayType = PrunedShardIdsForTable(distributedTableId,
whereClauseList);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
/*
* prune_using_both_values returns the shards for the specified distributed
* table after pruning using both of the values provided by the caller (AND).
*/
Datum
prune_using_both_values(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
text *firstValue = PG_GETARG_TEXT_P(1);
text *secondValue = PG_GETARG_TEXT_P(2);
Expr *firstQual = MakeTextPartitionExpression(distributedTableId, firstValue);
Expr *secondQual = MakeTextPartitionExpression(distributedTableId, secondValue);
List *whereClauseList = list_make2(firstQual, secondQual);
ArrayType *shardIdArrayType = PrunedShardIdsForTable(distributedTableId,
whereClauseList);
PG_RETURN_ARRAYTYPE_P(shardIdArrayType);
}
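/*
 * Usage sketch (table name and values illustrative): with a table
 * distributed on a text column,
 *
 *     SELECT prune_using_no_values('pruning'::regclass);          -- all shards
 *     SELECT prune_using_single_value('pruning'::regclass, 'a');  -- matching shards
 */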
/*
* debug_equality_expression returns the textual representation of an equality
* expression generated by a call to MakeOpExpression.
*/
Datum
debug_equality_expression(PG_FUNCTION_ARGS)
{
Oid distributedTableId = PG_GETARG_OID(0);
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
OpExpr *equalityExpression = MakeOpExpression(partitionColumn, BTEqualStrategyNumber);
PG_RETURN_CSTRING(nodeToString(equalityExpression));
}
/*
* MakeTextPartitionExpression returns an equality expression between the
* specified table's partition column and the provided value, or an IS NULL
* test when the value is NULL.
*/
static Expr *
MakeTextPartitionExpression(Oid distributedTableId, text *value)
{
uint32 rangeTableId = 1;
Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId);
Expr *partitionExpression = NULL;
if (value != NULL)
{
OpExpr *equalityExpr = MakeOpExpression(partitionColumn, BTEqualStrategyNumber);
Const *rightConst = (Const *) get_rightop((Expr *) equalityExpr);
rightConst->constvalue = PointerGetDatum(value);
rightConst->constisnull = false;
rightConst->constbyval = false;
partitionExpression = (Expr *) equalityExpr;
}
else
{
NullTest *nullTest = makeNode(NullTest);
nullTest->arg = (Expr *) partitionColumn;
nullTest->nulltesttype = IS_NULL;
partitionExpression = (Expr *) nullTest;
}
return partitionExpression;
}
/*
* PrunedShardIdsForTable loads the shard intervals for the specified table
* and prunes them using the provided clauses. It returns an ArrayType
* containing the shard identifiers, suitable for return from an SQL-facing
* function.
*/
static ArrayType *
PrunedShardIdsForTable(Oid distributedTableId, List *whereClauseList)
{
ArrayType *shardIdArrayType = NULL;
ListCell *shardCell = NULL;
int shardIdIndex = 0;
Oid shardIdTypeId = INT8OID;
Index tableId = 1;
List *shardList = LoadShardIntervalList(distributedTableId);
int shardIdCount = -1;
Datum *shardIdDatumArray = NULL;
shardList = PruneShardList(distributedTableId, tableId, whereClauseList, shardList);
shardIdCount = list_length(shardList);
shardIdDatumArray = palloc0(shardIdCount * sizeof(Datum));
foreach(shardCell, shardList)
{
ShardInterval *shardInterval = (ShardInterval *) lfirst(shardCell);
Datum shardIdDatum = Int64GetDatum(shardInterval->shardId);
shardIdDatumArray[shardIdIndex] = shardIdDatum;
shardIdIndex++;
}
shardIdArrayType = DatumArrayToArrayType(shardIdDatumArray, shardIdCount,
shardIdTypeId);
return shardIdArrayType;
}

View File

@ -0,0 +1,39 @@
/*-------------------------------------------------------------------------
*
* test/src/test_helper_functions.c
*
* This file contains helper functions used in many CitusDB tests.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include <string.h>
#include "distributed/test_helper_functions.h" /* IWYU pragma: keep */
#include "utils/array.h"
#include "utils/lsyscache.h"
/*
* DatumArrayToArrayType converts the provided Datum array (of the specified
* length and type) into an ArrayType suitable for returning from a UDF.
*/
ArrayType *
DatumArrayToArrayType(Datum *datumArray, int datumCount, Oid datumTypeId)
{
ArrayType *arrayObject = NULL;
int16 typeLength = 0;
bool typeByValue = false;
char typeAlignment = 0;
get_typlenbyvalalign(datumTypeId, &typeLength, &typeByValue, &typeAlignment);
arrayObject = construct_array(datumArray, datumCount, datumTypeId,
typeLength, typeByValue, typeAlignment);
return arrayObject;
}
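/*
 * Usage sketch, mirroring how the test UDFs above build their return values:
 *
 *     Datum shardIds[] = { Int64GetDatum(1), Int64GetDatum(2) };
 *     ArrayType *array = DatumArrayToArrayType(shardIds, 2, INT8OID);
 */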

View File

@ -0,0 +1,304 @@
/*-------------------------------------------------------------------------
*
* citus_nodefuncs.c
* Helper functions for dealing with nodes
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/pg_type.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/metadata_cache.h"
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(citusdb_extradata_container);
/*
* SetRangeTblExtraData adds additional data to a RTE, overwriting previous
* values, if present.
*
* The data is stored as an RTE_FUNCTION type RTE of a special
* citusdb_extradata_container function, with the extra data serialized into the
* function arguments. That works, because these RTEs aren't used by Postgres
* to any significant degree, and Citus' variant of ruleutils.c knows how to
* deal with these extended RTEs. Note that rte->eref needs to be set prior
* to calling SetRangeTblExtraData to ensure the funccolcount can be set
* correctly.
*
* NB: If used for postgres defined RTEKinds, fields specific to that RTEKind
* will not be handled by out/readfuncs.c. For the current uses that's ok.
*/
void
SetRangeTblExtraData(RangeTblEntry *rte, CitusRTEKind rteKind,
char *fragmentSchemaName, char *fragmentTableName,
List *tableIdList)
{
RangeTblFunction *fauxFunction = NULL;
FuncExpr *fauxFuncExpr = NULL;
Const *rteKindData = NULL;
Const *fragmentSchemaData = NULL;
Const *fragmentTableData = NULL;
Const *tableIdListData = NULL;
Assert(rte->eref && rte->eref->colnames != NIL);
/* store RTE kind as a plain int4 */
rteKindData = makeNode(Const);
rteKindData->consttype = INT4OID;
rteKindData->constlen = 4;
rteKindData->constvalue = Int32GetDatum(rteKind);
rteKindData->constbyval = true;
rteKindData->constisnull = false;
rteKindData->location = -1;
/* store the fragment schema as a cstring */
fragmentSchemaData = makeNode(Const);
fragmentSchemaData->consttype = CSTRINGOID;
fragmentSchemaData->constlen = -2;
fragmentSchemaData->constvalue = CStringGetDatum(fragmentSchemaName);
fragmentSchemaData->constbyval = false;
fragmentSchemaData->constisnull = fragmentSchemaName == NULL;
fragmentSchemaData->location = -1;
/* store the fragment name as a cstring */
fragmentTableData = makeNode(Const);
fragmentTableData->consttype = CSTRINGOID;
fragmentTableData->constlen = -2;
fragmentTableData->constvalue = CStringGetDatum(fragmentTableName);
fragmentTableData->constbyval = false;
fragmentTableData->constisnull = fragmentTableName == NULL;
fragmentTableData->location = -1;
/* store the table id list as an array of integers: FIXME */
tableIdListData = makeNode(Const);
tableIdListData->consttype = CSTRINGOID;
tableIdListData->constbyval = false;
tableIdListData->constlen = -2;
tableIdListData->location = -1;
/* serialize tableIdList to a string, seems simplest that way */
if (tableIdList != NIL)
{
char *serializedList = nodeToString(tableIdList);
tableIdListData->constisnull = false;
tableIdListData->constvalue = CStringGetDatum(serializedList);
}
else
{
tableIdListData->constisnull = true;
}
/* create function expression to store our faux arguments in */
fauxFuncExpr = makeNode(FuncExpr);
fauxFuncExpr->funcid = CitusExtraDataContainerFuncId();
fauxFuncExpr->funcretset = true;
fauxFuncExpr->location = -1;
fauxFuncExpr->args = list_make4(rteKindData, fragmentSchemaData,
fragmentTableData, tableIdListData);
fauxFunction = makeNode(RangeTblFunction);
fauxFunction->funcexpr = (Node *) fauxFuncExpr;
/* set the column count to pass ruleutils checks, not used elsewhere */
fauxFunction->funccolcount = list_length(rte->eref->colnames);
rte->rtekind = RTE_FUNCTION;
rte->functions = list_make1(fauxFunction);
}
/*
* ExtractRangeTblExtraData extracts extra data stored for a range table entry
* that previously has been stored with
* Set/ModifyRangeTblExtraData. Parameters can be NULL if uninteresting. It is
* valid to use the function on an RTE without extra data.
*/
void
ExtractRangeTblExtraData(RangeTblEntry *rte, CitusRTEKind *rteKind,
char **fragmentSchemaName, char **fragmentTableName,
List **tableIdList)
{
RangeTblFunction *fauxFunction = NULL;
FuncExpr *fauxFuncExpr = NULL;
Const *tmpConst = NULL;
/* set base rte kind first, so this can be used for 'non-extended' RTEs as well */
if (rteKind != NULL)
{
*rteKind = (CitusRTEKind) rte->rtekind;
}
/* reset values of optionally-present fields, will later be overwritten, if present */
if (fragmentSchemaName != NULL)
{
*fragmentSchemaName = NULL;
}
if (fragmentTableName != NULL)
{
*fragmentTableName = NULL;
}
if (tableIdList != NULL)
{
*tableIdList = NIL;
}
/* only function RTEs have our special extra data */
if (rte->rtekind != RTE_FUNCTION)
{
return;
}
/* we only ever generate one argument */
if (list_length(rte->functions) != 1)
{
return;
}
/* should pretty much always be a FuncExpr, but be liberal in what we expect... */
fauxFunction = linitial(rte->functions);
if (!IsA(fauxFunction->funcexpr, FuncExpr))
{
return;
}
fauxFuncExpr = (FuncExpr *) fauxFunction->funcexpr;
/*
	 * A range table entry with this function id can only have been created
	 * by the routines in this file; plain queries never contain one.
*/
if (fauxFuncExpr->funcid != CitusExtraDataContainerFuncId())
{
return;
}
/*
* Extra data for rtes is stored in the function arguments. The first
* argument stores the rtekind, second fragmentSchemaName, third
* fragmentTableName, fourth tableIdList.
*/
if (list_length(fauxFuncExpr->args) != 4)
{
ereport(ERROR, (errmsg("unexpected number of function arguments to "
"citusdb_extradata_container")));
return;
}
/* extract rteKind */
tmpConst = (Const *) linitial(fauxFuncExpr->args);
Assert(IsA(tmpConst, Const));
Assert(tmpConst->consttype == INT4OID);
if (rteKind != NULL)
{
*rteKind = DatumGetInt32(tmpConst->constvalue);
}
/* extract fragmentSchemaName */
tmpConst = (Const *) lsecond(fauxFuncExpr->args);
Assert(IsA(tmpConst, Const));
Assert(tmpConst->consttype == CSTRINGOID);
if (fragmentSchemaName != NULL && !tmpConst->constisnull)
{
*fragmentSchemaName = DatumGetCString(tmpConst->constvalue);
}
/* extract fragmentTableName */
tmpConst = (Const *) lthird(fauxFuncExpr->args);
Assert(IsA(tmpConst, Const));
Assert(tmpConst->consttype == CSTRINGOID);
if (fragmentTableName != NULL && !tmpConst->constisnull)
{
*fragmentTableName = DatumGetCString(tmpConst->constvalue);
}
/* extract tableIdList, stored as a serialized integer list */
tmpConst = (Const *) lfourth(fauxFuncExpr->args);
Assert(IsA(tmpConst, Const));
Assert(tmpConst->consttype == CSTRINGOID);
if (tableIdList != NULL && !tmpConst->constisnull)
{
Node *deserializedList = stringToNode(DatumGetCString(tmpConst->constvalue));
Assert(IsA(deserializedList, IntList));
*tableIdList = (List *) deserializedList;
}
}
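/*
 * Example round trip (a sketch under the same hypothetical guard as above):
 * for an RTE previously marked by ExampleMarkShardFragment, the stored values
 * come back unchanged; unneeded output parameters may simply be NULL.
 */
#ifdef CITUS_RTE_EXAMPLES
static void
ExampleExtractShardFragment(RangeTblEntry *rte)
{
	CitusRTEKind rteKind = CITUS_RTE_RELATION;
	char *schemaName = NULL;
	char *fragmentName = NULL;

	ExtractRangeTblExtraData(rte, &rteKind, &schemaName, &fragmentName, NULL);
	Assert(rteKind == CITUS_RTE_SHARD);
	elog(DEBUG2, "fragment %s.%s", schemaName, fragmentName);
}
#endif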
/*
* ModifyRangeTblExtraData sets the RTE extra data fields for the passed
* fields, leaving the current values in place for the ones not specified.
*
 * rteKind has to be specified; fragmentSchemaName, fragmentTableName, and
 * tableIdList can be set to NULL/NIL respectively to leave the current
 * values in place.
*/
void
ModifyRangeTblExtraData(RangeTblEntry *rte, CitusRTEKind rteKind,
char *fragmentSchemaName, char *fragmentTableName,
List *tableIdList)
{
/* load existing values for the arguments not specifying a new value */
ExtractRangeTblExtraData(rte, NULL,
fragmentSchemaName == NULL ? &fragmentSchemaName : NULL,
fragmentTableName == NULL ? &fragmentTableName : NULL,
tableIdList == NIL ? &tableIdList : NULL);
SetRangeTblExtraData(rte, rteKind,
fragmentSchemaName, fragmentTableName,
tableIdList);
}
/* GetRangeTblKind returns the rtekind of an RTE, be it an extended one or not. */
CitusRTEKind
GetRangeTblKind(RangeTblEntry *rte)
{
	CitusRTEKind rteKind = CITUS_RTE_RELATION; /* dummy value, always overwritten */
	switch (rte->rtekind)
	{
		/* use the rtekind directly if it cannot be an extended RTE */
case RTE_RELATION:
case RTE_SUBQUERY:
case RTE_JOIN:
case RTE_VALUES:
case RTE_CTE:
rteKind = (CitusRTEKind) rte->rtekind;
break;
case RTE_FUNCTION:
/*
* Extract extra data - correct even if a plain RTE_FUNCTION, not
* an extended one, ExtractRangeTblExtraData handles that case
* transparently.
*/
ExtractRangeTblExtraData(rte, &rteKind, NULL, NULL, NULL);
break;
}
return rteKind;
}
/*
* citusdb_extradata_container is a placeholder function to store information
* needed by CitusDB in plain postgres node trees. Executor and other hooks
* should always intercept statements containing calls to this function. It's
* not actually SQL callable by the user because of an INTERNAL argument.
*/
Datum
citusdb_extradata_container(PG_FUNCTION_ARGS)
{
ereport(ERROR, (errmsg("not supposed to get here, did you cheat?")));
PG_RETURN_NULL();
}


@ -0,0 +1,595 @@
/*-------------------------------------------------------------------------
*
* citus_outfuncs.c
* Output functions for CitusDB tree nodes.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2012-2015, Citus Data, Inc.
*
* NOTES
* This is a wrapper around postgres' nodeToString() that additionally
* supports CitusDB node types.
*
* Keep as closely aligned with the upstream version as possible.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include "distributed/citus_nodefuncs.h"
#include "distributed/citus_nodes.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/master_metadata_utility.h"
#include "lib/stringinfo.h"
#include "nodes/plannodes.h"
#include "nodes/relation.h"
#include "utils/datum.h"
/*
* Macros to simplify output of different kinds of fields. Use these
* wherever possible to reduce the chance for silly typos. Note that these
* hard-wire conventions about the names of the local variables in an Out
* routine.
*/
/* Write the label for the node type */
#define WRITE_NODE_TYPE(nodelabel) \
appendStringInfoString(str, nodelabel)
/* Write an integer field (anything written as ":fldname %d") */
#define WRITE_INT_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)
/* Write an unsigned integer field (anything written as ":fldname %u") */
#define WRITE_UINT_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)
/* XXX: Citus: Write an unsigned 64-bit integer field */
#define WRITE_UINT64_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " " UINT64_FORMAT, node->fldname)
/* Write an OID field (don't hard-wire assumption that OID is same as uint) */
#define WRITE_OID_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)
/* Write a long-integer field */
#define WRITE_LONG_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %ld", node->fldname)
/* Write a char field (ie, one ascii character) */
#define WRITE_CHAR_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %c", node->fldname)
/* Write an enumerated-type field as an integer code */
#define WRITE_ENUM_FIELD(fldname, enumtype) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", \
(int) node->fldname)
/* Write a float field --- caller must give format to define precision */
#define WRITE_FLOAT_FIELD(fldname,format) \
appendStringInfo(str, " :" CppAsString(fldname) " " format, node->fldname)
/* Write a boolean field */
#define WRITE_BOOL_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %s", \
booltostr(node->fldname))
/* Write a character-string (possibly NULL) field */
#define WRITE_STRING_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
_outToken(str, node->fldname))
/* Write a parse location field (actually same as INT case) */
#define WRITE_LOCATION_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname)
/* Write a Node field */
#define WRITE_NODE_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
_outNode(str, node->fldname))
/* Write a bitmapset field */
#define WRITE_BITMAPSET_FIELD(fldname) \
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
_outBitmapset(str, node->fldname))
#define booltostr(x) ((x) ? "true" : "false")
static void _outNode(StringInfo str, const void *obj);
/*
* _outToken
* Convert an ordinary string (eg, an identifier) into a form that
* will be decoded back to a plain token by read.c's functions.
*
* If a null or empty string is given, it is encoded as "<>".
*/
static void
_outToken(StringInfo str, const char *s)
{
if (s == NULL || *s == '\0')
{
appendStringInfoString(str, "<>");
return;
}
/*
* Look for characters or patterns that are treated specially by read.c
* (either in pg_strtok() or in nodeRead()), and therefore need a
* protective backslash.
*/
/* These characters only need to be quoted at the start of the string */
if (*s == '<' ||
*s == '\"' ||
isdigit((unsigned char) *s) ||
((*s == '+' || *s == '-') &&
(isdigit((unsigned char) s[1]) || s[1] == '.')))
appendStringInfoChar(str, '\\');
while (*s)
{
/* These chars must be backslashed anywhere in the string */
if (*s == ' ' || *s == '\n' || *s == '\t' ||
*s == '(' || *s == ')' || *s == '{' || *s == '}' ||
*s == '\\')
appendStringInfoChar(str, '\\');
appendStringInfoChar(str, *s++);
}
}
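/*
 * For illustration, some encodings produced by the rules above: "relname"
 * is emitted unchanged, "two words" becomes "two\ words", and "<tag"
 * becomes "\<tag" because '<' is special at the start of a token.
 */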
static void
_outList(StringInfo str, const List *node)
{
const ListCell *lc;
appendStringInfoChar(str, '(');
if (IsA(node, IntList))
appendStringInfoChar(str, 'i');
else if (IsA(node, OidList))
appendStringInfoChar(str, 'o');
foreach(lc, node)
{
/*
* For the sake of backward compatibility, we emit a slightly
* different whitespace format for lists of nodes vs. other types of
* lists. XXX: is this necessary?
*/
if (IsA(node, List))
{
_outNode(str, lfirst(lc));
if (lnext(lc))
appendStringInfoChar(str, ' ');
}
else if (IsA(node, IntList))
appendStringInfo(str, " %d", lfirst_int(lc));
else if (IsA(node, OidList))
appendStringInfo(str, " %u", lfirst_oid(lc));
else
elog(ERROR, "unrecognized list node type: %d",
(int) node->type);
}
appendStringInfoChar(str, ')');
}
/*
* Print the value of a Datum given its type.
*/
static void
_outDatum(StringInfo str, Datum value, int typlen, bool typbyval)
{
Size length,
i;
char *s;
length = datumGetSize(value, typbyval, typlen);
if (typbyval)
{
s = (char *) (&value);
appendStringInfo(str, "%u [ ", (unsigned int) length);
for (i = 0; i < (Size) sizeof(Datum); i++)
appendStringInfo(str, "%d ", (int) (s[i]));
appendStringInfoChar(str, ']');
}
else
{
s = (char *) DatumGetPointer(value);
if (!PointerIsValid(s))
appendStringInfoString(str, "0 [ ]");
else
{
appendStringInfo(str, "%u [ ", (unsigned int) length);
for (i = 0; i < length; i++)
appendStringInfo(str, "%d ", (int) (s[i]));
appendStringInfoChar(str, ']');
}
}
}
/*****************************************************************************
* Output routines for CitusDB node types
*****************************************************************************/
static void
_outMultiUnaryNode(StringInfo str, const MultiUnaryNode *node)
{
WRITE_NODE_FIELD(childNode);
}
static void
_outMultiBinaryNode(StringInfo str, const MultiBinaryNode *node)
{
WRITE_NODE_FIELD(leftChildNode);
WRITE_NODE_FIELD(rightChildNode);
}
static void
_outMultiTreeRoot(StringInfo str, const MultiTreeRoot *node)
{
WRITE_NODE_TYPE("MULTITREEROOT");
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiPlan(StringInfo str, const MultiPlan *node)
{
WRITE_NODE_TYPE("MULTIPLAN");
WRITE_NODE_FIELD(workerJob);
WRITE_NODE_FIELD(masterQuery);
WRITE_STRING_FIELD(masterTableName);
}
static void
_outMultiProject(StringInfo str, const MultiProject *node)
{
WRITE_NODE_TYPE("MULTIPROJECT");
WRITE_NODE_FIELD(columnList);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiCollect(StringInfo str, const MultiCollect *node)
{
WRITE_NODE_TYPE("MULTICOLLECT");
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiSelect(StringInfo str, const MultiSelect *node)
{
WRITE_NODE_TYPE("MULTISELECT");
WRITE_NODE_FIELD(selectClauseList);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiTable(StringInfo str, const MultiTable *node)
{
WRITE_NODE_TYPE("MULTITABLE");
WRITE_OID_FIELD(relationId);
WRITE_INT_FIELD(rangeTableId);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiJoin(StringInfo str, const MultiJoin *node)
{
WRITE_NODE_TYPE("MULTIJOIN");
WRITE_NODE_FIELD(joinClauseList);
WRITE_ENUM_FIELD(joinRuleType, JoinRuleType);
WRITE_ENUM_FIELD(joinType, JoinType);
_outMultiBinaryNode(str, (const MultiBinaryNode *) node);
}
static void
_outMultiPartition(StringInfo str, const MultiPartition *node)
{
WRITE_NODE_TYPE("MULTIPARTITION");
WRITE_NODE_FIELD(partitionColumn);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outMultiCartesianProduct(StringInfo str, const MultiCartesianProduct *node)
{
WRITE_NODE_TYPE("MULTICARTESIANPRODUCT");
_outMultiBinaryNode(str, (const MultiBinaryNode *) node);
}
static void
_outMultiExtendedOp(StringInfo str, const MultiExtendedOp *node)
{
WRITE_NODE_TYPE("MULTIEXTENDEDOP");
WRITE_NODE_FIELD(targetList);
WRITE_NODE_FIELD(groupClauseList);
WRITE_NODE_FIELD(sortClauseList);
WRITE_NODE_FIELD(limitCount);
WRITE_NODE_FIELD(limitOffset);
_outMultiUnaryNode(str, (const MultiUnaryNode *) node);
}
static void
_outJobInfo(StringInfo str, const Job *node)
{
WRITE_UINT64_FIELD(jobId);
WRITE_NODE_FIELD(jobQuery);
WRITE_NODE_FIELD(taskList);
WRITE_NODE_FIELD(dependedJobList);
WRITE_BOOL_FIELD(subqueryPushdown);
}
static void
_outJob(StringInfo str, const Job *node)
{
WRITE_NODE_TYPE("JOB");
_outJobInfo(str, node);
}
static void
_outShardInterval(StringInfo str, const ShardInterval *node)
{
WRITE_NODE_TYPE("SHARDINTERVAL");
WRITE_OID_FIELD(relationId);
WRITE_CHAR_FIELD(storageType);
WRITE_OID_FIELD(valueTypeId);
WRITE_INT_FIELD(valueTypeLen);
WRITE_BOOL_FIELD(valueByVal);
WRITE_BOOL_FIELD(minValueExists);
WRITE_BOOL_FIELD(maxValueExists);
appendStringInfoString(str, " :minValue ");
if (!node->minValueExists)
appendStringInfoString(str, "<>");
else
_outDatum(str, node->minValue, node->valueTypeLen, node->valueByVal);
appendStringInfoString(str, " :maxValue ");
if (!node->maxValueExists)
appendStringInfoString(str, "<>");
else
_outDatum(str, node->maxValue, node->valueTypeLen, node->valueByVal);
WRITE_UINT64_FIELD(shardId);
}
static void
_outMapMergeJob(StringInfo str, const MapMergeJob *node)
{
int arrayLength = node->sortedShardIntervalArrayLength;
int i;
WRITE_NODE_TYPE("MAPMERGEJOB");
_outJobInfo(str, (Job *) node);
WRITE_NODE_FIELD(reduceQuery);
WRITE_ENUM_FIELD(partitionType, PartitionType);
WRITE_NODE_FIELD(partitionColumn);
WRITE_UINT_FIELD(partitionCount);
WRITE_INT_FIELD(sortedShardIntervalArrayLength);
for (i = 0; i < arrayLength; ++i)
{
ShardInterval *writeElement = node->sortedShardIntervalArray[i];
_outShardInterval(str, writeElement);
}
WRITE_NODE_FIELD(mapTaskList);
WRITE_NODE_FIELD(mergeTaskList);
}
static void
_outShardPlacement(StringInfo str, const ShardPlacement *node)
{
WRITE_NODE_TYPE("SHARDPLACEMENT");
WRITE_OID_FIELD(tupleOid);
WRITE_UINT64_FIELD(shardId);
WRITE_UINT64_FIELD(shardLength);
WRITE_ENUM_FIELD(shardState, RelayFileState);
WRITE_STRING_FIELD(nodeName);
WRITE_UINT_FIELD(nodePort);
}
static void
_outTask(StringInfo str, const Task *node)
{
WRITE_NODE_TYPE("TASK");
WRITE_ENUM_FIELD(taskType, TaskType);
WRITE_UINT64_FIELD(jobId);
WRITE_UINT_FIELD(taskId);
WRITE_STRING_FIELD(queryString);
WRITE_UINT64_FIELD(anchorShardId);
WRITE_NODE_FIELD(taskPlacementList);
WRITE_NODE_FIELD(dependedTaskList);
WRITE_UINT_FIELD(partitionId);
WRITE_UINT_FIELD(upstreamTaskId);
WRITE_NODE_FIELD(shardInterval);
WRITE_BOOL_FIELD(assignmentConstrained);
WRITE_NODE_FIELD(taskExecution);
WRITE_BOOL_FIELD(upsertQuery);
}
/*
* _outNode -
 *	  converts a Node into an ascii string and appends it to 'str'
*/
static void
_outNode(StringInfo str, const void *obj)
{
if (obj == NULL)
{
appendStringInfoString(str, "<>");
return;
}
switch (CitusNodeTag(obj))
{
case T_List:
case T_IntList:
case T_OidList:
_outList(str, obj);
break;
case T_MultiTreeRoot:
appendStringInfoChar(str, '{');
_outMultiTreeRoot(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiProject:
appendStringInfoChar(str, '{');
_outMultiProject(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiCollect:
appendStringInfoChar(str, '{');
_outMultiCollect(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiSelect:
appendStringInfoChar(str, '{');
_outMultiSelect(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiTable:
appendStringInfoChar(str, '{');
_outMultiTable(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiJoin:
appendStringInfoChar(str, '{');
_outMultiJoin(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiPartition:
appendStringInfoChar(str, '{');
_outMultiPartition(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiCartesianProduct:
appendStringInfoChar(str, '{');
_outMultiCartesianProduct(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiExtendedOp:
appendStringInfoChar(str, '{');
_outMultiExtendedOp(str, obj);
appendStringInfoChar(str, '}');
break;
case T_Job:
appendStringInfoChar(str, '{');
_outJob(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MapMergeJob:
appendStringInfoChar(str, '{');
_outMapMergeJob(str, obj);
appendStringInfoChar(str, '}');
break;
case T_MultiPlan:
appendStringInfoChar(str, '{');
_outMultiPlan(str, obj);
appendStringInfoChar(str, '}');
break;
case T_Task:
appendStringInfoChar(str, '{');
_outTask(str, obj);
appendStringInfoChar(str, '}');
break;
case T_ShardInterval:
appendStringInfoChar(str, '{');
_outShardInterval(str, obj);
appendStringInfoChar(str, '}');
break;
case T_ShardPlacement:
appendStringInfoChar(str, '{');
_outShardPlacement(str, obj);
appendStringInfoChar(str, '}');
break;
default:
/* fall back into postgres' normal nodeToString machinery */
appendStringInfoString(str, nodeToString(obj));
}
}
/*
* CitusNodeToString -
* returns the ascii representation of the Node as a palloc'd string
*/
char *
CitusNodeToString(const void *obj)
{
StringInfoData str;
initStringInfo(&str);
_outNode(&str, obj);
return str.data;
}
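/*
 * Usage sketch (hypothetical caller): serializing a planner node for
 * debugging output.
 *
 *     char *serialized = CitusNodeToString(multiPlan);
 *     elog(DEBUG2, "distributed plan: %s", serialized);
 *
 * The string can be turned back into a node tree with CitusStringToNode()
 * from citus_read.c.
 */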


@ -0,0 +1,348 @@
/*-------------------------------------------------------------------------
*
* citus_read.c
* Citus version of postgres' read.c, using a different state variable for
* citus_pg_strtok.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2012-2015, Citus Data, Inc.
*
* NOTES
 *    Unfortunately we have to copy this file as the state variable for
 *    pg_strtok is not externally accessible. That prevents creating a
 *    version of stringToNode() that calls CitusNodeRead() instead of
 *    nodeRead(). Luckily these functions seldom change.
*
* Keep as closely aligned with the upstream version as possible.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include "nodes/pg_list.h"
#include "nodes/readfuncs.h"
#include "distributed/citus_nodefuncs.h"
#include "nodes/value.h"
/* Static state for citus_pg_strtok */
static char *citus_pg_strtok_ptr = NULL;
/*
* CitusStringToNode -
* returns a Node with a given legal ASCII representation
*/
void *
CitusStringToNode(char *str)
{
char *save_strtok;
void *retval;
/*
* We save and restore the pre-existing state of citus_pg_strtok. This makes the
* world safe for re-entrant invocation of stringToNode, without incurring
* a lot of notational overhead by having to pass the next-character
* pointer around through all the readfuncs.c code.
*/
save_strtok = citus_pg_strtok_ptr;
citus_pg_strtok_ptr = str; /* point citus_pg_strtok at the string to read */
retval = CitusNodeRead(NULL, 0); /* do the reading */
citus_pg_strtok_ptr = save_strtok;
return retval;
}
/*
* citus_pg_strtok is a copy of postgres' pg_strtok routine, referencing
* citus_pg_strtok_ptr instead of pg_strtok_ptr as state.
*/
char *
citus_pg_strtok(int *length)
{
char *local_str; /* working pointer to string */
char *ret_str; /* start of token to return */
local_str = citus_pg_strtok_ptr;
while (*local_str == ' ' || *local_str == '\n' || *local_str == '\t')
local_str++;
if (*local_str == '\0')
{
*length = 0;
citus_pg_strtok_ptr = local_str;
return NULL; /* no more tokens */
}
/*
* Now pointing at start of next token.
*/
ret_str = local_str;
if (*local_str == '(' || *local_str == ')' ||
*local_str == '{' || *local_str == '}')
{
/* special 1-character token */
local_str++;
}
else
{
/* Normal token, possibly containing backslashes */
while (*local_str != '\0' &&
*local_str != ' ' && *local_str != '\n' &&
*local_str != '\t' &&
*local_str != '(' && *local_str != ')' &&
*local_str != '{' && *local_str != '}')
{
if (*local_str == '\\' && local_str[1] != '\0')
local_str += 2;
else
local_str++;
}
}
*length = local_str - ret_str;
/* Recognize special case for "empty" token */
if (*length == 2 && ret_str[0] == '<' && ret_str[1] == '>')
*length = 0;
citus_pg_strtok_ptr = local_str;
return ret_str;
}
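/*
 * Tokenization example (worked by hand): for the input "(i 42 7)",
 * successive citus_pg_strtok calls return "(", "i", "42", "7", ")" and
 * finally NULL. The special token "<>" is returned with *length set to 0,
 * which callers interpret as a NULL pointer.
 */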
#define RIGHT_PAREN (1000000 + 1)
#define LEFT_PAREN (1000000 + 2)
#define LEFT_BRACE (1000000 + 3)
#define OTHER_TOKEN (1000000 + 4)
/*
* nodeTokenType -
* returns the type of the node token contained in token.
* It returns one of the following valid NodeTags:
* T_Integer, T_Float, T_String, T_BitString
* and some of its own:
* RIGHT_PAREN, LEFT_PAREN, LEFT_BRACE, OTHER_TOKEN
*
* Assumption: the ascii representation is legal
*/
static NodeTag
nodeTokenType(char *token, int length)
{
NodeTag retval;
char *numptr;
int numlen;
/*
* Check if the token is a number
*/
numptr = token;
numlen = length;
if (*numptr == '+' || *numptr == '-')
numptr++, numlen--;
if ((numlen > 0 && isdigit((unsigned char) *numptr)) ||
(numlen > 1 && *numptr == '.' && isdigit((unsigned char) numptr[1])))
{
/*
* Yes. Figure out whether it is integral or float; this requires
* both a syntax check and a range check. strtol() can do both for us.
* We know the token will end at a character that strtol will stop at,
* so we do not need to modify the string.
*/
long val;
char *endptr;
errno = 0;
val = strtol(token, &endptr, 10);
(void) val; /* avoid compiler warning if unused */
if (endptr != token + length || errno == ERANGE
#ifdef HAVE_LONG_INT_64
/* if long > 32 bits, check for overflow of int4 */
|| val != (long) ((int32) val)
#endif
)
return T_Float;
return T_Integer;
}
/*
* these three cases do not need length checks, since citus_pg_strtok() will
* always treat them as single-byte tokens
*/
else if (*token == '(')
retval = LEFT_PAREN;
else if (*token == ')')
retval = RIGHT_PAREN;
else if (*token == '{')
retval = LEFT_BRACE;
else if (*token == '\"' && length > 1 && token[length - 1] == '\"')
retval = T_String;
else if (*token == 'b')
retval = T_BitString;
else
retval = OTHER_TOKEN;
return retval;
}
/*
* CitusNodeRead is an adapted copy of postgres' nodeRead routine, using
* citus_pg_strtok_ptr instead of pg_strtok_ptr.
*/
void *
CitusNodeRead(char *token, int tok_len)
{
Node *result;
NodeTag type;
if (token == NULL) /* need to read a token? */
{
token = citus_pg_strtok(&tok_len);
if (token == NULL) /* end of input */
return NULL;
}
type = nodeTokenType(token, tok_len);
switch ((int) type)
{
case LEFT_BRACE:
result = CitusParseNodeString();
token = citus_pg_strtok(&tok_len);
if (token == NULL || token[0] != '}')
elog(ERROR, "did not find '}' at end of input node");
break;
case LEFT_PAREN:
{
List *l = NIL;
/*----------
* Could be an integer list: (i int int ...)
* or an OID list: (o int int ...)
* or a list of nodes/values: (node node ...)
*----------
*/
token = citus_pg_strtok(&tok_len);
if (token == NULL)
elog(ERROR, "unterminated List structure");
if (tok_len == 1 && token[0] == 'i')
{
/* List of integers */
for (;;)
{
int val;
char *endptr;
token = citus_pg_strtok(&tok_len);
if (token == NULL)
elog(ERROR, "unterminated List structure");
if (token[0] == ')')
break;
val = (int) strtol(token, &endptr, 10);
if (endptr != token + tok_len)
elog(ERROR, "unrecognized integer: \"%.*s\"",
tok_len, token);
l = lappend_int(l, val);
}
}
else if (tok_len == 1 && token[0] == 'o')
{
/* List of OIDs */
for (;;)
{
Oid val;
char *endptr;
token = citus_pg_strtok(&tok_len);
if (token == NULL)
elog(ERROR, "unterminated List structure");
if (token[0] == ')')
break;
val = (Oid) strtoul(token, &endptr, 10);
if (endptr != token + tok_len)
elog(ERROR, "unrecognized OID: \"%.*s\"",
tok_len, token);
l = lappend_oid(l, val);
}
}
else
{
/* List of other node types */
for (;;)
{
/* We have already scanned next token... */
if (token[0] == ')')
break;
l = lappend(l, CitusNodeRead(token, tok_len));
token = citus_pg_strtok(&tok_len);
if (token == NULL)
elog(ERROR, "unterminated List structure");
}
}
result = (Node *) l;
break;
}
case RIGHT_PAREN:
elog(ERROR, "unexpected right parenthesis");
result = NULL; /* keep compiler happy */
break;
case OTHER_TOKEN:
if (tok_len == 0)
{
/* must be "<>" --- represents a null pointer */
result = NULL;
}
else
{
elog(ERROR, "unrecognized token: \"%.*s\"", tok_len, token);
result = NULL; /* keep compiler happy */
}
break;
case T_Integer:
/*
* we know that the token terminates on a char atol will stop at
*/
result = (Node *) makeInteger(atol(token));
break;
case T_Float:
{
char *fval = (char *) palloc(tok_len + 1);
memcpy(fval, token, tok_len);
fval[tok_len] = '\0';
result = (Node *) makeFloat(fval);
}
break;
case T_String:
/* need to remove leading and trailing quotes, and backslashes */
result = (Node *) makeString(debackslash(token + 1, tok_len - 2));
break;
case T_BitString:
{
char *val = palloc(tok_len);
/* skip leading 'b' */
memcpy(val, token + 1, tok_len - 1);
val[tok_len - 1] = '\0';
result = (Node *) makeBitString(val);
break;
}
default:
elog(ERROR, "unrecognized node type: %d", (int) type);
result = NULL; /* keep compiler happy */
break;
}
return (void *) result;
}
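/*
 * Round-trip sketch (hypothetical usage, not exercised in this file):
 *
 *     char *serialized = CitusNodeToString(node);
 *     void *copy = CitusStringToNode(serialized);
 *
 * For the node types handled in this directory, the copy is equivalent to
 * the original node.
 */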

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,560 @@
/*-------------------------------------------------------------------------
*
* citus_ruleutils.c
* Version independent ruleutils wrapper
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <unistd.h>
#include <fcntl.h>
#include "access/htup_details.h"
#include "access/sysattr.h"
#include "catalog/dependency.h"
#include "catalog/indexing.h"
#include "catalog/pg_aggregate.h"
#include "catalog/pg_extension.h"
#include "catalog/pg_foreign_data_wrapper.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/citus_ruleutils.h"
#include "commands/defrem.h"
#include "commands/extension.h"
#include "foreign/foreign.h"
#include "funcapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/tlist.h"
#include "parser/keywords.h"
#include "parser/parse_agg.h"
#include "parser/parse_func.h"
#include "parser/parse_oper.h"
#include "parser/parser.h"
#include "parser/parsetree.h"
#include "rewrite/rewriteHandler.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#if (PG_VERSION_NUM >= 90500)
#include "utils/ruleutils.h"
#endif
#include "utils/syscache.h"
#include "utils/typcache.h"
#include "utils/xml.h"
static Oid get_extension_schema(Oid ext_oid);
static void AppendOptionListToString(StringInfo stringData, List *options);
/*
* pg_get_extensiondef_string finds the foreign data wrapper that corresponds to
* the given foreign tableId, and checks if an extension owns this foreign data
* wrapper. If it does, the function returns the extension's definition. If not,
* the function returns null.
*/
char *
pg_get_extensiondef_string(Oid tableRelationId)
{
ForeignTable *foreignTable = GetForeignTable(tableRelationId);
ForeignServer *server = GetForeignServer(foreignTable->serverid);
ForeignDataWrapper *foreignDataWrapper = GetForeignDataWrapper(server->fdwid);
StringInfoData buffer = { NULL, 0, 0, 0 };
Oid classId = ForeignDataWrapperRelationId;
Oid objectId = server->fdwid;
Oid extensionId = getExtensionOfObject(classId, objectId);
if (OidIsValid(extensionId))
{
char *extensionName = get_extension_name(extensionId);
Oid extensionSchemaId = get_extension_schema(extensionId);
char *extensionSchema = get_namespace_name(extensionSchemaId);
initStringInfo(&buffer);
appendStringInfo(&buffer, "CREATE EXTENSION IF NOT EXISTS %s WITH SCHEMA %s",
quote_identifier(extensionName),
quote_identifier(extensionSchema));
}
else
{
ereport(NOTICE, (errmsg("foreign-data wrapper \"%s\" does not have an "
"extension defined", foreignDataWrapper->fdwname)));
}
return (buffer.data);
}
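/*
 * For a foreign table backed by, say, cstore_fdw installed in schema public
 * (an illustrative example), this function would return something like:
 *
 *     CREATE EXTENSION IF NOT EXISTS cstore_fdw WITH SCHEMA public
 */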
/*
* get_extension_schema - given an extension OID, fetch its extnamespace
*
* Returns InvalidOid if no such extension.
*/
static Oid
get_extension_schema(Oid ext_oid)
{
Oid result;
Relation rel;
SysScanDesc scandesc;
HeapTuple tuple;
ScanKeyData entry[1];
rel = heap_open(ExtensionRelationId, AccessShareLock);
ScanKeyInit(&entry[0],
ObjectIdAttributeNumber,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(ext_oid));
scandesc = systable_beginscan(rel, ExtensionOidIndexId, true,
NULL, 1, entry);
tuple = systable_getnext(scandesc);
/* We assume that there can be at most one matching tuple */
if (HeapTupleIsValid(tuple))
result = ((Form_pg_extension) GETSTRUCT(tuple))->extnamespace;
else
result = InvalidOid;
systable_endscan(scandesc);
heap_close(rel, AccessShareLock);
return result;
}
/*
* pg_get_serverdef_string finds the foreign server that corresponds to the
* given foreign tableId, and returns this server's definition.
*/
char *
pg_get_serverdef_string(Oid tableRelationId)
{
ForeignTable *foreignTable = GetForeignTable(tableRelationId);
ForeignServer *server = GetForeignServer(foreignTable->serverid);
ForeignDataWrapper *foreignDataWrapper = GetForeignDataWrapper(server->fdwid);
StringInfoData buffer = { NULL, 0, 0, 0 };
initStringInfo(&buffer);
appendStringInfo(&buffer, "CREATE SERVER %s", quote_identifier(server->servername));
if (server->servertype != NULL)
{
appendStringInfo(&buffer, " TYPE %s",
quote_literal_cstr(server->servertype));
}
if (server->serverversion != NULL)
{
appendStringInfo(&buffer, " VERSION %s",
quote_literal_cstr(server->serverversion));
}
appendStringInfo(&buffer, " FOREIGN DATA WRAPPER %s",
quote_identifier(foreignDataWrapper->fdwname));
/* append server options, if any */
AppendOptionListToString(&buffer, server->options);
return (buffer.data);
}
/*
* AppendOptionListToString converts the option list to its textual format, and
* appends this text to the given string buffer.
*/
static void
AppendOptionListToString(StringInfo stringBuffer, List *optionList)
{
if (optionList != NIL)
{
ListCell *optionCell = NULL;
bool firstOptionPrinted = false;
appendStringInfo(stringBuffer, " OPTIONS (");
foreach(optionCell, optionList)
{
DefElem *option = (DefElem*) lfirst(optionCell);
char *optionName = option->defname;
char *optionValue = defGetString(option);
if (firstOptionPrinted)
{
appendStringInfo(stringBuffer, ", ");
}
firstOptionPrinted = true;
appendStringInfo(stringBuffer, "%s ", quote_identifier(optionName));
appendStringInfo(stringBuffer, "%s", quote_literal_cstr(optionValue));
}
appendStringInfo(stringBuffer, ")");
}
}
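/*
 * For example (made-up options), a list containing (host 'localhost') and
 * (port '5432') renders as:
 *
 *     OPTIONS (host 'localhost', port '5432')
 */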
/*
* pg_get_tableschemadef_string returns the definition of a given table. This
* definition includes table's schema, default column values, not null and check
* constraints. The definition does not include constraints that trigger index
* creations; specifically, unique and primary key constraints are excluded.
*/
char *
pg_get_tableschemadef_string(Oid tableRelationId)
{
Relation relation = NULL;
char *relationName = NULL;
char relationKind = 0;
TupleDesc tupleDescriptor = NULL;
TupleConstr *tupleConstraints = NULL;
int attributeIndex = 0;
bool firstAttributePrinted = false;
AttrNumber defaultValueIndex = 0;
AttrNumber constraintIndex = 0;
AttrNumber constraintCount = 0;
StringInfoData buffer = { NULL, 0, 0, 0 };
/*
* Instead of retrieving values from system catalogs as other functions in
* ruleutils.c do, we follow an unusual approach here: we open the relation,
* and fetch the relation's tuple descriptor. We do this because the tuple
* descriptor already contains information harnessed from pg_attrdef,
* pg_attribute, pg_constraint, and pg_class; and therefore using the
* descriptor saves us from a lot of additional work.
*/
relation = relation_open(tableRelationId, AccessShareLock);
relationName = generate_relation_name(tableRelationId, NIL);
relationKind = relation->rd_rel->relkind;
if (relationKind != RELKIND_RELATION && relationKind != RELKIND_FOREIGN_TABLE)
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("%s is not a regular or foreign table", relationName)));
}
initStringInfo(&buffer);
if (relationKind == RELKIND_RELATION)
{
appendStringInfo(&buffer, "CREATE TABLE %s (", relationName);
}
else
{
appendStringInfo(&buffer, "CREATE FOREIGN TABLE %s (", relationName);
}
/*
* Iterate over the table's columns. If a particular column is not dropped
* and is not inherited from another table, print the column's name and its
* formatted type.
*/
tupleDescriptor = RelationGetDescr(relation);
tupleConstraints = tupleDescriptor->constr;
for (attributeIndex = 0; attributeIndex < tupleDescriptor->natts; attributeIndex++)
{
Form_pg_attribute attributeForm = tupleDescriptor->attrs[attributeIndex];
const char *attributeName = NULL;
const char *attributeTypeName = NULL;
if (!attributeForm->attisdropped && attributeForm->attinhcount == 0)
{
if (firstAttributePrinted)
{
appendStringInfoString(&buffer, ", ");
}
firstAttributePrinted = true;
attributeName = NameStr(attributeForm->attname);
appendStringInfo(&buffer, "%s ", quote_identifier(attributeName));
attributeTypeName = format_type_with_typemod(attributeForm->atttypid,
attributeForm->atttypmod);
appendStringInfoString(&buffer, attributeTypeName);
/* if this column has a default value, append the default value */
if (attributeForm->atthasdef)
{
AttrDefault *defaultValueList = NULL;
AttrDefault *defaultValue = NULL;
Node *defaultNode = NULL;
List *defaultContext = NULL;
char *defaultString = NULL;
Assert(tupleConstraints != NULL);
defaultValueList = tupleConstraints->defval;
Assert(defaultValueList != NULL);
defaultValue = &(defaultValueList[defaultValueIndex]);
defaultValueIndex++;
Assert(defaultValue->adnum == (attributeIndex + 1));
Assert(defaultValueIndex <= tupleConstraints->num_defval);
/* convert expression to node tree, and prepare deparse context */
defaultNode = (Node *) stringToNode(defaultValue->adbin);
defaultContext = deparse_context_for(relationName, tableRelationId);
/* deparse default value string */
defaultString = deparse_expression(defaultNode, defaultContext,
false, false);
appendStringInfo(&buffer, " DEFAULT %s", defaultString);
}
/* if this column has a not null constraint, append the constraint */
if (attributeForm->attnotnull)
{
appendStringInfoString(&buffer, " NOT NULL");
}
}
}
/*
* Now check if the table has any constraints. If it does, set the number of
* check constraints here. Then iterate over all check constraints and print
* them.
*/
if (tupleConstraints != NULL)
{
constraintCount = tupleConstraints->num_check;
}
for (constraintIndex = 0; constraintIndex < constraintCount; constraintIndex++)
{
ConstrCheck *checkConstraintList = tupleConstraints->check;
ConstrCheck *checkConstraint = &(checkConstraintList[constraintIndex]);
Node *checkNode = NULL;
List *checkContext = NULL;
char *checkString = NULL;
/* if an attribute or constraint has been printed, format properly */
if (firstAttributePrinted || constraintIndex > 0)
{
appendStringInfoString(&buffer, ", ");
}
appendStringInfo(&buffer, "CONSTRAINT %s CHECK ",
quote_identifier(checkConstraint->ccname));
/* convert expression to node tree, and prepare deparse context */
checkNode = (Node *) stringToNode(checkConstraint->ccbin);
checkContext = deparse_context_for(relationName, tableRelationId);
/* deparse check constraint string */
checkString = deparse_expression(checkNode, checkContext, false, false);
appendStringInfoString(&buffer, checkString);
}
/* close create table's outer parentheses */
appendStringInfoString(&buffer, ")");
/*
* If the relation is a foreign table, append the server name and options to
* the create table statement.
*/
if (relationKind == RELKIND_FOREIGN_TABLE)
{
ForeignTable *foreignTable = GetForeignTable(tableRelationId);
ForeignServer *foreignServer = GetForeignServer(foreignTable->serverid);
char *serverName = foreignServer->servername;
appendStringInfo(&buffer, " SERVER %s", quote_identifier(serverName));
AppendOptionListToString(&buffer, foreignTable->options);
}
relation_close(relation, AccessShareLock);
return (buffer.data);
}
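/*
 * As an illustration (hypothetical table), for
 *
 *     CREATE TABLE orders (id bigint NOT NULL,
 *                          price numeric DEFAULT 0.0 CHECK (price >= 0.0));
 *
 * this function returns roughly
 *
 *     CREATE TABLE orders (id bigint NOT NULL, price numeric DEFAULT 0.0,
 *     CONSTRAINT orders_price_check CHECK (price >= 0.0))
 *
 * with no PRIMARY KEY or UNIQUE clauses, as noted above.
 */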
/*
* pg_get_tablecolumnoptionsdef_string returns column storage type and column
* statistics definitions for given table, _if_ these definitions differ from
* their default values. The function returns null if all columns use default
* values for their storage types and statistics.
*/
char *
pg_get_tablecolumnoptionsdef_string(Oid tableRelationId)
{
Relation relation = NULL;
char *relationName = NULL;
char relationKind = 0;
TupleDesc tupleDescriptor = NULL;
AttrNumber attributeIndex = 0;
char *columnOptionStatement = NULL;
List *columnOptionList = NIL;
ListCell *columnOptionCell = NULL;
bool firstOptionPrinted = false;
StringInfoData buffer = { NULL, 0, 0, 0 };
/*
* Instead of retrieving values from system catalogs, we open the relation,
* and use the relation's tuple descriptor to access attribute information.
* This is primarily to maintain symmetry with pg_get_tableschemadef.
*/
relation = relation_open(tableRelationId, AccessShareLock);
relationName = generate_relation_name(tableRelationId, NIL);
relationKind = relation->rd_rel->relkind;
if (relationKind != RELKIND_RELATION && relationKind != RELKIND_FOREIGN_TABLE)
{
ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("%s is not a regular or foreign table", relationName)));
}
/*
* Iterate over the table's columns. If a particular column is not dropped
* and is not inherited from another table, check if column storage or
* statistics statements need to be printed.
*/
tupleDescriptor = RelationGetDescr(relation);
for (attributeIndex = 0; attributeIndex < tupleDescriptor->natts; attributeIndex++)
{
Form_pg_attribute attributeForm = tupleDescriptor->attrs[attributeIndex];
char *attributeName = NameStr(attributeForm->attname);
char defaultStorageType = get_typstorage(attributeForm->atttypid);
if (!attributeForm->attisdropped && attributeForm->attinhcount == 0)
{
/*
* If the user changed the column's default storage type, create
* alter statement and add statement to a list for later processing.
*/
if (attributeForm->attstorage != defaultStorageType)
{
char *storageName = 0;
StringInfoData statement = { NULL, 0, 0, 0 };
initStringInfo(&statement);
switch (attributeForm->attstorage)
{
case 'p':
storageName = "PLAIN";
break;
case 'e':
storageName = "EXTERNAL";
break;
case 'm':
storageName = "MAIN";
break;
case 'x':
storageName = "EXTENDED";
break;
default:
ereport(ERROR, (errmsg("unrecognized storage type: %c",
attributeForm->attstorage)));
break;
}
appendStringInfo(&statement, "ALTER COLUMN %s ",
quote_identifier(attributeName));
appendStringInfo(&statement, "SET STORAGE %s", storageName);
columnOptionList = lappend(columnOptionList, statement.data);
}
/*
* If the user changed the column's statistics target, create
* alter statement and add statement to a list for later processing.
*/
if (attributeForm->attstattarget >= 0)
{
StringInfoData statement = { NULL, 0, 0, 0 };
initStringInfo(&statement);
appendStringInfo(&statement, "ALTER COLUMN %s ",
quote_identifier(attributeName));
appendStringInfo(&statement, "SET STATISTICS %d",
attributeForm->attstattarget);
columnOptionList = lappend(columnOptionList, statement.data);
}
}
}
/*
* Iterate over column storage and statistics statements that we created,
* and append them to a single alter table statement.
*/
foreach(columnOptionCell, columnOptionList)
{
if (!firstOptionPrinted)
{
initStringInfo(&buffer);
appendStringInfo(&buffer, "ALTER TABLE ONLY %s ",
generate_relation_name(tableRelationId, NIL));
}
else
{
appendStringInfoString(&buffer, ", ");
}
firstOptionPrinted = true;
columnOptionStatement = (char *) lfirst(columnOptionCell);
appendStringInfoString(&buffer, columnOptionStatement);
pfree(columnOptionStatement);
}
list_free(columnOptionList);
relation_close(relation, AccessShareLock);
return (buffer.data);
}
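/*
 * For example (hypothetical settings), a table whose payload column was
 * switched to EXTERNAL storage and whose id column has a statistics target
 * of 1000 yields:
 *
 *     ALTER TABLE ONLY orders ALTER COLUMN payload SET STORAGE EXTERNAL,
 *     ALTER COLUMN id SET STATISTICS 1000
 */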
/*
* pg_get_indexclusterdef_string returns the definition of a cluster statement
* for given index. The function returns null if the table is not clustered on
* given index.
*/
char *
pg_get_indexclusterdef_string(Oid indexRelationId)
{
HeapTuple indexTuple = NULL;
Form_pg_index indexForm = NULL;
Oid tableRelationId = InvalidOid;
StringInfoData buffer = { NULL, 0, 0, 0 };
indexTuple = SearchSysCache(INDEXRELID, ObjectIdGetDatum(indexRelationId), 0, 0, 0);
if (!HeapTupleIsValid(indexTuple))
{
ereport(ERROR, (errmsg("cache lookup failed for index %u", indexRelationId)));
}
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
tableRelationId = indexForm->indrelid;
/* check if the table is clustered on this index */
if (indexForm->indisclustered)
{
char *tableName = generate_relation_name(tableRelationId, NIL);
char *indexName = get_rel_name(indexRelationId); /* needs to be quoted */
initStringInfo(&buffer);
appendStringInfo(&buffer, "ALTER TABLE %s CLUSTER ON %s",
tableName, quote_identifier(indexName));
}
ReleaseSysCache(indexTuple);
return (buffer.data);
}
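/*
 * For a table clustered via "CLUSTER orders USING orders_pkey", for
 * instance, this returns "ALTER TABLE orders CLUSTER ON orders_pkey"
 * (names are illustrative).
 */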


@ -0,0 +1,334 @@
/*-------------------------------------------------------------------------
*
* connection_cache.c
*
* This file contains functions to implement a connection hash.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h" /* IWYU pragma: keep */
#include "c.h"
#include "libpq-fe.h"
#include "miscadmin.h"
#include <stddef.h>
#include <string.h>
#include "commands/dbcommands.h"
#include "distributed/connection_cache.h"
#include "lib/stringinfo.h"
#include "mb/pg_wchar.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/palloc.h"
/*
* NodeConnectionHash is the connection hash itself. It begins uninitialized.
* The first call to GetConnection triggers hash creation.
*/
static HTAB *NodeConnectionHash = NULL;
/* local function forward declarations */
static HTAB * CreateNodeConnectionHash(void);
static PGconn * ConnectToNode(char *nodeName, char *nodePort);
static char * ConnectionGetOptionValue(PGconn *connection, char *optionKeyword);
/*
* GetConnection returns a PGconn which can be used to execute queries on a
* remote PostgreSQL server. If no suitable connection to the specified node on
* the specified port yet exists, the function establishes a new connection and
* returns that.
*
* Returned connections are guaranteed to be in the CONNECTION_OK state. If the
* requested connection cannot be established, or if it was previously created
* but is now in an unrecoverable bad state, this function returns NULL.
*
* This function throws an error if a hostname over 255 characters is provided.
*/
PGconn *
GetConnection(char *nodeName, int32 nodePort)
{
PGconn *connection = NULL;
NodeConnectionKey nodeConnectionKey;
NodeConnectionEntry *nodeConnectionEntry = NULL;
bool entryFound = false;
bool needNewConnection = true;
/* check input */
if (strnlen(nodeName, MAX_NODE_LENGTH + 1) > MAX_NODE_LENGTH)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("hostname exceeds the maximum length of %d",
MAX_NODE_LENGTH)));
}
/* if first call, initialize the connection hash */
if (NodeConnectionHash == NULL)
{
NodeConnectionHash = CreateNodeConnectionHash();
}
memset(&nodeConnectionKey, 0, sizeof(nodeConnectionKey));
strncpy(nodeConnectionKey.nodeName, nodeName, MAX_NODE_LENGTH);
nodeConnectionKey.nodePort = nodePort;
nodeConnectionEntry = hash_search(NodeConnectionHash, &nodeConnectionKey,
HASH_FIND, &entryFound);
if (entryFound)
{
connection = nodeConnectionEntry->connection;
if (PQstatus(connection) == CONNECTION_OK)
{
needNewConnection = false;
}
else
{
PurgeConnection(connection);
}
}
if (needNewConnection)
{
StringInfo nodePortString = makeStringInfo();
appendStringInfo(nodePortString, "%d", nodePort);
connection = ConnectToNode(nodeName, nodePortString->data);
if (connection != NULL)
{
nodeConnectionEntry = hash_search(NodeConnectionHash, &nodeConnectionKey,
HASH_ENTER, &entryFound);
nodeConnectionEntry->connection = connection;
}
}
return connection;
}
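/*
 * Typical call pattern (a sketch; the worker name and port are made up):
 *
 *     PGconn *connection = GetConnection("worker-1", 5432);
 *     if (connection == NULL)
 *         ereport(ERROR, (errmsg("could not connect to worker")));
 *
 * Callers that later detect a broken connection hand it back to
 * PurgeConnection() so that the stale cache entry is dropped.
 */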
/*
* PurgeConnection removes the given connection from the connection hash and
* closes it using PQfinish. If our hash does not contain the given connection,
* this method simply prints a warning and exits.
*/
void
PurgeConnection(PGconn *connection)
{
NodeConnectionKey nodeConnectionKey;
NodeConnectionEntry *nodeConnectionEntry = NULL;
bool entryFound = false;
char *nodeNameString = NULL;
char *nodePortString = NULL;
nodeNameString = ConnectionGetOptionValue(connection, "host");
if (nodeNameString == NULL)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("connection is missing host option")));
}
nodePortString = ConnectionGetOptionValue(connection, "port");
if (nodePortString == NULL)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("connection is missing port option")));
}
memset(&nodeConnectionKey, 0, sizeof(nodeConnectionKey));
strncpy(nodeConnectionKey.nodeName, nodeNameString, MAX_NODE_LENGTH);
nodeConnectionKey.nodePort = pg_atoi(nodePortString, sizeof(int32), 0);
pfree(nodeNameString);
pfree(nodePortString);
nodeConnectionEntry = hash_search(NodeConnectionHash, &nodeConnectionKey,
HASH_REMOVE, &entryFound);
if (entryFound)
{
/*
* It's possible the provided connection matches the host and port for
* an entry in the hash without being precisely the same connection. In
* that case, we will want to close the hash's connection (because the
* entry has already been removed) in addition to the provided one.
*/
if (nodeConnectionEntry->connection != connection)
{
ereport(WARNING, (errmsg("hash entry for \"%s:%d\" contained different "
"connection than that provided by caller",
nodeConnectionKey.nodeName,
nodeConnectionKey.nodePort)));
PQfinish(nodeConnectionEntry->connection);
}
}
else
{
ereport(WARNING, (errcode(ERRCODE_NO_DATA),
errmsg("could not find hash entry for connection to \"%s:%d\"",
nodeConnectionKey.nodeName,
nodeConnectionKey.nodePort)));
}
PQfinish(connection);
}
/*
 * ReportRemoteError retrieves various error fields from a remote result and
* produces an error report at the WARNING level.
*/
void
ReportRemoteError(PGconn *connection, PGresult *result)
{
char *sqlStateString = PQresultErrorField(result, PG_DIAG_SQLSTATE);
char *remoteMessage = PQresultErrorField(result, PG_DIAG_MESSAGE_PRIMARY);
char *nodeName = ConnectionGetOptionValue(connection, "host");
char *nodePort = ConnectionGetOptionValue(connection, "port");
char *errorPrefix = "Connection failed to";
int sqlState = ERRCODE_CONNECTION_FAILURE;
if (sqlStateString != NULL)
{
sqlState = MAKE_SQLSTATE(sqlStateString[0], sqlStateString[1], sqlStateString[2],
sqlStateString[3], sqlStateString[4]);
/* use more specific error prefix for result failures */
if (sqlState != ERRCODE_CONNECTION_FAILURE)
{
errorPrefix = "Bad result from";
}
}
/*
* If the PGresult did not contain a message, the connection may provide a
* suitable top level one. At worst, this is an empty string.
*/
if (remoteMessage == NULL)
{
char *lastNewlineIndex = NULL;
remoteMessage = PQerrorMessage(connection);
lastNewlineIndex = strrchr(remoteMessage, '\n');
/* trim trailing newline, if any */
if (lastNewlineIndex != NULL)
{
*lastNewlineIndex = '\0';
}
}
ereport(WARNING, (errcode(sqlState),
errmsg("%s %s:%s", errorPrefix, nodeName, nodePort),
errdetail("Remote message: %s", remoteMessage)));
}
/*
* CreateNodeConnectionHash returns a newly created hash table suitable for
* storing unlimited connections indexed by node name and port.
*/
static HTAB *
CreateNodeConnectionHash(void)
{
HTAB *nodeConnectionHash = NULL;
HASHCTL info;
int hashFlags = 0;
memset(&info, 0, sizeof(info));
info.keysize = sizeof(NodeConnectionKey);
info.entrysize = sizeof(NodeConnectionEntry);
info.hash = tag_hash;
info.hcxt = CacheMemoryContext;
hashFlags = (HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
nodeConnectionHash = hash_create("citusdb connection cache", 32, &info, hashFlags);
return nodeConnectionHash;
}
/*
* ConnectToNode opens a connection to a remote PostgreSQL server. The function
* configures the connection's fallback application name to 'citusdb' and sets
* the remote encoding to match the local one. This function requires that the
* port be specified as a string for easier use with libpq functions.
*
* We attempt to connect up to MAX_CONNECT_ATTEMPT times. After that we give up
* and return NULL.
*/
static PGconn *
ConnectToNode(char *nodeName, char *nodePort)
{
PGconn *connection = NULL;
const char *clientEncoding = GetDatabaseEncodingName();
const char *dbname = get_database_name(MyDatabaseId);
int attemptIndex = 0;
const char *keywordArray[] = {
"host", "port", "fallback_application_name",
"client_encoding", "connect_timeout", "dbname", NULL
};
const char *valueArray[] = {
nodeName, nodePort, "citusdb", clientEncoding,
CLIENT_CONNECT_TIMEOUT_SECONDS, dbname, NULL
};
Assert(sizeof(keywordArray) == sizeof(valueArray));
for (attemptIndex = 0; attemptIndex < MAX_CONNECT_ATTEMPTS; attemptIndex++)
{
connection = PQconnectdbParams(keywordArray, valueArray, false);
if (PQstatus(connection) == CONNECTION_OK)
{
break;
}
else
{
/* warn if still erroring on final attempt */
if (attemptIndex == MAX_CONNECT_ATTEMPTS - 1)
{
ReportRemoteError(connection, NULL);
}
PQfinish(connection);
connection = NULL;
}
}
return connection;
}
/*
* ConnectionGetOptionValue inspects the provided connection for an option with
 * a given keyword and returns a new palloc'd string with that option's value.
* The function returns NULL if the connection has no setting for an option with
* the provided keyword.
*/
static char *
ConnectionGetOptionValue(PGconn *connection, char *optionKeyword)
{
char *optionValue = NULL;
PQconninfoOption *conninfoOptions = PQconninfo(connection);
PQconninfoOption *option = NULL;
for (option = conninfoOptions; option->keyword != NULL; option++)
{
if (strncmp(option->keyword, optionKeyword, NAMEDATALEN) == 0)
{
optionValue = pstrdup(option->val);
}
}
PQconninfoFree(conninfoOptions);
return optionValue;
}


@ -0,0 +1,62 @@
/*-------------------------------------------------------------------------
*
* listutils.c
*
* This file contains functions to perform useful operations on lists.
*
* Copyright (c) 2014-2015, Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "c.h"
#include "port.h"
#include "distributed/listutils.h"
#include "nodes/pg_list.h"
#include "utils/memutils.h"
/*
* SortList takes in a list of void pointers, and sorts these pointers (and the
* values they point to) by applying the given comparison function. The function
* then returns the sorted list of pointers.
*
* Because the input list is a list of pointers, and because qsort expects to
* compare pointers to the list elements, the provided comparison function must
* compare pointers to pointers to elements. In addition, this sort function
 * naturally shares qsort's lack of stability. See that function's man page
 * for more details.
*/
List *
SortList(List *pointerList, int (*comparisonFunction)(const void *, const void *))
{
List *sortedList = NIL;
uint32 arrayIndex = 0;
uint32 arraySize = (uint32) list_length(pointerList);
void **array = (void **) palloc0(arraySize * sizeof(void *));
ListCell *pointerCell = NULL;
foreach(pointerCell, pointerList)
{
void *pointer = lfirst(pointerCell);
array[arrayIndex] = pointer;
arrayIndex++;
}
/* sort the array of pointers using the comparison function */
qsort(array, arraySize, sizeof(void *), comparisonFunction);
/* convert the sorted array of pointers back to a sorted list */
for (arrayIndex = 0; arrayIndex < arraySize; arrayIndex++)
{
void *sortedPointer = array[arrayIndex];
sortedList = lappend(sortedList, sortedPointer);
}
pfree(array);
return sortedList;
}
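/*
 * Example comparator (a sketch, not used in this file): because SortList
 * hands qsort an array of pointers, the comparator receives pointers to
 * those pointers and must dereference twice. The LISTUTILS_EXAMPLES guard is
 * hypothetical. A call then looks like
 * SortList(shardIdPointerList, CompareShardIds).
 */
#ifdef LISTUTILS_EXAMPLES
static int
CompareShardIds(const void *leftElement, const void *rightElement)
{
	const uint64 *leftShardId = *((const uint64 **) leftElement);
	const uint64 *rightShardId = *((const uint64 **) rightElement);

	if (*leftShardId < *rightShardId)
	{
		return -1;
	}
	else if (*leftShardId > *rightShardId)
	{
		return 1;
	}
	else
	{
		return 0;
	}
}
#endif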


@ -0,0 +1,929 @@
/*-------------------------------------------------------------------------
*
* metadata_cache.c
* Distributed table metadata cache
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "catalog/indexing.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_type.h"
#include "commands/extension.h"
#include "commands/trigger.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/metadata_cache.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "parser/parse_func.h"
#include "utils/builtins.h"
#include "utils/catcache.h"
#include "utils/datum.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/relfilenodemap.h"
#include "utils/relmapper.h"
#include "utils/syscache.h"
/* Hash table for information about each partition */
static HTAB *DistTableCacheHash = NULL;
/* built first time through in InitializeDistTableCache */
static ScanKeyData DistPartitionScanKey[1];
static ScanKeyData DistShardScanKey[1];
/* local function forward declarations */
static DistTableCacheEntry * LookupDistTableCacheEntry(Oid relationId);
static void InitializeDistTableCache(void);
static void ResetDistTableCacheEntry(DistTableCacheEntry *cacheEntry);
static void InvalidateDistRelationCacheCallback(Datum argument, Oid relationId);
static HeapTuple LookupDistPartitionTuple(Oid relationId);
static List * LookupDistShardTuples(Oid relationId);
static void GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
Oid *intervalTypeId, int32 *intervalTypeMod);
static ShardInterval * TupleToShardInterval(HeapTuple heapTuple,
TupleDesc tupleDescriptor, Oid intervalTypeId,
int32 intervalTypeMod);
static void CachedRelationLookup(const char *relationName, Oid *cachedOid);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(master_dist_partition_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_shard_cache_invalidate);
/*
* IsDistributedTable returns whether relationId is a distributed relation or
* not.
*/
bool
IsDistributedTable(Oid relationId)
{
DistTableCacheEntry *cacheEntry = NULL;
/*
* Can't be a distributed relation if the extension hasn't been loaded
* yet. As we can't do lookups in nonexistent tables, directly return
* false.
*/
if (!CitusDBHasBeenLoaded())
{
return false;
}
cacheEntry = LookupDistTableCacheEntry(relationId);
return cacheEntry->isDistributedTable;
}
/*
* LoadShardInterval reads shard metadata for given shardId from pg_dist_shard,
* and converts min/max values in these metadata to their properly typed datum
* representations. The function then allocates a structure that stores the read
* and converted values, and returns this structure.
*/
ShardInterval *
LoadShardInterval(uint64 shardId)
{
ShardInterval *shardInterval;
SysScanDesc scanDescriptor = NULL;
ScanKeyData scanKey[1];
int scanKeyCount = 1;
HeapTuple heapTuple = NULL;
Form_pg_dist_shard shardForm = NULL;
DistTableCacheEntry *partitionEntry;
Oid intervalTypeId = InvalidOid;
int32 intervalTypeMod = -1;
Relation pgDistShard = heap_open(DistShardRelationId(), AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(pgDistShard);
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
scanDescriptor = systable_beginscan(pgDistShard,
DistShardShardidIndexId(), true,
NULL, scanKeyCount, scanKey);
heapTuple = systable_getnext(scanDescriptor);
if (!HeapTupleIsValid(heapTuple))
{
ereport(ERROR, (errmsg("could not find valid entry for shard "
UINT64_FORMAT, shardId)));
}
shardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
partitionEntry = DistributedTableCacheEntry(shardForm->logicalrelid);
GetPartitionTypeInputInfo(partitionEntry->partitionKeyString,
partitionEntry->partitionMethod, &intervalTypeId,
&intervalTypeMod);
shardInterval = TupleToShardInterval(heapTuple, tupleDescriptor, intervalTypeId,
intervalTypeMod);
systable_endscan(scanDescriptor);
heap_close(pgDistShard, AccessShareLock);
return shardInterval;
}
/*
* DistributedTableCacheEntry looks up a pg_dist_partition entry for a
* relation.
*
* Errors out if no relation matching the criteria could be found.
*/
DistTableCacheEntry *
DistributedTableCacheEntry(Oid distributedRelationId)
{
DistTableCacheEntry *cacheEntry = NULL;
/*
* Can't be a distributed relation if the extension hasn't been loaded
* yet. As we can't do lookups in nonexistent tables, directly return NULL
* here.
*/
if (!CitusDBHasBeenLoaded())
{
return NULL;
}
cacheEntry = LookupDistTableCacheEntry(distributedRelationId);
if (cacheEntry->isDistributedTable)
{
return cacheEntry;
}
else
{
ereport(ERROR, (errmsg("relation %u is not distributed",
distributedRelationId)));
}
}
/*
* LookupDistTableCacheEntry returns the distributed table metadata for the
* passed relationId. For efficiency it caches lookups.
*/
static DistTableCacheEntry *
LookupDistTableCacheEntry(Oid relationId)
{
DistTableCacheEntry *cacheEntry = NULL;
bool foundInCache = false;
HeapTuple distPartitionTuple = NULL;
char *partitionKeyString = NULL;
char partitionMethod = 0;
List *distShardTupleList = NIL;
int shardIntervalArrayLength = 0;
ShardInterval *shardIntervalArray = NULL;
void *hashKey = (void *) &relationId;
if (DistTableCacheHash == NULL)
{
InitializeDistTableCache();
}
cacheEntry = hash_search(DistTableCacheHash, hashKey, HASH_FIND, &foundInCache);
/* return valid matches */
if ((cacheEntry != NULL) && (cacheEntry->isValid))
{
return cacheEntry;
}
/* free the content of old, invalid, entries */
if (cacheEntry != NULL)
{
ResetDistTableCacheEntry(cacheEntry);
}
distPartitionTuple = LookupDistPartitionTuple(relationId);
if (distPartitionTuple != NULL)
{
Form_pg_dist_partition partitionForm =
(Form_pg_dist_partition) GETSTRUCT(distPartitionTuple);
Datum partitionKeyDatum = PointerGetDatum(&partitionForm->partkey);
MemoryContext oldContext = MemoryContextSwitchTo(CacheMemoryContext);
partitionKeyString = TextDatumGetCString(partitionKeyDatum);
partitionMethod = partitionForm->partmethod;
MemoryContextSwitchTo(oldContext);
heap_freetuple(distPartitionTuple);
}
distShardTupleList = LookupDistShardTuples(relationId);
shardIntervalArrayLength = list_length(distShardTupleList);
if (shardIntervalArrayLength > 0)
{
Relation distShardRelation = heap_open(DistShardRelationId(), AccessShareLock);
TupleDesc distShardTupleDesc = RelationGetDescr(distShardRelation);
ListCell *distShardTupleCell = NULL;
int arrayIndex = 0;
Oid intervalTypeId = InvalidOid;
int32 intervalTypeMod = -1;
GetPartitionTypeInputInfo(partitionKeyString, partitionMethod, &intervalTypeId,
&intervalTypeMod);
shardIntervalArray = MemoryContextAllocZero(CacheMemoryContext,
shardIntervalArrayLength *
sizeof(ShardInterval));
foreach(distShardTupleCell, distShardTupleList)
{
HeapTuple shardTuple = lfirst(distShardTupleCell);
ShardInterval *shardInterval = TupleToShardInterval(shardTuple,
distShardTupleDesc,
intervalTypeId,
intervalTypeMod);
MemoryContext oldContext = MemoryContextSwitchTo(CacheMemoryContext);
CopyShardInterval(shardInterval, &shardIntervalArray[arrayIndex]);
MemoryContextSwitchTo(oldContext);
heap_freetuple(shardTuple);
arrayIndex++;
}
heap_close(distShardRelation, AccessShareLock);
}
cacheEntry = hash_search(DistTableCacheHash, hashKey, HASH_ENTER, NULL);
/* zero out entry, but not the key part */
memset(((char *) cacheEntry) + sizeof(Oid), 0,
sizeof(DistTableCacheEntry) - sizeof(Oid));
if (distPartitionTuple == NULL)
{
cacheEntry->isValid = true;
cacheEntry->isDistributedTable = false;
}
else
{
cacheEntry->isValid = true;
cacheEntry->isDistributedTable = true;
cacheEntry->partitionKeyString = partitionKeyString;
cacheEntry->partitionMethod = partitionMethod;
cacheEntry->shardIntervalArrayLength = shardIntervalArrayLength;
cacheEntry->shardIntervalArray = shardIntervalArray;
}
return cacheEntry;
}
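/*
 * Illustrative caller pattern (not part of this file): consumers go through
 * the public wrappers above rather than calling this function directly, e.g.
 *
 *     if (IsDistributedTable(relationId))
 *     {
 *         DistTableCacheEntry *entry = DistributedTableCacheEntry(relationId);
 *         char partitionMethod = entry->partitionMethod;
 *         ...
 *     }
 *
 * Entries remain valid until InvalidateDistRelationCacheCallback marks them
 * invalid; the next lookup then rebuilds them from the catalogs.
 */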
/*
* CitusDBHasBeenLoaded returns true if the citusdb extension has been created
* in the current database and the extension script has been executed. Otherwise,
* it returns false. The result is cached as this is called very frequently.
*
* NB: The way this is cached means the result will be wrong after the
* extension is dropped. A reconnect fixes that though, so that seems
* acceptable.
*/
bool
CitusDBHasBeenLoaded(void)
{
static bool extensionLoaded = false;
/* recheck presence until citusdb has been loaded */
if (!extensionLoaded)
{
bool extensionPresent = false;
bool extensionScriptExecuted = true;
Oid extensionOid = get_extension_oid("citusdb", true);
if (extensionOid != InvalidOid)
{
extensionPresent = true;
}
if (extensionPresent)
{
/* check if CitusDB extension objects are still being created */
if (creating_extension && CurrentExtensionObject == extensionOid)
{
extensionScriptExecuted = false;
}
}
extensionLoaded = extensionPresent && extensionScriptExecuted;
}
return extensionLoaded;
}
/* return oid of pg_dist_shard relation */
Oid
DistShardRelationId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_shard_placement relation */
Oid
DistShardPlacementRelationId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard_placement", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_partition relation */
Oid
DistPartitionRelationId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_partition", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_partition_logical_relid_index index */
Oid
DistPartitionLogicalRelidIndexId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_partition_logical_relid_index", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_shard_logical_relid_index index */
Oid
DistShardLogicalRelidIndexId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard_logical_relid_index", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_shard_shardid_index index */
Oid
DistShardShardidIndexId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard_shardid_index", &cachedOid);
return cachedOid;
}
/* return oid of pg_dist_shard_placement_shardid_index */
Oid
DistShardPlacementShardidIndexId(void)
{
static Oid cachedOid = InvalidOid;
CachedRelationLookup("pg_dist_shard_placement_shardid_index", &cachedOid);
return cachedOid;
}
/* return oid of the citusdb_extradata_container(internal) function */
Oid
CitusExtraDataContainerFuncId(void)
{
	static Oid cachedOid = InvalidOid;
List *nameList = NIL;
Oid paramOids[1] = { INTERNALOID };
if (cachedOid == InvalidOid)
{
nameList = list_make2(makeString("pg_catalog"),
makeString("citusdb_extradata_container"));
cachedOid = LookupFuncName(nameList, 1, paramOids, false);
}
return cachedOid;
}
/*
* master_dist_partition_cache_invalidate is a trigger function that performs
* relcache invalidations when the contents of pg_dist_partition are changed
* on the SQL level.
*/
Datum
master_dist_partition_cache_invalidate(PG_FUNCTION_ARGS)
{
TriggerData *triggerData = (TriggerData *) fcinfo->context;
HeapTuple newTuple = NULL;
HeapTuple oldTuple = NULL;
Oid oldLogicalRelationId = InvalidOid;
Oid newLogicalRelationId = InvalidOid;
if (!CALLED_AS_TRIGGER(fcinfo))
{
ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
errmsg("must be called as trigger")));
}
newTuple = triggerData->tg_newtuple;
oldTuple = triggerData->tg_trigtuple;
/* collect logicalrelid for OLD and NEW tuple */
if (oldTuple != NULL)
{
Form_pg_dist_partition distPart = (Form_pg_dist_partition) GETSTRUCT(oldTuple);
oldLogicalRelationId = distPart->logicalrelid;
}
if (newTuple != NULL)
{
Form_pg_dist_partition distPart = (Form_pg_dist_partition) GETSTRUCT(newTuple);
newLogicalRelationId = distPart->logicalrelid;
}
/*
* Invalidate relcache for the relevant relation(s). In theory
* logicalrelid should never change, but it doesn't hurt to be
* paranoid. We ignore the case that there's no corresponding pg_class
* entry - that happens if the pg_dist_partition tuple is deleted after
* the relation has been dropped.
*/
if (oldLogicalRelationId != InvalidOid &&
oldLogicalRelationId != newLogicalRelationId)
{
HeapTuple oldClassTuple =
SearchSysCache1(RELOID, ObjectIdGetDatum(oldLogicalRelationId));
if (HeapTupleIsValid(oldClassTuple))
{
CacheInvalidateRelcacheByTuple(oldClassTuple);
ReleaseSysCache(oldClassTuple);
}
}
if (newLogicalRelationId != InvalidOid)
{
HeapTuple newClassTuple =
SearchSysCache1(RELOID, ObjectIdGetDatum(newLogicalRelationId));
if (HeapTupleIsValid(newClassTuple))
{
CacheInvalidateRelcacheByTuple(newClassTuple);
ReleaseSysCache(newClassTuple);
}
}
PG_RETURN_DATUM(PointerGetDatum(NULL));
}
/*
* master_dist_shard_cache_invalidate is a trigger function that performs
* relcache invalidations when the contents of pg_dist_shard are changed
* on the SQL level.
*/
Datum
master_dist_shard_cache_invalidate(PG_FUNCTION_ARGS)
{
TriggerData *triggerData = (TriggerData *) fcinfo->context;
HeapTuple newTuple = NULL;
HeapTuple oldTuple = NULL;
Oid oldLogicalRelationId = InvalidOid;
Oid newLogicalRelationId = InvalidOid;
if (!CALLED_AS_TRIGGER(fcinfo))
{
ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
errmsg("must be called as trigger")));
}
newTuple = triggerData->tg_newtuple;
oldTuple = triggerData->tg_trigtuple;
/* collect logicalrelid for OLD and NEW tuple */
if (oldTuple != NULL)
{
Form_pg_dist_shard distShard = (Form_pg_dist_shard) GETSTRUCT(oldTuple);
oldLogicalRelationId = distShard->logicalrelid;
}
if (newTuple != NULL)
{
Form_pg_dist_shard distShard = (Form_pg_dist_shard) GETSTRUCT(newTuple);
newLogicalRelationId = distShard->logicalrelid;
}
/*
* Invalidate relcache for the relevant relation(s). In theory
* logicalrelid should never change, but it doesn't hurt to be
* paranoid. We ignore the case that there's no corresponding pg_class
* entry - that happens if the pg_dist_shard tuple is deleted after
* the relation has been dropped.
*/
if (oldLogicalRelationId != InvalidOid &&
oldLogicalRelationId != newLogicalRelationId)
{
HeapTuple oldClassTuple =
SearchSysCache1(RELOID, ObjectIdGetDatum(oldLogicalRelationId));
if (HeapTupleIsValid(oldClassTuple))
{
CacheInvalidateRelcacheByTuple(oldClassTuple);
ReleaseSysCache(oldClassTuple);
}
}
if (newLogicalRelationId != InvalidOid)
{
HeapTuple newClassTuple =
SearchSysCache1(RELOID, ObjectIdGetDatum(newLogicalRelationId));
if (HeapTupleIsValid(newClassTuple))
{
CacheInvalidateRelcacheByTuple(newClassTuple);
ReleaseSysCache(newClassTuple);
}
}
PG_RETURN_DATUM(PointerGetDatum(NULL));
}
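/*
 * A sketch of how these trigger functions are presumably wired up in the
 * extension's SQL script (the exact DDL lives there, not in this file):
 *
 *     CREATE TRIGGER dist_partition_cache_invalidate
 *     AFTER INSERT OR UPDATE OR DELETE ON pg_catalog.pg_dist_partition
 *     FOR EACH ROW EXECUTE PROCEDURE master_dist_partition_cache_invalidate();
 *
 * and similarly for pg_dist_shard with master_dist_shard_cache_invalidate().
 */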
/* initialize the infrastructure for the metadata cache */
static void
InitializeDistTableCache(void)
{
HASHCTL info;
/* make sure we've initialized CacheMemoryContext */
if (CacheMemoryContext == NULL)
{
CreateCacheMemoryContext();
}
/* build initial scan keys, copied for every relation scan */
memset(&DistPartitionScanKey, 0, sizeof(DistPartitionScanKey));
fmgr_info_cxt(F_OIDEQ,
&DistPartitionScanKey[0].sk_func,
CacheMemoryContext);
DistPartitionScanKey[0].sk_strategy = BTEqualStrategyNumber;
DistPartitionScanKey[0].sk_subtype = InvalidOid;
DistPartitionScanKey[0].sk_collation = InvalidOid;
DistPartitionScanKey[0].sk_attno = Anum_pg_dist_partition_logicalrelid;
memset(&DistShardScanKey, 0, sizeof(DistShardScanKey));
fmgr_info_cxt(F_OIDEQ,
&DistShardScanKey[0].sk_func,
CacheMemoryContext);
DistShardScanKey[0].sk_strategy = BTEqualStrategyNumber;
DistShardScanKey[0].sk_subtype = InvalidOid;
DistShardScanKey[0].sk_collation = InvalidOid;
DistShardScanKey[0].sk_attno = Anum_pg_dist_shard_logicalrelid;
/* initialize the hash table */
MemSet(&info, 0, sizeof(info));
info.keysize = sizeof(Oid);
info.entrysize = sizeof(DistTableCacheEntry);
info.hash = tag_hash;
DistTableCacheHash =
hash_create("Distributed Relation Cache", 32, &info,
HASH_ELEM | HASH_FUNCTION);
/* Watch for invalidation events. */
CacheRegisterRelcacheCallback(InvalidateDistRelationCacheCallback,
(Datum) 0);
}
/*
* ResetDistTableCacheEntry frees any out-of-band memory used by a cache entry,
* but does not free the entry itself.
*/
static void
ResetDistTableCacheEntry(DistTableCacheEntry *cacheEntry)
{
if (cacheEntry->partitionKeyString != NULL)
{
pfree(cacheEntry->partitionKeyString);
cacheEntry->partitionKeyString = NULL;
}
if (cacheEntry->shardIntervalArrayLength > 0)
{
int i = 0;
for (i = 0; i < cacheEntry->shardIntervalArrayLength; i++)
{
ShardInterval *shardInterval = &cacheEntry->shardIntervalArray[i];
bool valueByVal = shardInterval->valueByVal;
if (!valueByVal)
{
if (shardInterval->minValueExists)
{
pfree(DatumGetPointer(shardInterval->minValue));
}
if (shardInterval->maxValueExists)
{
pfree(DatumGetPointer(shardInterval->maxValue));
}
}
}
pfree(cacheEntry->shardIntervalArray);
cacheEntry->shardIntervalArray = NULL;
cacheEntry->shardIntervalArrayLength = 0;
}
}
/*
* InvalidateDistRelationCacheCallback flushes cache entries when a relation
* is updated (or flushes the entire cache).
*/
static void
InvalidateDistRelationCacheCallback(Datum argument, Oid relationId)
{
/* invalidate either entire cache or a specific entry */
if (relationId == InvalidOid)
{
DistTableCacheEntry *cacheEntry = NULL;
HASH_SEQ_STATUS status;
hash_seq_init(&status, DistTableCacheHash);
while ((cacheEntry = (DistTableCacheEntry *) hash_seq_search(&status)) != NULL)
{
cacheEntry->isValid = false;
}
}
else
{
void *hashKey = (void *) &relationId;
bool foundInCache = false;
DistTableCacheEntry *cacheEntry = hash_search(DistTableCacheHash, hashKey,
HASH_FIND, &foundInCache);
if (foundInCache)
{
cacheEntry->isValid = false;
}
}
}
/*
* LookupDistPartitionTuple searches pg_dist_partition for relationId's entry
* and returns that or, if no matching entry was found, NULL.
*/
static HeapTuple
LookupDistPartitionTuple(Oid relationId)
{
Relation pgDistPartition = NULL;
HeapTuple distPartitionTuple = NULL;
HeapTuple currentPartitionTuple = NULL;
SysScanDesc scanDescriptor;
ScanKeyData scanKey[1];
pgDistPartition = heap_open(DistPartitionRelationId(), AccessShareLock);
/* copy scankey to local copy, it will be modified during the scan */
memcpy(scanKey, DistPartitionScanKey, sizeof(DistPartitionScanKey));
/* set scan arguments */
scanKey[0].sk_argument = ObjectIdGetDatum(relationId);
scanDescriptor = systable_beginscan(pgDistPartition,
DistPartitionLogicalRelidIndexId(),
true, NULL, 1, scanKey);
currentPartitionTuple = systable_getnext(scanDescriptor);
if (HeapTupleIsValid(currentPartitionTuple))
{
Assert(!HeapTupleHasNulls(currentPartitionTuple));
distPartitionTuple = heap_copytuple(currentPartitionTuple);
}
systable_endscan(scanDescriptor);
heap_close(pgDistPartition, NoLock);
return distPartitionTuple;
}
/*
* LookupDistShardTuples returns a list of all dist_shard tuples for the
* specified relation.
*/
static List *
LookupDistShardTuples(Oid relationId)
{
Relation pgDistShard = NULL;
List *distShardTupleList = NIL;
HeapTuple currentShardTuple = NULL;
SysScanDesc scanDescriptor;
ScanKeyData scanKey[1];
pgDistShard = heap_open(DistShardRelationId(), AccessShareLock);
/* copy scankey to local copy, it will be modified during the scan */
memcpy(scanKey, DistShardScanKey, sizeof(DistShardScanKey));
/* set scan arguments */
scanKey[0].sk_argument = ObjectIdGetDatum(relationId);
scanDescriptor = systable_beginscan(pgDistShard, DistShardLogicalRelidIndexId(), true,
NULL, 1, scanKey);
currentShardTuple = systable_getnext(scanDescriptor);
while (HeapTupleIsValid(currentShardTuple))
{
HeapTuple shardTupleCopy = heap_copytuple(currentShardTuple);
distShardTupleList = lappend(distShardTupleList, shardTupleCopy);
currentShardTuple = systable_getnext(scanDescriptor);
}
systable_endscan(scanDescriptor);
heap_close(pgDistShard, AccessShareLock);
return distShardTupleList;
}
/*
* GetPartitionTypeInputInfo populates output parameters with the interval type
* identifier and modifier for the specified partition key/method combination.
*/
static void
GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
Oid *intervalTypeId, int32 *intervalTypeMod)
{
*intervalTypeId = InvalidOid;
*intervalTypeMod = -1;
switch (partitionMethod)
{
case DISTRIBUTE_BY_APPEND:
case DISTRIBUTE_BY_RANGE:
{
Node *partitionNode = stringToNode(partitionKeyString);
Var *partitionColumn = (Var *) partitionNode;
Assert(IsA(partitionNode, Var));
*intervalTypeId = partitionColumn->vartype;
*intervalTypeMod = partitionColumn->vartypmod;
break;
}
case DISTRIBUTE_BY_HASH:
{
*intervalTypeId = INT4OID;
break;
}
default:
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("unsupported table partition type: %c",
partitionMethod)));
}
}
}
/*
* TupleToShardInterval transforms the specified dist_shard tuple into a new
* ShardInterval using the provided descriptor and partition type information.
*/
static ShardInterval *
TupleToShardInterval(HeapTuple heapTuple, TupleDesc tupleDescriptor, Oid intervalTypeId,
int32 intervalTypeMod)
{
ShardInterval *shardInterval = NULL;
bool isNull = false;
bool minValueNull = false;
bool maxValueNull = false;
Oid inputFunctionId = InvalidOid;
Oid typeIoParam = InvalidOid;
Datum relationIdDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_logicalrelid,
tupleDescriptor, &isNull);
Datum shardIdDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardid,
tupleDescriptor, &isNull);
Datum storageTypeDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardstorage,
tupleDescriptor, &isNull);
Datum minValueTextDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardminvalue,
tupleDescriptor, &minValueNull);
Datum maxValueTextDatum = heap_getattr(heapTuple, Anum_pg_dist_shard_shardmaxvalue,
tupleDescriptor, &maxValueNull);
Oid relationId = DatumGetObjectId(relationIdDatum);
int64 shardId = DatumGetInt64(shardIdDatum);
char storageType = DatumGetChar(storageTypeDatum);
Datum minValue = 0;
Datum maxValue = 0;
bool minValueExists = false;
bool maxValueExists = false;
int16 intervalTypeLen = 0;
bool intervalByVal = false;
char intervalAlign = '0';
char intervalDelim = '0';
if (!minValueNull && !maxValueNull)
{
char *minValueString = TextDatumGetCString(minValueTextDatum);
char *maxValueString = TextDatumGetCString(maxValueTextDatum);
/* TODO: move this up the call stack to avoid per-tuple invocation? */
get_type_io_data(intervalTypeId, IOFunc_input, &intervalTypeLen, &intervalByVal,
&intervalAlign, &intervalDelim, &typeIoParam, &inputFunctionId);
/* finally convert min/max values to their actual types */
minValue = OidInputFunctionCall(inputFunctionId, minValueString,
typeIoParam, intervalTypeMod);
maxValue = OidInputFunctionCall(inputFunctionId, maxValueString,
typeIoParam, intervalTypeMod);
minValueExists = true;
maxValueExists = true;
}
shardInterval = CitusMakeNode(ShardInterval);
shardInterval->relationId = relationId;
shardInterval->storageType = storageType;
shardInterval->valueTypeId = intervalTypeId;
shardInterval->valueTypeLen = intervalTypeLen;
shardInterval->valueByVal = intervalByVal;
shardInterval->minValueExists = minValueExists;
shardInterval->maxValueExists = maxValueExists;
shardInterval->minValue = minValue;
shardInterval->maxValue = maxValue;
shardInterval->shardId = shardId;
return shardInterval;
}
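/*
 * As a concrete example of the conversion above: for a hash-partitioned
 * table GetPartitionTypeInputInfo reports INT4OID, so shard bounds stored
 * as the texts "-2147483648" and "-1073741825" are converted back into
 * int4 datums through the type's input function (values illustrative).
 */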
/*
* CachedRelationLookup performs a cached lookup for the relation
* relationName, with the result cached in *cachedOid.
*
* NB: The way this is cached means the result will be wrong after the
 * extension is dropped and recreated. A reconnect fixes that though, so that
* seems acceptable.
*/
static void
CachedRelationLookup(const char *relationName, Oid *cachedOid)
{
if (*cachedOid == InvalidOid)
{
*cachedOid = get_relname_relid(relationName, PG_CATALOG_NAMESPACE);
if (*cachedOid == InvalidOid)
{
ereport(ERROR, (errmsg("cache lookup failed for %s, called to early?",
relationName)));
}
}
}


@ -0,0 +1,148 @@
/*-------------------------------------------------------------------------
*
* multi_resowner.c
* CitusDB resource owner integration
*
* An extension can't directly add members to ResourceOwnerData. Instead we
* have to use the resource owner callback mechanism. Right now it's
 * sufficient to have an array of referenced resources - there basically are
* never more than a handful of entries, if that. If that changes we should
* probably rather use a hash table using the pointer value of the resource
* owner as key.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "distributed/multi_server_executor.h"
#include "utils/memutils.h"
#include "utils/resowner_private.h"
#include "distributed/multi_resowner.h"
typedef struct JobDirectoryEntry
{
	ResourceOwner owner;
	uint64 jobId;
} JobDirectoryEntry;
static bool RegisteredResownerCallback = false;
JobDirectoryEntry *RegisteredJobDirectories = NULL;
size_t NumRegisteredJobDirectories = 0;
size_t NumAllocatedJobDirectories = 0;
/*
* Resource owner callback - release resources still held by the resource
* owner.
*/
static void
MultiResourceOwnerReleaseCallback(ResourceReleasePhase phase,
bool isCommit,
bool isTopLevel,
void *arg)
{
int lastJobIndex = NumRegisteredJobDirectories - 1;
int jobIndex = 0;
if (phase == RESOURCE_RELEASE_AFTER_LOCKS)
{
/*
* Remove all remaining job directories, after locks have been
* released.
*/
for (jobIndex = lastJobIndex; jobIndex >= 0; jobIndex--)
{
JobDirectoryEntry *entry = &RegisteredJobDirectories[jobIndex];
if (entry->owner == CurrentResourceOwner)
{
RemoveJobDirectory(entry->jobId);
}
}
}
}
/*
* ResourceOwnerEnlargeJobDirectories makes sure that there is space to
* reference at least one more job directory for the resource owner. Note that
* we only expect one job directory per portal, but we still use an array
* here.
*
* This function is separate from the one actually inserting an entry because
* if we run out of memory, it's critical to do so *before* acquiring the
* resource.
*/
void
ResourceOwnerEnlargeJobDirectories(ResourceOwner owner)
{
int newMax = 0;
/* ensure callback is registered */
if (!RegisteredResownerCallback)
{
RegisterResourceReleaseCallback(MultiResourceOwnerReleaseCallback, NULL);
RegisteredResownerCallback = true;
}
if (RegisteredJobDirectories == NULL)
{
newMax = 16;
RegisteredJobDirectories = (JobDirectoryEntry *)
MemoryContextAlloc(TopMemoryContext, newMax * sizeof(JobDirectoryEntry));
NumAllocatedJobDirectories = newMax;
}
else if (NumRegisteredJobDirectories + 1 > NumAllocatedJobDirectories)
{
newMax = NumAllocatedJobDirectories * 2;
RegisteredJobDirectories = (JobDirectoryEntry *)
repalloc(RegisteredJobDirectories, newMax * sizeof(JobDirectoryEntry));
NumAllocatedJobDirectories = newMax;
}
}
/* Remembers that a temporary job directory is owned by a resource owner. */
void
ResourceOwnerRememberJobDirectory(ResourceOwner owner, uint64 jobId)
{
JobDirectoryEntry *entry = NULL;
Assert(NumRegisteredJobDirectories + 1 <= NumAllocatedJobDirectories);
entry = &RegisteredJobDirectories[NumRegisteredJobDirectories];
entry->owner = owner;
entry->jobId = jobId;
NumRegisteredJobDirectories++;
}
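/*
 * Usage sketch (illustrative; the directory-creating caller is hypothetical):
 * space is reserved while running out of memory is still harmless, and the
 * resource is only recorded once it actually exists:
 *
 *     ResourceOwnerEnlargeJobDirectories(CurrentResourceOwner);
 *     CreateJobDirectory(jobId);
 *     ResourceOwnerRememberJobDirectory(CurrentResourceOwner, jobId);
 */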
/* Forgets that a temporary job directory is owned by a resource owner. */
void
ResourceOwnerForgetJobDirectory(ResourceOwner owner, uint64 jobId)
{
int lastJobIndex = NumRegisteredJobDirectories - 1;
int jobIndex = 0;
for (jobIndex = lastJobIndex; jobIndex >= 0; jobIndex--)
{
JobDirectoryEntry *entry = &RegisteredJobDirectories[jobIndex];
if (entry->owner == owner && entry->jobId == jobId)
{
/* move all later entries one up */
while (jobIndex < lastJobIndex)
{
RegisteredJobDirectories[jobIndex] = RegisteredJobDirectories[jobIndex + 1];
jobIndex++;
}
NumRegisteredJobDirectories = lastJobIndex;
return;
}
}
elog(ERROR, "jobId " UINT64_FORMAT " is not owned by resource owner %p",
jobId, owner);
}


@ -0,0 +1,118 @@
/*-------------------------------------------------------------------------
*
* resource_lock.c
* Locking Infrastructure for CitusDB.
*
* To avoid introducing a new type of locktag - that then could not be
* displayed by core functionality - we reuse advisory locks. If we'd just
 * reused them directly we'd run the risk of conflicting with user-defined
 * advisory locks, but luckily advisory locks use only two values for 'field4'
 * in the locktag.
*
* Copyright (c) 2012-2015, Citus Data, Inc.
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "distributed/resource_lock.h"
#include "storage/lmgr.h"
/*
* LockShardDistributionMetadata returns after grabbing a lock for distribution
* metadata related to the specified shard, blocking if required. ExclusiveLock
* and ShareLock modes are supported. Any locks acquired using this method are
* released at transaction end.
*/
void
LockShardDistributionMetadata(int64 shardId, LOCKMODE lockMode)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
SET_LOCKTAG_SHARD_METADATA_RESOURCE(tag, MyDatabaseId, shardId);
(void) LockAcquire(&tag, lockMode, sessionLock, dontWait);
}
/*
 * LockRelationDistributionMetadata returns after getting the lock used for a
* relation's distribution metadata, blocking if required. Only ExclusiveLock
* and ShareLock modes are supported. Any locks acquired using this method are
* released at transaction end.
*/
void
LockRelationDistributionMetadata(Oid relationId, LOCKMODE lockMode)
{
Assert(lockMode == ExclusiveLock || lockMode == ShareLock);
(void) LockRelationOid(relationId, lockMode);
}
/*
* LockShardResource acquires a lock needed to modify data on a remote shard.
* This task may be assigned to multiple backends at the same time, so the lock
* manages any concurrency issues associated with shard file fetching and DML
* command execution.
*/
void
LockShardResource(uint64 shardId, LOCKMODE lockmode)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
SET_LOCKTAG_SHARD_RESOURCE(tag, MyDatabaseId, shardId);
(void) LockAcquire(&tag, lockmode, sessionLock, dontWait);
}
/* Releases the lock associated with the relay file fetching/DML task. */
void
UnlockShardResource(uint64 shardId, LOCKMODE lockmode)
{
LOCKTAG tag;
const bool sessionLock = false;
SET_LOCKTAG_SHARD_RESOURCE(tag, MyDatabaseId, shardId);
LockRelease(&tag, lockmode, sessionLock);
}
/*
* LockJobResource acquires a lock for creating resources associated with the
* given jobId. This resource is typically a job schema (namespace), and less
* commonly a partition task directory.
*/
void
LockJobResource(uint64 jobId, LOCKMODE lockmode)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
SET_LOCKTAG_JOB_RESOURCE(tag, MyDatabaseId, jobId);
(void) LockAcquire(&tag, lockmode, sessionLock, dontWait);
}
/* Releases the lock for resources associated with the given job id. */
void
UnlockJobResource(uint64 jobId, LOCKMODE lockmode)
{
LOCKTAG tag;
const bool sessionLock = false;
SET_LOCKTAG_JOB_RESOURCE(tag, MyDatabaseId, jobId);
LockRelease(&tag, lockmode, sessionLock);
}
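/*
 * Typical pairing (illustrative): callers bracket creation of job-scoped
 * resources with these functions, e.g.
 *
 *     LockJobResource(jobId, AccessExclusiveLock);
 *     ... create job schema or partition task directory ...
 *     UnlockJobResource(jobId, AccessExclusiveLock);
 */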

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -0,0 +1,387 @@
/*-------------------------------------------------------------------------
*
* task_tracker_protocol.c
*
* The task tracker background process runs on every worker node. The following
* routines allow for the master node to assign tasks to the task tracker, check
* these tasks' statuses, and remove these tasks when they are no longer needed.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include <time.h>
#include "access/xact.h"
#include "commands/dbcommands.h"
#include "commands/schemacmds.h"
#include "distributed/multi_client_executor.h"
#include "distributed/multi_server_executor.h"
#include "distributed/resource_lock.h"
#include "distributed/task_tracker.h"
#include "distributed/task_tracker_protocol.h"
#include "distributed/worker_protocol.h"
#include "storage/lwlock.h"
#include "storage/pmsignal.h"
#include "utils/builtins.h"
/* Local functions forward declarations */
static bool TaskTrackerRunning(void);
static void CreateJobSchema(StringInfo schemaName);
static void CreateTask(uint64 jobId, uint32 taskId, char *taskCallString);
static void UpdateTask(WorkerTask *workerTask, char *taskCallString);
static void CleanupTask(WorkerTask *workerTask);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(task_tracker_assign_task);
PG_FUNCTION_INFO_V1(task_tracker_task_status);
PG_FUNCTION_INFO_V1(task_tracker_cleanup_job);
/*
* task_tracker_assign_task creates a new task in the shared hash or updates an
* already existing task. The function also creates a schema for the job if it
* doesn't already exist.
*/
Datum
task_tracker_assign_task(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
uint32 taskId = PG_GETARG_UINT32(1);
text *taskCallStringText = PG_GETARG_TEXT_P(2);
StringInfo jobSchemaName = JobSchemaName(jobId);
bool schemaExists = false;
WorkerTask *workerTask = NULL;
char *taskCallString = text_to_cstring(taskCallStringText);
uint32 taskCallStringLength = strlen(taskCallString);
/* check that we have a running task tracker on this host */
bool taskTrackerRunning = TaskTrackerRunning();
if (!taskTrackerRunning)
{
ereport(ERROR, (errcode(ERRCODE_CANNOT_CONNECT_NOW),
errmsg("the task tracker has been disabled or shut down")));
}
/* check that we have enough space in our shared hash for this string */
if (taskCallStringLength >= TASK_CALL_STRING_SIZE)
{
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("task call string exceeds maximum assignable length")));
}
/*
* If the schema does not exist, we create it. However, the schema does not
* become visible to other processes until the transaction commits, and we
* therefore do not release the resource lock in this case. Otherwise, the
* schema is already visible, and we immediately release the resource lock.
*/
LockJobResource(jobId, AccessExclusiveLock);
schemaExists = JobSchemaExists(jobSchemaName);
if (!schemaExists)
{
/* lock gets automatically released upon return from this function */
CreateJobSchema(jobSchemaName);
}
else
{
UnlockJobResource(jobId, AccessExclusiveLock);
}
LWLockAcquire(WorkerTasksSharedState->taskHashLock, LW_EXCLUSIVE);
/* check if we already have the task in our shared hash */
workerTask = WorkerTasksHashFind(jobId, taskId);
if (workerTask == NULL)
{
CreateTask(jobId, taskId, taskCallString);
}
else
{
UpdateTask(workerTask, taskCallString);
}
LWLockRelease(WorkerTasksSharedState->taskHashLock);
PG_RETURN_VOID();
}
/* Returns the task status of an already existing task. */
Datum
task_tracker_task_status(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
uint32 taskId = PG_GETARG_UINT32(1);
WorkerTask *workerTask = NULL;
uint32 taskStatus = 0;
bool taskTrackerRunning = TaskTrackerRunning();
if (taskTrackerRunning)
{
LWLockAcquire(WorkerTasksSharedState->taskHashLock, LW_SHARED);
workerTask = WorkerTasksHashFind(jobId, taskId);
if (workerTask == NULL)
{
ereport(ERROR, (errmsg("could not find the worker task"),
errdetail("Task jobId: " UINT64_FORMAT " and taskId: %u",
jobId, taskId)));
}
taskStatus = (uint32) workerTask->taskStatus;
LWLockRelease(WorkerTasksSharedState->taskHashLock);
}
else
{
ereport(ERROR, (errcode(ERRCODE_CANNOT_CONNECT_NOW),
errmsg("the task tracker has been disabled or shut down")));
}
PG_RETURN_UINT32(taskStatus);
}
/*
* task_tracker_cleanup_job finds all tasks for the given job, and cleans up
 * files, connections, and shared hash entries associated with these tasks.
*/
Datum
task_tracker_cleanup_job(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
HASH_SEQ_STATUS status;
WorkerTask *currentTask = NULL;
StringInfo jobDirectoryName = NULL;
StringInfo jobSchemaName = NULL;
/*
* We first clean up any open connections, and remove tasks belonging to
* this job from the shared hash.
*/
LWLockAcquire(WorkerTasksSharedState->taskHashLock, LW_EXCLUSIVE);
hash_seq_init(&status, WorkerTasksSharedState->taskHash);
currentTask = (WorkerTask *) hash_seq_search(&status);
while (currentTask != NULL)
{
if (currentTask->jobId == jobId)
{
CleanupTask(currentTask);
}
currentTask = (WorkerTask *) hash_seq_search(&status);
}
LWLockRelease(WorkerTasksSharedState->taskHashLock);
/*
* We then delete the job directory and schema, if they exist. This cleans
* up all intermediate files and tables allocated for the job. Note that the
* schema drop call can block if another process is creating the schema or
* writing to a table within the schema.
*/
jobDirectoryName = JobDirectoryName(jobId);
RemoveDirectory(jobDirectoryName);
LockJobResource(jobId, AccessExclusiveLock);
jobSchemaName = JobSchemaName(jobId);
RemoveJobSchema(jobSchemaName);
UnlockJobResource(jobId, AccessExclusiveLock);
PG_RETURN_VOID();
}
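/*
 * Illustrative master-side usage of the three UDFs above (the jobId and
 * taskId values are made up; in practice the master issues these remotely):
 *
 *     SELECT task_tracker_assign_task(6, 101, 'SELECT 1');
 *     SELECT task_tracker_task_status(6, 101);
 *     SELECT task_tracker_cleanup_job(6);
 */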
/*
* TaskTrackerRunning checks if the task tracker process is running. To do this,
* the function checks if the task tracker is configured to start up, and infers
* from shared memory that the tracker hasn't received a shut down request.
*/
static bool
TaskTrackerRunning(void)
{
WorkerTask *workerTask = NULL;
bool postmasterAlive = true;
bool taskTrackerRunning = true;
/* if postmaster shut down, infer task tracker shut down from it */
postmasterAlive = PostmasterIsAlive();
if (!postmasterAlive)
{
return false;
}
/*
* When the task tracker receives a termination signal, it inserts a special
* marker task to the shared hash. We need to look up this marker task since
* the postmaster doesn't send a terminate signal to running backends.
*/
LWLockAcquire(WorkerTasksSharedState->taskHashLock, LW_SHARED);
workerTask = WorkerTasksHashFind(RESERVED_JOB_ID, SHUTDOWN_MARKER_TASK_ID);
if (workerTask != NULL)
{
taskTrackerRunning = false;
}
LWLockRelease(WorkerTasksSharedState->taskHashLock);
return taskTrackerRunning;
}
/*
* CreateJobSchema creates a job schema with the given schema name. Note that
* this function ensures that our pg_ prefixed schema names can be created.
* Further note that the created schema does not become visible to other
* processes until the transaction commits.
*/
static void
CreateJobSchema(StringInfo schemaName)
{
const char *queryString = NULL;
bool oldAllowSystemTableMods = false;
CreateSchemaStmt *createSchemaStmt = makeNode(CreateSchemaStmt);
createSchemaStmt->schemaname = schemaName->data;
#if (PG_VERSION_NUM >= 90500)
createSchemaStmt->authrole = NULL;
#else
createSchemaStmt->authid = NULL;
#endif
createSchemaStmt->schemaElts = NIL;
/* allow schema names that start with pg_ */
oldAllowSystemTableMods = allowSystemTableMods;
allowSystemTableMods = true;
CreateSchemaCommand(createSchemaStmt, queryString);
CommandCounterIncrement();
allowSystemTableMods = oldAllowSystemTableMods;
}
/*
* CreateTask creates a new task in shared hash, initializes the task, and sets
* the task to assigned state. Note that this function expects the caller to
* hold an exclusive lock over the shared hash.
*/
static void
CreateTask(uint64 jobId, uint32 taskId, char *taskCallString)
{
WorkerTask *workerTask = NULL;
uint32 assignmentTime = 0;
char *databaseName = get_database_name(MyDatabaseId);
/* increase task priority for cleanup tasks */
assignmentTime = (uint32) time(NULL);
if (taskId == JOB_CLEANUP_TASK_ID)
{
assignmentTime = HIGH_PRIORITY_TASK_TIME;
}
/* enter the worker task into shared hash and initialize the task */
workerTask = WorkerTasksHashEnter(jobId, taskId);
workerTask->assignedAt = assignmentTime;
strncpy(workerTask->taskCallString, taskCallString, TASK_CALL_STRING_SIZE);
workerTask->taskStatus = TASK_ASSIGNED;
workerTask->connectionId = INVALID_CONNECTION_ID;
workerTask->failureCount = 0;
strncpy(workerTask->databaseName, databaseName, NAMEDATALEN);
}
/*
* UpdateTask updates the call string text for an already existing task. Note
* that this function expects the caller to hold an exclusive lock over the
* shared hash.
*/
static void
UpdateTask(WorkerTask *workerTask, char *taskCallString)
{
TaskStatus taskStatus = TASK_STATUS_INVALID_FIRST;
taskStatus = workerTask->taskStatus;
Assert(taskStatus != TASK_STATUS_INVALID_FIRST);
/*
* 1. If the task has succeeded or has been canceled, we don't do anything.
* 2. If the task has permanently failed, we update the task call string,
* reset the failure count, and change the task's status to schedulable.
	 * 3. If the task is still in progress, we update the task call string, and reset
* the failure count.
*/
if (taskStatus == TASK_SUCCEEDED || taskStatus == TASK_CANCEL_REQUESTED ||
taskStatus == TASK_CANCELED)
{
; /* nothing to do */
}
else if (taskStatus == TASK_PERMANENTLY_FAILED)
{
strncpy(workerTask->taskCallString, taskCallString, TASK_CALL_STRING_SIZE);
workerTask->failureCount = 0;
workerTask->taskStatus = TASK_ASSIGNED;
}
else
{
strncpy(workerTask->taskCallString, taskCallString, TASK_CALL_STRING_SIZE);
workerTask->failureCount = 0;
}
}
/* Cleans up connection and shared hash entry associated with the given task. */
static void
CleanupTask(WorkerTask *workerTask)
{
WorkerTask *taskRemoved = NULL;
void *hashKey = (void *) workerTask;
/*
* If the connection is still valid, the master node decided to terminate
* the task prematurely. This can happen when the user wants to cancel the
* query, or when a speculatively executed task finishes elsewhere and the
* query completes.
*/
if (workerTask->connectionId != INVALID_CONNECTION_ID)
{
/*
* The task tracker process owns the connections to local backends, and
		 * we cannot interfere with those connections from another process. We
* therefore ask the task tracker to clean up the connection and to
* remove the task from the shared hash. Note that one of the cleaned up
* tasks will always be the clean-up task itself.
*/
ereport(DEBUG3, (errmsg("requesting cancel for worker task"),
errdetail("Task jobId: " UINT64_FORMAT " and taskId: %u",
workerTask->jobId, workerTask->taskId)));
workerTask->taskStatus = TASK_CANCEL_REQUESTED;
return;
}
/* remove the task from the shared hash */
taskRemoved = hash_search(WorkerTasksSharedState->taskHash, hashKey, HASH_REMOVE,
NULL);
if (taskRemoved == NULL)
{
ereport(FATAL, (errmsg("worker task hash corrupted")));
}
}

File diff suppressed because it is too large


@ -0,0 +1,87 @@
/*-------------------------------------------------------------------------
*
* worker_file_access_protocol.c
*
* Routines for accessing file related information on this worker node.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "commands/defrem.h"
#include "distributed/master_protocol.h"
#include "distributed/worker_protocol.h"
#include "foreign/foreign.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(worker_foreign_file_path);
PG_FUNCTION_INFO_V1(worker_find_block_local_path);
/*
* worker_foreign_file_path resolves the foreign table for the given table name,
* and extracts and returns the file path associated with that foreign table.
*/
Datum
worker_foreign_file_path(PG_FUNCTION_ARGS)
{
text *foreignTableName = PG_GETARG_TEXT_P(0);
text *foreignFilePath = NULL;
Oid relationId = ResolveRelationId(foreignTableName);
ForeignTable *foreignTable = GetForeignTable(relationId);
ListCell *optionCell = NULL;
foreach(optionCell, foreignTable->options)
{
DefElem *option = (DefElem *) lfirst(optionCell);
char *optionName = option->defname;
int compareResult = strncmp(optionName, FOREIGN_FILENAME_OPTION, MAXPGPATH);
if (compareResult == 0)
{
char *optionValue = defGetString(option);
foreignFilePath = cstring_to_text(optionValue);
break;
}
}
/* check that we found the filename option */
if (foreignFilePath == NULL)
{
char *relationName = get_rel_name(relationId);
ereport(ERROR, (errmsg("could not find filename for foreign table: \"%s\"",
relationName)));
}
PG_RETURN_TEXT_P(foreignFilePath);
}
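/*
 * Example invocation (illustrative table name):
 *
 *     SELECT worker_foreign_file_path('lineitem_102009');
 *
 * returns the value of the foreign table's filename option.
 */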
/*
* Protocol declaration for a function whose future implementation will find the
* given HDFS block's local file path.
*/
Datum
worker_find_block_local_path(PG_FUNCTION_ARGS)
{
int64 blockId = PG_GETARG_INT64(0);
ArrayType *dataDirectoryObject = PG_GETARG_ARRAYTYPE_P(1);
/* keep the compiler silent */
(void) blockId;
(void) dataDirectoryObject;
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("called function is currently unsupported")));
PG_RETURN_TEXT_P(NULL);
}


@ -0,0 +1,547 @@
/*-------------------------------------------------------------------------
*
* worker_merge_protocol.c
*
* Routines for merging partitioned files into a single file or table. Merging
 * files is one of the three distributed execution primitives that we apply on
* worker nodes.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#ifdef HAVE_INTTYPES_H
#include <inttypes.h>
#endif
#include "access/htup_details.h"
#include "access/xact.h"
#include "catalog/dependency.h"
#include "catalog/pg_namespace.h"
#include "commands/copy.h"
#include "commands/tablecmds.h"
#include "distributed/worker_protocol.h"
#include "executor/spi.h"
#include "nodes/makefuncs.h"
#include "parser/parse_type.h"
#include "storage/lmgr.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
/* Local functions forward declarations */
static List * ArrayObjectToCStringList(ArrayType *arrayObject);
static void CreateTaskTable(StringInfo schemaName, StringInfo relationName,
List *columnNameList, List *columnTypeList);
static void CopyTaskFilesFromDirectory(StringInfo schemaName, StringInfo relationName,
StringInfo sourceDirectoryName);
/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(worker_merge_files_into_table);
PG_FUNCTION_INFO_V1(worker_merge_files_and_run_query);
PG_FUNCTION_INFO_V1(worker_cleanup_job_schema_cache);
/*
* worker_merge_files_into_table creates a task table within the job's schema,
* which should have already been created by the task tracker protocol, and
* copies files in its task directory into this table. If the schema doesn't
* exist, the function defaults to the 'public' schema. Note that, unlike
* partitioning functions, this function is not always idempotent. On success,
* the function creates the table and loads data, and subsequent calls to the
* function error out because the table already exist. On failure, the task
* table creation commands are rolled back, and the function can be called
* again.
*/
Datum
worker_merge_files_into_table(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
uint32 taskId = PG_GETARG_UINT32(1);
ArrayType *columnNameObject = PG_GETARG_ARRAYTYPE_P(2);
ArrayType *columnTypeObject = PG_GETARG_ARRAYTYPE_P(3);
StringInfo jobSchemaName = JobSchemaName(jobId);
StringInfo taskTableName = TaskTableName(taskId);
StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
bool schemaExists = false;
List *columnNameList = NIL;
List *columnTypeList = NIL;
/* we should have the same number of column names and types */
int32 columnNameCount = ArrayObjectCount(columnNameObject);
int32 columnTypeCount = ArrayObjectCount(columnTypeObject);
if (columnNameCount != columnTypeCount)
{
ereport(ERROR, (errmsg("column name array size: %d and type array size: %d"
" do not match", columnNameCount, columnTypeCount)));
}
/*
* If the schema for the job isn't already created by the task tracker
	 * protocol, we fall back to using the default 'public' schema.
*/
schemaExists = JobSchemaExists(jobSchemaName);
if (!schemaExists)
{
resetStringInfo(jobSchemaName);
appendStringInfoString(jobSchemaName, "public");
}
/* create the task table and copy files into the table */
columnNameList = ArrayObjectToCStringList(columnNameObject);
columnTypeList = ArrayObjectToCStringList(columnTypeObject);
CreateTaskTable(jobSchemaName, taskTableName, columnNameList, columnTypeList);
CopyTaskFilesFromDirectory(jobSchemaName, taskTableName, taskDirectoryName);
PG_RETURN_VOID();
}
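/*
 * Example invocation (illustrative values; the column name and type arrays
 * must have matching lengths):
 *
 *     SELECT worker_merge_files_into_table(6, 42,
 *                                          ARRAY['l_orderkey', 'l_quantity'],
 *                                          ARRAY['bigint', 'decimal(15,2)']);
 */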
/*
* worker_merge_files_and_run_query creates a merge task table within the job's
* schema, which should have already been created by the task tracker protocol.
* It copies files in its task directory into this table. Then it runs final
* query to create result table of the job.
*
 * Note that we follow a different approach here than worker_merge_files_into_table()
 * when creating the merge task table. In the future we should unify these two
 * approaches, for instance by creating a directory_fdw extension and using it;
 * files could then be merged through directory_fdw, with or without a query.
*/
Datum
worker_merge_files_and_run_query(PG_FUNCTION_ARGS)
{
uint64 jobId = PG_GETARG_INT64(0);
uint32 taskId = PG_GETARG_UINT32(1);
text *createMergeTableQueryText = PG_GETARG_TEXT_P(2);
text *createIntermediateTableQueryText = PG_GETARG_TEXT_P(3);
const char *createMergeTableQuery = text_to_cstring(createMergeTableQueryText);
const char *createIntermediateTableQuery =
text_to_cstring(createIntermediateTableQueryText);
StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
StringInfo jobSchemaName = JobSchemaName(jobId);
StringInfo intermediateTableName = TaskTableName(taskId);
StringInfo mergeTableName = makeStringInfo();
StringInfo setSearchPathString = makeStringInfo();
bool schemaExists = false;
int connected = 0;
int setSearchPathResult = 0;
int createMergeTableResult = 0;
int createIntermediateTableResult = 0;
int finished = 0;
/*
* If the schema for the job isn't already created by the task tracker
	 * protocol, we fall back to using the default 'public' schema.
*/
schemaExists = JobSchemaExists(jobSchemaName);
if (!schemaExists)
{
resetStringInfo(jobSchemaName);
appendStringInfoString(jobSchemaName, "public");
}
appendStringInfo(setSearchPathString, SET_SEARCH_PATH_COMMAND, jobSchemaName->data);
connected = SPI_connect();
if (connected != SPI_OK_CONNECT)
{
ereport(ERROR, (errmsg("could not connect to SPI manager")));
}
setSearchPathResult = SPI_exec(setSearchPathString->data, 0);
if (setSearchPathResult < 0)
{
ereport(ERROR, (errmsg("execution was not successful \"%s\"",
setSearchPathString->data)));
}
createMergeTableResult = SPI_exec(createMergeTableQuery, 0);
if (createMergeTableResult < 0)
{
ereport(ERROR, (errmsg("execution was not successful \"%s\"",
createMergeTableQuery)));
}
appendStringInfo(mergeTableName, "%s%s", intermediateTableName->data,
MERGE_TABLE_SUFFIX);
CopyTaskFilesFromDirectory(jobSchemaName, mergeTableName, taskDirectoryName);
createIntermediateTableResult = SPI_exec(createIntermediateTableQuery, 0);
if (createIntermediateTableResult < 0)
{
ereport(ERROR, (errmsg("execution was not successful \"%s\"",
createIntermediateTableQuery)));
}
finished = SPI_finish();
if (finished != SPI_OK_FINISH)
{
ereport(ERROR, (errmsg("could not disconnect from SPI manager")));
}
PG_RETURN_VOID();
}
/*
* worker_cleanup_job_schema_cache walks over all schemas in the database, and
* removes schemas whose names start with the job schema prefix. Note that this
* function does not perform any locking; we expect it to be called at process
* start-up time before any merge tasks are run. Further note that this function
* runs within the scope of a particular database (template1, postgres) and can
* only delete schemas within that database.
*/
Datum
worker_cleanup_job_schema_cache(PG_FUNCTION_ARGS)
{
Relation pgNamespace = NULL;
HeapScanDesc scanDescriptor = NULL;
ScanKey scanKey = NULL;
int scanKeyCount = 0;
HeapTuple heapTuple = NULL;
pgNamespace = heap_open(NamespaceRelationId, AccessExclusiveLock);
scanDescriptor = heap_beginscan_catalog(pgNamespace, scanKeyCount, scanKey);
heapTuple = heap_getnext(scanDescriptor, ForwardScanDirection);
while (HeapTupleIsValid(heapTuple))
{
Form_pg_namespace schemaForm = (Form_pg_namespace) GETSTRUCT(heapTuple);
char *schemaName = NameStr(schemaForm->nspname);
char *jobSchemaFound = strstr(schemaName, JOB_SCHEMA_PREFIX);
if (jobSchemaFound != NULL)
{
StringInfo jobSchemaName = makeStringInfo();
appendStringInfoString(jobSchemaName, schemaName);
RemoveJobSchema(jobSchemaName);
}
heapTuple = heap_getnext(scanDescriptor, ForwardScanDirection);
}
heap_endscan(scanDescriptor);
heap_close(pgNamespace, AccessExclusiveLock);
PG_RETURN_VOID();
}
/* Constructs a standardized job schema name for the given job id. */
StringInfo
JobSchemaName(uint64 jobId)
{
/*
* We need to apply padding on our 64-bit job id, and therefore cannot use
* UINT64_FORMAT here.
*/
#ifdef HAVE_INTTYPES_H
StringInfo jobSchemaName = makeStringInfo();
appendStringInfo(jobSchemaName, "%s%0*"PRIu64,
JOB_SCHEMA_PREFIX, MIN_JOB_DIRNAME_WIDTH, jobId);
#else
StringInfo jobSchemaName = makeStringInfo();
appendStringInfo(jobSchemaName, "%s%0*llu",
JOB_SCHEMA_PREFIX, MIN_JOB_DIRNAME_WIDTH, jobId);
#endif
return jobSchemaName;
}
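/*
 * For example, with a prefix of "pg_merge_job_" and a minimum width of six
 * (illustrative values; the real ones come from JOB_SCHEMA_PREFIX and
 * MIN_JOB_DIRNAME_WIDTH), jobId 42 formats as "pg_merge_job_000042".
 */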
/* Constructs a standardized task table name for the given task id. */
StringInfo
TaskTableName(uint32 taskId)
{
StringInfo taskTableName = makeStringInfo();
appendStringInfo(taskTableName, "%s%0*u",
TASK_TABLE_PREFIX, MIN_TASK_FILENAME_WIDTH, taskId);
return taskTableName;
}
/* Creates a list of cstrings from a single dimensional array object. */
static List *
ArrayObjectToCStringList(ArrayType *arrayObject)
{
List *cstringList = NIL;
Datum *datumArray = DeconstructArrayObject(arrayObject);
int32 arraySize = ArrayObjectCount(arrayObject);
int32 arrayIndex = 0;
for (arrayIndex = 0; arrayIndex < arraySize; arrayIndex++)
{
Datum datum = datumArray[arrayIndex];
char *cstring = TextDatumGetCString(datum);
cstringList = lappend(cstringList, cstring);
}
Assert(cstringList != NIL);
return cstringList;
}
/* Checks if a schema with the given schema name exists. */
bool
JobSchemaExists(StringInfo schemaName)
{
Datum schemaNameDatum = CStringGetDatum(schemaName->data);
bool schemaExists = SearchSysCacheExists(NAMESPACENAME, schemaNameDatum, 0, 0, 0);
return schemaExists;
}
/* Removes the schema and all tables within the schema, if the schema exists. */
void
RemoveJobSchema(StringInfo schemaName)
{
Datum schemaNameDatum = CStringGetDatum(schemaName->data);
Oid schemaId = InvalidOid;
schemaId = GetSysCacheOid(NAMESPACENAME, schemaNameDatum, 0, 0, 0);
if (OidIsValid(schemaId))
{
ObjectAddress schemaObject = { 0, 0, 0 };
bool showNotices = false;
bool permissionsOK = pg_namespace_ownercheck(schemaId, GetUserId());
if (!permissionsOK)
{
aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_NAMESPACE, schemaName->data);
}
schemaObject.classId = NamespaceRelationId;
schemaObject.objectId = schemaId;
schemaObject.objectSubId = 0;
/*
* We first delete all tables in this schema. Rather than relying on the
* schema command, we call the dependency mechanism directly so that we
* can suppress notice messages that are typically displayed during
* cascading deletes.
*/
deleteWhatDependsOn(&schemaObject, showNotices);
CommandCounterIncrement();
/* drop the empty schema */
performDeletion(&schemaObject, DROP_RESTRICT, 0);
CommandCounterIncrement();
}
else
{
ereport(DEBUG2, (errmsg("schema \"%s\" does not exist, skipping",
schemaName->data)));
}
}
/* Creates a simple table that only defines columns, in the given schema. */
static void
CreateTaskTable(StringInfo schemaName, StringInfo relationName,
List *columnNameList, List *columnTypeList)
{
CreateStmt *createStatement = NULL;
RangeVar *relation = NULL;
List *columnDefinitionList = NIL;
Oid relationId = InvalidOid;
#if (PG_VERSION_NUM >= 90500)
ObjectAddress relationObject;
#endif
Assert(schemaName != NULL);
Assert(relationName != NULL);
/*
* This new relation doesn't log to WAL, as the table creation and data copy
	 * statements occur in the same transaction. Still, we may want to make
	 * the relation explicitly unlogged in the future.
*/
relation = makeRangeVar(schemaName->data, relationName->data, -1);
columnDefinitionList = ColumnDefinitionList(columnNameList, columnTypeList);
createStatement = CreateStatement(relation, columnDefinitionList);
#if (PG_VERSION_NUM >= 90500)
relationObject = DefineRelation(createStatement, RELKIND_RELATION, InvalidOid, NULL);
relationId = relationObject.objectId;
#else
relationId = DefineRelation(createStatement, RELKIND_RELATION, InvalidOid);
#endif
Assert(relationId != InvalidOid);
CommandCounterIncrement();
}
/*
* ColumnDefinitionList creates and returns a list of column definition objects
* from two lists of column names and types. As an example, this function takes
* in two single elements lists: "l_quantity" and "decimal(15, 2)". The function
* then returns a list with one column definition, where the column's name is
* l_quantity, its type is numeric, and the type modifier represents (15, 2).
*/
List *
ColumnDefinitionList(List *columnNameList, List *columnTypeList)
{
List *columnDefinitionList = NIL;
ListCell *columnNameCell = NULL;
ListCell *columnTypeCell = NULL;
forboth(columnNameCell, columnNameList, columnTypeCell, columnTypeList)
{
const char *columnName = (const char *) lfirst(columnNameCell);
const char *columnType = (const char *) lfirst(columnTypeCell);
/*
* We should have a SQL compatible column type declaration; we first
* convert this type to PostgreSQL's type identifiers and modifiers.
*/
Oid columnTypeId = InvalidOid;
int32 columnTypeMod = -1;
bool missingOK = false;
TypeName *typeName = NULL;
ColumnDef *columnDefinition = NULL;
parseTypeString(columnType, &columnTypeId, &columnTypeMod, missingOK);
typeName = makeTypeNameFromOid(columnTypeId, columnTypeMod);
/* we then create the column definition */
columnDefinition = makeNode(ColumnDef);
columnDefinition->colname = (char *) columnName;
columnDefinition->typeName = typeName;
columnDefinition->is_local = true;
columnDefinition->is_not_null = false;
columnDefinition->raw_default = NULL;
columnDefinition->cooked_default = NULL;
columnDefinition->constraints = NIL;
columnDefinitionList = lappend(columnDefinitionList, columnDefinition);
}
return columnDefinitionList;
}
/*
* CreateStatement creates and initializes a simple table create statement that
* only has column definitions.
*/
CreateStmt *
CreateStatement(RangeVar *relation, List *columnDefinitionList)
{
CreateStmt *createStatement = makeNode(CreateStmt);
createStatement->relation = relation;
createStatement->tableElts = columnDefinitionList;
createStatement->inhRelations = NIL;
createStatement->constraints = NIL;
createStatement->options = NIL;
createStatement->oncommit = ONCOMMIT_NOOP;
createStatement->tablespacename = NULL;
createStatement->if_not_exists = false;
return createStatement;
}
/*
* CopyTaskFilesFromDirectory finds all files in the given directory, except for
* those having an attempt suffix. The function then copies these files into the
* database table identified by the given schema and table name.
*/
static void
CopyTaskFilesFromDirectory(StringInfo schemaName, StringInfo relationName,
StringInfo sourceDirectoryName)
{
const char *directoryName = sourceDirectoryName->data;
struct dirent *directoryEntry = NULL;
uint64 copiedRowTotal = 0;
DIR *directory = AllocateDir(directoryName);
if (directory == NULL)
{
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", directoryName)));
}
directoryEntry = ReadDir(directory, directoryName);
for (; directoryEntry != NULL; directoryEntry = ReadDir(directory, directoryName))
{
const char *baseFilename = directoryEntry->d_name;
const char *queryString = NULL;
StringInfo fullFilename = NULL;
RangeVar *relation = NULL;
CopyStmt *copyStatement = NULL;
uint64 copiedRowCount = 0;
/* if system file or lingering task file, skip it */
if (strncmp(baseFilename, ".", MAXPGPATH) == 0 ||
strncmp(baseFilename, "..", MAXPGPATH) == 0 ||
strstr(baseFilename, ATTEMPT_FILE_SUFFIX) != NULL)
{
continue;
}
fullFilename = makeStringInfo();
appendStringInfo(fullFilename, "%s/%s", directoryName, baseFilename);
/* build relation object and copy statement */
relation = makeRangeVar(schemaName->data, relationName->data, -1);
copyStatement = CopyStatement(relation, fullFilename->data);
if (BinaryWorkerCopyFormat)
{
DefElem *copyOption = makeDefElem("format", (Node *) makeString("binary"));
copyStatement->options = list_make1(copyOption);
}
DoCopy(copyStatement, queryString, &copiedRowCount);
copiedRowTotal += copiedRowCount;
CommandCounterIncrement();
}
ereport(DEBUG2, (errmsg("copied " UINT64_FORMAT " rows into table: \"%s.%s\"",
copiedRowTotal, schemaName->data, relationName->data)));
FreeDir(directory);
}
/*
* CopyStatement creates and initializes a copy statement to read the given
* file's contents into the given table, using copy's standard text format.
*/
CopyStmt *
CopyStatement(RangeVar *relation, char *sourceFilename)
{
CopyStmt *copyStatement = makeNode(CopyStmt);
copyStatement->relation = relation;
copyStatement->query = NULL;
copyStatement->attlist = NIL;
copyStatement->options = NIL;
copyStatement->is_from = true;
copyStatement->is_program = false;
copyStatement->filename = sourceFilename;
return copyStatement;
}
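/*
 * Illustration only (not part of the original source): a sketch of how
 * CopyStatement() feeds into DoCopy(), mirroring the loop in
 * CopyTaskFilesFromDirectory. The table name, file path, and csv format
 * option are made up for the example.
 */
static uint64
ExampleCopyFile(void)
{
	uint64 copiedRowCount = 0;
	RangeVar *relation = makeRangeVar("public", "example_table", -1);
	CopyStmt *copyStatement = CopyStatement(relation, "/tmp/example.csv");
	DefElem *copyOption = makeDefElem("format", (Node *) makeString("csv"));

	copyStatement->options = list_make1(copyOption);
	DoCopy(copyStatement, NULL, &copiedRowCount);

	return copiedRowCount;
}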

File diff suppressed because it is too large

2
src/bin/csql/.gitignore vendored Normal file

@ -0,0 +1,2 @@
/psqlscan.c
/csql

40
src/bin/csql/Makefile Normal file

@ -0,0 +1,40 @@
#-------------------------------------------------------------------------
#
# Makefile for src/bin/csql
#
# Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
# Portions Copyright (c) 1994, Regents of the University of California
#
# src/bin/csql/Makefile
#
#-------------------------------------------------------------------------
citusdb_subdir = src/bin/csql
citusdb_top_builddir = ../../..
PROGRAM = csql
PGFILEDESC = "csql - the CitusDB interactive terminal"
PGAPPICON=win32
OBJS = command.o common.o help.o input.o stringutils.o mainloop.o copy.o \
copy_options.o stage.o \
startup.o prompt.o variables.o large_obj.o print.o describe.o \
tab-complete.o mbprint.o dumputils.o keywords.o kwlookup.o \
sql_help.o \
$(WIN32RES)
PG_LIBS = $(libpq)
include $(citusdb_top_builddir)/Makefile.global
override CPPFLAGS += -I$(libpq_srcdir) -I$(top_srcdir)/src/bin/csql
# psqlscan is compiled as part of mainloop
mainloop.o: psqlscan.c
psqlscan.c: FLEXFLAGS = -Cfe -p -p
psqlscan.c: FLEX_NO_BACKUP=yes
clean: csql-clean
csql-clean:
rm -f csql$(X) $(OBJS) psqlscan.c lex.backup

3282
src/bin/csql/command.c Normal file

File diff suppressed because it is too large

43
src/bin/csql/command.h Normal file

@ -0,0 +1,43 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/command.h
*/
#ifndef COMMAND_H
#define COMMAND_H
#include "print.h"
#include "psqlscan.h"
typedef enum _backslashResult
{
PSQL_CMD_UNKNOWN = 0, /* not done parsing yet (internal only) */
PSQL_CMD_SEND, /* query complete; send off */
PSQL_CMD_SKIP_LINE, /* keep building query */
PSQL_CMD_TERMINATE, /* quit program */
PSQL_CMD_NEWEDIT, /* query buffer was changed (e.g., via \e) */
PSQL_CMD_ERROR /* the execution of the backslash command
* resulted in an error */
} backslashResult;
extern backslashResult HandleSlashCmds(PsqlScanState scan_state,
PQExpBuffer query_buf);
extern int process_file(char *filename, bool single_txn, bool use_relative_path);
extern bool do_pset(const char *param,
const char *value,
printQueryOpt *popt,
bool quiet);
extern void connection_warnings(bool in_startup);
extern void SyncVariables(void);
extern void UnsyncVariables(void);
#endif /* COMMAND_H */

1903
src/bin/csql/common.c Normal file

File diff suppressed because it is too large

59
src/bin/csql/common.h Normal file

@ -0,0 +1,59 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/common.h
*/
#ifndef COMMON_H
#define COMMON_H
#include "postgres_fe.h"
#include <setjmp.h>
#include "libpq-fe.h"
#include "print.h"
#define atooid(x) ((Oid) strtoul((x), NULL, 10))
extern bool openQueryOutputFile(const char *fname, FILE **fout, bool *is_pipe);
extern bool setQFout(const char *fname);
#if (PG_VERSION_NUM >= 90500)
extern void psql_error(const char *fmt,...) pg_attribute_printf(1, 2);
#else
extern void
psql_error(const char *fmt,...)
/* This lets gcc check the format string for consistency. */
__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
#endif
extern void NoticeProcessor(void *arg, const char *message);
extern volatile bool sigint_interrupt_enabled;
extern sigjmp_buf sigint_interrupt_jmp;
extern volatile bool cancel_pressed;
/* Note: cancel_pressed is defined in print.c, see that file for reasons */
extern void setup_cancel_handler(void);
extern void SetCancelConn(void);
extern void ResetCancelConn(void);
extern PGresult *PSQLexec(const char *query);
extern int PSQLexecWatch(const char *query, const printQueryOpt *opt);
extern bool SendQuery(const char *query);
extern bool is_superuser(void);
extern bool standard_strings(void);
extern const char *session_username(void);
extern void expand_tilde(char **filename);
extern bool recognized_connection_string(const char *connstr);
#endif /* COMMON_H */

595
src/bin/csql/copy.c Normal file

@ -0,0 +1,595 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/copy.c
*/
#include "postgres_fe.h"
#include "copy.h"
#include <signal.h>
#include <sys/stat.h>
#ifndef WIN32
#include <unistd.h> /* for isatty */
#else
#include <io.h> /* I think */
#endif
#include "libpq-fe.h"
#include "pqexpbuffer.h"
#include "dumputils.h"
#include "settings.h"
#include "common.h"
#include "prompt.h"
/*
* Execute a \copy command (frontend copy). We have to open a file (or execute
* a command), then submit a COPY query to the backend and either feed it data
* from the file or route its response into the file.
*/
bool
do_copy(const char *args)
{
copy_options *options = NULL;
PQExpBufferData query = { NULL, 0, 0 };
FILE *copystream = NULL;
bool success = false;
bool fileClosed = false;
/* parse options */
options = parse_slash_copy(args);
if (!options)
return false;
/* open file stream to copy data into or out of */
copystream = OpenCopyStream(options);
if (copystream == NULL)
{
free_copy_options(options);
return false;
}
/* build the command we will send to the backend */
initPQExpBuffer(&query);
printfPQExpBuffer(&query, "COPY ");
appendPQExpBufferStr(&query, options->before_tofrom);
if (options->from)
appendPQExpBufferStr(&query, " FROM STDIN ");
else
appendPQExpBufferStr(&query, " TO STDOUT ");
if (options->after_tofrom)
appendPQExpBufferStr(&query, options->after_tofrom);
/* run it like a user command, but with copystream as data source/sink */
pset.copyStream = copystream;
success = SendQuery(query.data);
pset.copyStream = NULL;
termPQExpBuffer(&query);
/* close file stream */
fileClosed = CloseCopyStream(options, copystream);
if (!fileClosed)
{
success = false;
}
free_copy_options(options);
return success;
}
/*
* HandleCopyData executes client-side copy data protocols by dispatching the
* call to the appropriate copy protocol function. On successful execution of
* the protocol, the function returns true. Otherwise, the function returns
* false.
*
* Please note that we refactored this function from a previous version (v9.1)
* of PostgreSQL so that copy.c and stage.c could share the same code path. Now
* that do_copy uses SendQuery(), we should move or re-refactor this function.
*/
bool
HandleCopyData(PGconn *connection, ExecStatusType copyStatus, bool copyIsBinary,
FILE *copyStream, uint64 copySizeLimit)
{
ExecStatusType drainStatus = 0;
PGresult *drainResult = NULL;
bool copyOK = true;
if (copyStatus == PGRES_COPY_OUT)
{
SetCancelConn();
copyOK = handleCopyOut(connection, copyStream, &drainResult);
ResetCancelConn();
}
else if (copyStatus == PGRES_COPY_IN)
{
SetCancelConn();
copyOK = handleCopyIn(connection, copyStream, copyIsBinary,
&drainResult, copySizeLimit);
ResetCancelConn();
}
else if (copyStatus == PGRES_BAD_RESPONSE ||
copyStatus == PGRES_NONFATAL_ERROR ||
copyStatus == PGRES_FATAL_ERROR)
{
psql_error("\\copy: %s", PQerrorMessage(connection));
copyOK = false;
}
else
{
psql_error("\\copy: unexpected response (%d)\n", copyStatus);
copyOK = false;
}
PQclear(drainResult);
/*
* Make sure we drain all results from libpq. Otherwise, the connection may
* still be in ASYNC_BUSY state, leading to false readings in get_prompt().
*/
drainResult = PQgetResult(connection);
while (drainResult != NULL)
{
copyOK = false;
drainStatus = PQresultStatus(drainResult);
psql_error("\\copy: unexpected response (%d)\n", drainStatus);
/* if we are still in COPY IN state, try to get out of it */
if (drainStatus == PGRES_COPY_IN)
{
PQputCopyEnd(connection, _("trying to exit copy mode"));
}
PQclear(drainResult);
drainResult = PQgetResult(connection);
}
return copyOK;
}
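/*
 * Illustration only (not part of the original source): a hedged sketch of
 * a HandleCopyData caller. Issue COPY over libpq, then hand the resulting
 * status off; HandleCopyData reports errors and drains the connection
 * itself. The table name is made up.
 */
static bool
ExampleClientCopy(PGconn *connection, FILE *dataStream)
{
	PGresult *result = PQexec(connection, "COPY example_table FROM STDIN");
	ExecStatusType copyStatus = PQresultStatus(result);
	bool copyOK = false;

	/* text format, no size limit */
	copyOK = HandleCopyData(connection, copyStatus, false, dataStream, 0);

	PQclear(result);
	return copyOK;
}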
/* Opens input or output stream to be used during copy command. */
FILE *
OpenCopyStream(const copy_options *options)
{
FILE *copyStream = NULL;
/* prepare to read or write the target file */
if (options->file && !options->program)
canonicalize_path(options->file);
if (options->from)
{
if (options->file)
{
if (options->program)
{
fflush(stdout);
fflush(stderr);
errno = 0;
copyStream = popen(options->file, PG_BINARY_R);
}
else
copyStream = fopen(options->file, PG_BINARY_R);
}
else if (!options->psql_inout)
copyStream = pset.cur_cmd_source;
else
copyStream = stdin;
}
else
{
if (options->file)
{
if (options->program)
{
fflush(stdout);
fflush(stderr);
errno = 0;
#ifndef WIN32
pqsignal(SIGPIPE, SIG_IGN);
#endif
copyStream = popen(options->file, PG_BINARY_W);
}
else
copyStream = fopen(options->file, PG_BINARY_W);
}
else if (!options->psql_inout)
copyStream = pset.queryFout;
else
copyStream = stdout;
}
if (!copyStream)
{
if (options->program)
psql_error("could not execute command \"%s\": %s\n",
options->file, strerror(errno));
else
psql_error("%s: %s\n",
options->file, strerror(errno));
return NULL;
}
if (!options->program)
{
struct stat st;
int result;
/* make sure the specified file is not a directory */
if ((result = fstat(fileno(copyStream), &st)) < 0)
psql_error("could not stat file \"%s\": %s\n",
options->file, strerror(errno));
if (result == 0 && S_ISDIR(st.st_mode))
psql_error("%s: cannot copy from/to a directory\n",
options->file);
if (result < 0 || S_ISDIR(st.st_mode))
{
fclose(copyStream);
return NULL;
}
}
return copyStream;
}
/* Closes file stream used during copy command, if any. */
bool
CloseCopyStream(const copy_options *options, FILE *copyStream)
{
bool success = true;
if (options->file != NULL)
{
if (options->program)
{
int pclose_rc = pclose(copyStream);
if (pclose_rc != 0)
{
if (pclose_rc < 0)
psql_error("could not close pipe to external command: %s\n",
strerror(errno));
else
{
char *reason = wait_result_to_str(pclose_rc);
psql_error("%s: %s\n", options->file,
reason ? reason : "");
if (reason)
free(reason);
}
success = false;
}
#ifndef WIN32
pqsignal(SIGPIPE, SIG_DFL);
#endif
}
else
{
if (fclose(copyStream) != 0)
{
psql_error("%s: %s\n", options->file, strerror(errno));
success = false;
}
}
}
return success;
}
/*
* Functions for handling COPY IN/OUT data transfer.
*
* If you want to use COPY TO STDOUT/FROM STDIN in your application,
* this is the code to steal ;)
*/
/*
* handleCopyOut
* receives data as a result of a COPY ... TO STDOUT command
*
* conn should be a database connection that you just issued COPY TO on
* and got back a PGRES_COPY_OUT result.
* copystream is the file stream for the data to go to.
* The final status for the COPY is returned into *res (but note
* we already reported the error, if it's not a success result).
*
* result is true if successful, false if not.
*/
bool
handleCopyOut(PGconn *conn, FILE *copystream, PGresult **res)
{
bool OK = true;
char *buf;
int ret;
for (;;)
{
ret = PQgetCopyData(conn, &buf, 0);
if (ret < 0)
break; /* done or server/connection error */
if (buf)
{
if (OK && fwrite(buf, 1, ret, copystream) != ret)
{
psql_error("could not write COPY data: %s\n",
strerror(errno));
/* complain only once, keep reading data from server */
OK = false;
}
PQfreemem(buf);
}
}
if (OK && fflush(copystream))
{
psql_error("could not write COPY data: %s\n",
strerror(errno));
OK = false;
}
if (ret == -2)
{
psql_error("COPY data transfer failed: %s", PQerrorMessage(conn));
OK = false;
}
/*
* Check command status and return to normal libpq state.
*
* If for some reason libpq is still reporting PGRES_COPY_OUT state, we
* would like to forcibly exit that state, since our caller would be
* unable to distinguish that situation from reaching the next COPY in a
* command string that happened to contain two consecutive COPY TO STDOUT
* commands. However, libpq provides no API for doing that, and in
* principle it's a libpq bug anyway if PQgetCopyData() returns -1 or -2
* but hasn't exited COPY_OUT state internally. So we ignore the
* possibility here.
*/
*res = PQgetResult(conn);
if (PQresultStatus(*res) != PGRES_COMMAND_OK)
{
psql_error("%s", PQerrorMessage(conn));
OK = false;
}
return OK;
}
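/*
 * Illustration only (not part of the original source): following the
 * "code to steal" note above, the minimal COPY OUT calling sequence looks
 * roughly like this. The connection is assumed established and the table
 * name is made up.
 */
static bool
ExampleCopyToStdout(PGconn *conn)
{
	PGresult *res = PQexec(conn, "COPY example_table TO STDOUT");
	bool OK = false;

	if (PQresultStatus(res) == PGRES_COPY_OUT)
	{
		PGresult *finalResult = NULL;

		OK = handleCopyOut(conn, stdout, &finalResult);
		PQclear(finalResult);
	}

	PQclear(res);
	return OK;
}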
/*
* handleCopyIn
* sends data to complete a COPY ... FROM STDIN command
*
* conn should be a database connection that you just issued COPY FROM on
* and got back a PGRES_COPY_IN result.
* copystream is the file stream to read the data from.
* isbinary can be set from PQbinaryTuples().
* The final status for the COPY is returned into *res (but note
* we already reported the error, if it's not a success result).
*
* result is true if successful, false if not.
*/
/* read chunk size for COPY IN - size set to double that of Hadoop's default */
#define COPYBUFSIZ 32768
bool
handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary,
PGresult **res, uint64 copySizeLimit)
{
bool OK;
const char *prompt;
char buf[COPYBUFSIZ];
uint64 bytesCopied = 0;
/*
* Establish longjmp destination for exiting from wait-for-input. (This is
* only effective while sigint_interrupt_enabled is TRUE.)
*/
if (sigsetjmp(sigint_interrupt_jmp, 1) != 0)
{
/* got here with longjmp */
/* Terminate data transfer */
PQputCopyEnd(conn,
(PQprotocolVersion(conn) < 3) ? NULL :
_("canceled by user"));
OK = false;
goto copyin_cleanup;
}
/* Prompt if interactive input */
if (isatty(fileno(copystream)))
{
if (!pset.quiet)
puts(_("Enter data to be copied followed by a newline.\n"
"End with a backslash and a period on a line by itself."));
prompt = get_prompt(PROMPT_COPY);
}
else
prompt = NULL;
OK = true;
if (isbinary)
{
/* interactive input probably silly, but give one prompt anyway */
if (prompt)
{
fputs(prompt, stdout);
fflush(stdout);
}
for (;;)
{
int buflen;
/* enable longjmp while waiting for input */
sigint_interrupt_enabled = true;
buflen = fread(buf, 1, COPYBUFSIZ, copystream);
sigint_interrupt_enabled = false;
if (buflen <= 0)
break;
if (PQputCopyData(conn, buf, buflen) <= 0)
{
OK = false;
break;
}
/* if size limit is set, copy at most that many bytes */
bytesCopied += buflen;
if (copySizeLimit > 0 && bytesCopied >= copySizeLimit)
{
break;
}
}
}
else
{
bool copydone = false;
while (!copydone)
{ /* for each input line ... */
bool firstload;
bool linedone;
if (prompt)
{
fputs(prompt, stdout);
fflush(stdout);
}
firstload = true;
linedone = false;
while (!linedone)
{ /* for each bufferload in line ... */
int linelen = 0;
char *fgresult;
/* enable longjmp while waiting for input */
sigint_interrupt_enabled = true;
fgresult = fgets(buf, sizeof(buf), copystream);
sigint_interrupt_enabled = false;
if (!fgresult)
{
copydone = true;
break;
}
linelen = strlen(buf);
/* current line is done? */
if (linelen > 0 && buf[linelen - 1] == '\n')
linedone = true;
/* check for EOF marker, but not on a partial line */
if (firstload)
{
/*
* This code erroneously assumes '\.' on a line alone
* inside a quoted CSV string terminates the \copy.
* http://www.postgresql.org/message-id/E1TdNVQ-0001ju-GO@wrigleys.postgresql.org
*/
if (strcmp(buf, "\\.\n") == 0 ||
strcmp(buf, "\\.\r\n") == 0)
{
copydone = true;
break;
}
firstload = false;
}
if (PQputCopyData(conn, buf, linelen) <= 0)
{
OK = false;
copydone = true;
break;
}
else
{
bytesCopied += linelen;
}
}
if (copystream == pset.cur_cmd_source)
pset.lineno++;
/* if size limit is set, copy at most that many bytes */
if (copySizeLimit > 0 && bytesCopied >= copySizeLimit)
{
break;
}
}
}
/* Check for read error */
if (ferror(copystream))
OK = false;
/*
* Terminate data transfer. We can't send an error message if we're using
* protocol version 2.
*/
if (PQputCopyEnd(conn,
(OK || PQprotocolVersion(conn) < 3) ? NULL :
_("aborted because of read failure")) <= 0)
OK = false;
copyin_cleanup:
/*
* Check command status and return to normal libpq state.
*
* We do not want to return with the status still PGRES_COPY_IN: our
* caller would be unable to distinguish that situation from reaching the
* next COPY in a command string that happened to contain two consecutive
* COPY FROM STDIN commands. We keep trying PQputCopyEnd() in the hope
* it'll work eventually. (What's actually likely to happen is that in
* attempting to flush the data, libpq will eventually realize that the
* connection is lost. But that's fine; it will get us out of COPY_IN
* state, which is what we need.)
*/
while (*res = PQgetResult(conn), PQresultStatus(*res) == PGRES_COPY_IN)
{
OK = false;
PQclear(*res);
/* We can't send an error message if we're using protocol version 2 */
PQputCopyEnd(conn,
(PQprotocolVersion(conn) < 3) ? NULL :
_("trying to exit copy mode"));
}
if (PQresultStatus(*res) != PGRES_COMMAND_OK)
{
psql_error("%s", PQerrorMessage(conn));
OK = false;
}
return OK;
}
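/*
 * Illustration only (not part of the original source): stripped of
 * prompting, cancellation, and line splitting, the protocol steps that
 * handleCopyIn performs reduce to this sequence for a single in-memory
 * buffer. Real callers should go through handleCopyIn.
 */
static bool
ExampleCopyInBuffer(PGconn *conn, const char *data, int length)
{
	PGresult *res = NULL;
	bool OK = (PQputCopyData(conn, data, length) > 0 &&
			   PQputCopyEnd(conn, NULL) > 0);

	/* drain results so the connection leaves COPY_IN state */
	while ((res = PQgetResult(conn)) != NULL)
	{
		if (PQresultStatus(res) != PGRES_COMMAND_OK)
			OK = false;
		PQclear(res);
	}

	return OK;
}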

33
src/bin/csql/copy.h Normal file
View File

@ -0,0 +1,33 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/copy.h
*/
#ifndef COPY_H
#define COPY_H
#include "libpq-fe.h"
#include "copy_options.h"
#include "pqexpbuffer.h"
/* handler for \copy */
extern bool do_copy(const char *args);
/* lower level processors for copy in/out streams */
extern bool handleCopyOut(PGconn *conn, FILE *copystream,
PGresult **res);
extern bool handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary,
PGresult **res, uint64 copySizeLimit);
/* Function declarations shared between copy and stage commands */
bool HandleCopyData(PGconn *connection, ExecStatusType copyStatus,
bool copyIsBinary, FILE *copyStream, uint64 copySizeLimit);
FILE * OpenCopyStream(const copy_options *options);
bool CloseCopyStream(const copy_options *options, FILE *copyStream);
#endif

312
src/bin/csql/copy_options.c Normal file

@ -0,0 +1,312 @@
/*
* csql - the CitusDB interactive terminal
* copy_options.c
* Routines for parsing copy and stage meta commands.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*/
#include "postgres_fe.h"
#include "copy_options.h"
#include "common.h"
#include "settings.h"
#include "stringutils.h"
/* Concatenates "more" onto "var", and frees the original value of *var. */
static void
xstrcat(char **var, const char *more)
{
char *newvar;
newvar = psprintf("%s%s", *var, more);
free(*var);
*var = newvar;
}
/*
* parse_slash_copy parses copy options from the given meta-command line. The
* function then returns a dynamically allocated structure with the options, or
* NULL on parsing error.
*/
copy_options *
parse_slash_copy(const char *args)
{
struct copy_options *result;
char *token;
const char *whitespace = " \t\n\r";
char nonstd_backslash = standard_strings() ? 0 : '\\';
if (!args)
{
psql_error("\\copy: arguments required\n");
return NULL;
}
result = pg_malloc0(sizeof(struct copy_options));
result->before_tofrom = pg_strdup(""); /* initialize for appending */
token = strtokx(args, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
/* The following can be removed when we drop 7.3 syntax support */
if (pg_strcasecmp(token, "binary") == 0)
{
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
}
/* Handle COPY (SELECT) case */
if (token[0] == '(')
{
int parens = 1;
while (parens > 0)
{
xstrcat(&result->before_tofrom, " ");
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, "()", "\"'",
nonstd_backslash, true, false, pset.encoding);
if (!token)
goto error;
if (token[0] == '(')
parens++;
else if (token[0] == ')')
parens--;
}
}
xstrcat(&result->before_tofrom, " ");
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
/*
* strtokx() will not have returned a multi-character token starting with
* '.', so we don't need strcmp() here. Likewise for '(', etc, below.
*/
if (token[0] == '.')
{
/* handle schema . table */
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
}
if (token[0] == '(')
{
/* handle parenthesized column list */
for (;;)
{
xstrcat(&result->before_tofrom, " ");
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, "()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
if (token[0] == ')')
break;
}
xstrcat(&result->before_tofrom, " ");
xstrcat(&result->before_tofrom, token);
token = strtokx(NULL, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
if (!token)
goto error;
}
if (pg_strcasecmp(token, "from") == 0)
result->from = true;
else if (pg_strcasecmp(token, "to") == 0)
result->from = false;
else
goto error;
/* { 'filename' | PROGRAM 'command' | STDIN | STDOUT | PSTDIN | PSTDOUT } */
token = strtokx(NULL, whitespace, ";", "'",
0, false, false, pset.encoding);
if (!token)
goto error;
if (pg_strcasecmp(token, "program") == 0)
{
int toklen;
token = strtokx(NULL, whitespace, ";", "'",
0, false, false, pset.encoding);
if (!token)
goto error;
/*
* The shell command must be quoted. This isn't fool-proof, but
* catches most quoting errors.
*/
toklen = strlen(token);
if (token[0] != '\'' || toklen < 2 || token[toklen - 1] != '\'')
goto error;
strip_quotes(token, '\'', 0, pset.encoding);
result->program = true;
result->file = pg_strdup(token);
}
else if (pg_strcasecmp(token, "stdin") == 0 ||
pg_strcasecmp(token, "stdout") == 0)
{
result->file = NULL;
}
else if (pg_strcasecmp(token, "pstdin") == 0 ||
pg_strcasecmp(token, "pstdout") == 0)
{
result->psql_inout = true;
result->file = NULL;
}
else
{
/* filename can be optionally quoted */
strip_quotes(token, '\'', 0, pset.encoding);
result->file = pg_strdup(token);
expand_tilde(&result->file);
}
/* Collect the rest of the line (COPY options) */
token = strtokx(NULL, "", NULL, NULL,
0, false, false, pset.encoding);
if (token)
result->after_tofrom = pg_strdup(token);
/* set data staging options to null */
result->tableName = NULL;
result->columnList = NULL;
return result;
error:
if (token)
psql_error("\\copy: parse error at \"%s\"\n", token);
else
psql_error("\\copy: parse error at end of line\n");
free_copy_options(result);
return NULL;
}
/* Frees copy options. */
void
free_copy_options(copy_options * ptr)
{
if (!ptr)
return;
free(ptr->before_tofrom);
free(ptr->after_tofrom);
free(ptr->file);
free(ptr->tableName);
free(ptr->columnList);
free(ptr);
}
/*
* ParseStageOptions takes the given copy options, parses the additional options
* needed for the \stage command, and sets them in the copy options structure.
* The additional parsed options are the table name and the column list.
*/
copy_options *
ParseStageOptions(copy_options *copyOptions)
{
copy_options *stageOptions = NULL;
const char *whitespace = " \t\n\r";
char *tableName = NULL;
char *columnList = NULL;
char *token = NULL;
const char *beforeToFrom = copyOptions->before_tofrom;
Assert(beforeToFrom != NULL);
token = strtokx(beforeToFrom, whitespace, ".,()", "\"",
0, false, false, pset.encoding);
/*
* We should have errored out earlier if the token were null. Similarly, we
* should have errored out on the "\stage (select) to" case.
*/
Assert(token != NULL);
Assert(token[0] != '(');
/* we do not support PostgreSQL's 7.3 syntax */
if (pg_strcasecmp(token, "binary") == 0)
{
psql_error("\\stage: binary keyword before to/from is not supported\n");
Assert(false);
}
/* init table name and append either the table name or schema name */
tableName = pg_strdup("");
xstrcat(&tableName, token);
/* check for the schema.table use case */
token = strtokx(NULL, whitespace, ".,()", "\"", 0, false, false, pset.encoding);
if (token != NULL && token[0] == '.')
{
/* append the dot token */
xstrcat(&tableName, token);
token = strtokx(NULL, whitespace, ".,()", "\"", 0, false, false, pset.encoding);
Assert(token != NULL);
/* append the table name token */
xstrcat(&tableName, token);
token = strtokx(NULL, whitespace, ".,()", "\"", 0, false, false, pset.encoding);
}
/* check for the column list use case */
if (token != NULL && token[0] == '(')
{
/* init column list, and add columns */
columnList = pg_strdup("");
for (;;)
{
xstrcat(&columnList, " ");
xstrcat(&columnList, token);
token = strtokx(NULL, whitespace, "()", "\"", 0, false, false, pset.encoding);
Assert(token != NULL);
if (token[0] == ')')
{
break;
}
}
xstrcat(&columnList, " ");
xstrcat(&columnList, token);
}
/* finally set additional stage options */
stageOptions = copyOptions;
stageOptions->tableName = tableName;
stageOptions->columnList = columnList;
return stageOptions;
}
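/*
 * Illustration only (not part of the original source): a usage sketch for
 * the \stage parsing path. The command text and the printf reporting are
 * made up; a caller parses the copy-style portion first, then layers the
 * stage-specific fields on top.
 */
static void
ExampleParseStage(void)
{
	copy_options *options =
		parse_slash_copy("customers (id, name) from 'data.csv'");

	if (options != NULL)
	{
		options = ParseStageOptions(options);
		printf("staging into %s, columns:%s\n", options->tableName,
			   options->columnList ? options->columnList : " all");
		free_copy_options(options);
	}
}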

60
src/bin/csql/copy_options.h Normal file

@ -0,0 +1,60 @@
/*
* csql - the CitusDB interactive terminal
* copy_options.h
* Shared declarations for parsing copy and stage meta-commands. The stage
* meta-command borrows from copy's syntax, but does not yet support
* outputting table data to a file. Further, the stage command reuses copy's
* declarations to maintain compatibility with the copy command.
*
* Copyright (c) 2012, Citus Data, Inc.
*
* $Id$
*/
#ifndef COPY_OPTIONS_H
#define COPY_OPTIONS_H
#include "libpq-fe.h"
/*
* The documented syntax is:
* \copy tablename [(columnlist)] from|to filename [options]
* \copy ( select stmt ) to filename [options]
*
* where 'filename' can be one of the following:
* '<file path>' | PROGRAM '<command>' | stdin | stdout | pstdin | pstdout
*
* An undocumented fact is that you can still write BINARY before the
* tablename; this is a hangover from the pre-7.3 syntax. The options
* syntax varies across backend versions, but we avoid all that mess
* by just transmitting the stuff after the filename literally.
*
* table name can be double-quoted and can have a schema part.
* column names can be double-quoted.
* filename can be single-quoted like SQL literals.
* command must be single-quoted like SQL literals.
*
* returns a malloc'ed structure with the options, or NULL on parsing error
*/
typedef struct copy_options
{
char *before_tofrom; /* COPY string before TO/FROM */
char *after_tofrom; /* COPY string after TO/FROM filename */
char *file; /* NULL = stdin/stdout */
bool program; /* is 'file' a program to popen? */
bool psql_inout; /* true = use psql stdin/stdout */
bool from; /* true = FROM, false = TO */
char *tableName; /* table name to stage data to */
char *columnList; /* optional column list used in staging */
} copy_options;
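/*
 * Example only (added for illustration): for the meta-command
 *     \copy customers (id, name) from '/tmp/data.csv' with csv
 * parse_slash_copy() fills this structure roughly as follows (token
 * spacing comes from the parser's re-concatenation):
 *     before_tofrom = " customers ( id , name )"
 *     file          = "/tmp/data.csv"
 *     after_tofrom  = "with csv"
 *     from          = true, program = false, psql_inout = false
 * tableName and columnList stay NULL until ParseStageOptions() runs.
 */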
/* Function declarations for parsing and freeing copy options */
copy_options * parse_slash_copy(const char *args);
void free_copy_options(copy_options * ptr);
copy_options * ParseStageOptions(copy_options *copyOptions);
#endif /* COPY_OPTIONS_H */

214
src/bin/csql/create_help.pl Normal file

@ -0,0 +1,214 @@
#! /usr/bin/perl -w
#################################################################
# create_help.pl -- converts SGML docs to internal psql help
#
# Copyright (c) 2000-2015, PostgreSQL Global Development Group
#
# src/bin/psql/create_help.pl
#################################################################
#
# This script automatically generates the help on SQL in psql from
# the SGML docs. So far the format of the docs was consistent
# enough that this worked, but this here is by no means an SGML
# parser.
#
# Call: perl create_help.pl docdir sql_help
# The name of the header file doesn't matter to this script, but it
# sure does matter to the rest of the source.
#
use strict;
my $docdir = $ARGV[0] or die "$0: missing required argument: docdir\n";
my $hfile = $ARGV[1] . '.h'
or die "$0: missing required argument: output file\n";
my $cfile = $ARGV[1] . '.c';
my $hfilebasename;
if ($hfile =~ m!.*/([^/]+)$!)
{
$hfilebasename = $1;
}
else
{
$hfilebasename = $hfile;
}
my $define = $hfilebasename;
$define =~ tr/a-z/A-Z/;
$define =~ s/\W/_/g;
opendir(DIR, $docdir)
or die "$0: could not open documentation source dir '$docdir': $!\n";
open(HFILE, ">$hfile")
or die "$0: could not open output file '$hfile': $!\n";
open(CFILE, ">$cfile")
or die "$0: could not open output file '$cfile': $!\n";
print HFILE "/*
* *** Do not change this file by hand. It is automatically
* *** generated from the DocBook documentation.
*
* generated by
* $^X $0 @ARGV
*
*/
#ifndef $define
#define $define
#define N_(x) (x) /* gettext noop */
#include \"postgres_fe.h\"
#include \"pqexpbuffer.h\"
struct _helpStruct
{
const char *cmd; /* the command name */
const char *help; /* the help associated with it */
void (*syntaxfunc)(PQExpBuffer); /* function that prints the syntax associated with it */
int nl_count; /* number of newlines in syntax (for pager) */
};
";
print CFILE "/*
* *** Do not change this file by hand. It is automatically
* *** generated from the DocBook documentation.
*
* generated by
* $^X $0 @ARGV
*
*/
#include \"$hfile\"
";
my $maxlen = 0;
my %entries;
foreach my $file (sort readdir DIR)
{
my (@cmdnames, $cmddesc, $cmdsynopsis);
$file =~ /\.sgml$/ or next;
open(FILE, "$docdir/$file") or next;
my $filecontent = join('', <FILE>);
close FILE;
# Ignore files that are not for SQL language statements
$filecontent =~
m!<refmiscinfo>\s*SQL - Language Statements\s*</refmiscinfo>!i
or next;
# Collect multiple refnames
LOOP:
{
$filecontent =~ m!\G.*?<refname>\s*([a-z ]+?)\s*</refname>!cgis
and push @cmdnames, $1
and redo LOOP;
}
$filecontent =~ m!<refpurpose>\s*(.+?)\s*</refpurpose>!is
and $cmddesc = $1;
$filecontent =~ m!<synopsis>\s*(.+?)\s*</synopsis>!is
and $cmdsynopsis = $1;
if (@cmdnames && $cmddesc && $cmdsynopsis)
{
s/\"/\\"/g foreach @cmdnames;
$cmddesc =~ s/<[^>]+>//g;
$cmddesc =~ s/\s+/ /g;
$cmddesc =~ s/\"/\\"/g;
my @params = ();
my $nl_count = () = $cmdsynopsis =~ /\n/g;
$cmdsynopsis =~ m!</>!
and die "$0:$file: null end tag not supported in synopsis\n";
$cmdsynopsis =~ s/%/%%/g;
while ($cmdsynopsis =~ m!<(\w+)[^>]*>(.+?)</\1[^>]*>!)
{
my $match = $2;
$match =~ s/<[^>]+>//g;
$match =~ s/%%/%/g;
push @params, $match;
$cmdsynopsis =~ s!<(\w+)[^>]*>.+?</\1[^>]*>!%s!;
}
$cmdsynopsis =~ s/\r?\n/\\n/g;
$cmdsynopsis =~ s/\"/\\"/g;
foreach my $cmdname (@cmdnames)
{
$entries{$cmdname} = {
cmddesc => $cmddesc,
cmdsynopsis => $cmdsynopsis,
params => \@params,
nl_count => $nl_count };
$maxlen =
($maxlen >= length $cmdname) ? $maxlen : length $cmdname;
}
}
else
{
die "$0: parsing file '$file' failed (N='@cmdnames' D='$cmddesc')\n";
}
}
foreach (sort keys %entries)
{
my $prefix = "\t" x 5 . ' ';
my $id = $_;
$id =~ s/ /_/g;
my $synopsis = "\"$entries{$_}{cmdsynopsis}\"";
$synopsis =~ s/\\n/\\n"\n$prefix"/g;
my @args =
("buf", $synopsis, map("_(\"$_\")", @{ $entries{$_}{params} }));
print HFILE "extern void sql_help_$id(PQExpBuffer buf);\n";
print CFILE "void
sql_help_$id(PQExpBuffer buf)
{
\tappendPQExpBuffer(" . join(",\n$prefix", @args) . ");
}
";
}
print HFILE "
static const struct _helpStruct QL_HELP[] = {
";
foreach (sort keys %entries)
{
my $id = $_;
$id =~ s/ /_/g;
print HFILE " { \"$_\",
N_(\"$entries{$_}{cmddesc}\"),
sql_help_$id,
$entries{$_}{nl_count} },
";
}
print HFILE "
{ NULL, NULL, NULL } /* End of list marker */
};
#define QL_HELP_COUNT "
. scalar(keys %entries) . " /* number of help items */
#define QL_MAX_CMD_LEN $maxlen /* largest strlen(cmd) */
#endif /* $define */
";
close CFILE;
close HFILE;
closedir DIR;

4613
src/bin/csql/describe.c Normal file

File diff suppressed because it is too large

102
src/bin/csql/describe.h Normal file

@ -0,0 +1,102 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/describe.h
*/
#ifndef DESCRIBE_H
#define DESCRIBE_H
/* \da */
extern bool describeAggregates(const char *pattern, bool verbose, bool showSystem);
/* \db */
extern bool describeTablespaces(const char *pattern, bool verbose);
/* \df, \dfa, \dfn, \dft, \dfw, etc. */
extern bool describeFunctions(const char *functypes, const char *pattern, bool verbose, bool showSystem);
/* \dT */
extern bool describeTypes(const char *pattern, bool verbose, bool showSystem);
/* \do */
extern bool describeOperators(const char *pattern, bool verbose, bool showSystem);
/* \du, \dg */
extern bool describeRoles(const char *pattern, bool verbose);
/* \drds */
extern bool listDbRoleSettings(const char *pattern1, const char *pattern2);
/* \z (or \dp) */
extern bool permissionsList(const char *pattern);
/* \ddp */
extern bool listDefaultACLs(const char *pattern);
/* \dd */
extern bool objectDescription(const char *pattern, bool showSystem);
/* \d foo */
extern bool describeTableDetails(const char *pattern, bool verbose, bool showSystem);
/* \dF */
extern bool listTSConfigs(const char *pattern, bool verbose);
/* \dFp */
extern bool listTSParsers(const char *pattern, bool verbose);
/* \dFd */
extern bool listTSDictionaries(const char *pattern, bool verbose);
/* \dFt */
extern bool listTSTemplates(const char *pattern, bool verbose);
/* \l */
extern bool listAllDbs(const char *pattern, bool verbose);
/* \dt, \di, \ds, \dS, etc. */
extern bool listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSystem);
/* \dD */
extern bool listDomains(const char *pattern, bool verbose, bool showSystem);
/* \dc */
extern bool listConversions(const char *pattern, bool verbose, bool showSystem);
/* \dC */
extern bool listCasts(const char *pattern, bool verbose);
/* \dO */
extern bool listCollations(const char *pattern, bool verbose, bool showSystem);
/* \dn */
extern bool listSchemas(const char *pattern, bool verbose, bool showSystem);
/* \dew */
extern bool listForeignDataWrappers(const char *pattern, bool verbose);
/* \des */
extern bool listForeignServers(const char *pattern, bool verbose);
/* \deu */
extern bool listUserMappings(const char *pattern, bool verbose);
/* \det */
extern bool listForeignTables(const char *pattern, bool verbose);
/* \dL */
extern bool listLanguages(const char *pattern, bool verbose, bool showSystem);
/* \dx */
extern bool listExtensions(const char *pattern);
/* \dx+ */
extern bool listExtensionContents(const char *pattern);
/* \dy */
extern bool listEventTriggers(const char *pattern, bool verbose);
#endif /* DESCRIBE_H */

1243
src/bin/csql/dumputils.c Normal file

File diff suppressed because it is too large

572
src/bin/csql/help.c Normal file

@ -0,0 +1,572 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/help.c
*/
#include "postgres_fe.h"
#ifndef WIN32
#include <sys/types.h> /* (ditto) */
#include <unistd.h> /* for geteuid() */
#else
#include <win32.h>
#endif
#ifndef WIN32
#include <sys/ioctl.h> /* for ioctl() */
#endif
#ifdef HAVE_TERMIOS_H
#include <termios.h>
#endif
#include "common.h"
#include "common/username.h"
#include "help.h"
#include "input.h"
#include "settings.h"
#include "sql_help.h"
/*
* PLEASE:
* If you change something in this file, also make the same changes
* in the DocBook documentation, file ref/psql-ref.sgml. If you don't
* know how to do it, please find someone who can help you.
*/
/*
* usage
*
* print out command line arguments
*/
#define ON(var) (var ? _("on") : _("off"))
void
usage(unsigned short int pager)
{
const char *env;
const char *user;
char *errstr;
FILE *output;
/* Find default user, in case we need it. */
user = getenv("PGUSER");
if (!user)
{
user = get_user_name(&errstr);
if (!user)
{
psql_error("%s\n", errstr);
exit(EXIT_FAILURE);
}
}
output = PageOutput(59, pager ? &(pset.popt.topt) : NULL);
printf(_("csql is the CitusDB interactive terminal.\n\n"));
fprintf(output, _("Usage:\n"));
printf(_(" csql [OPTION]... [DBNAME [USERNAME]]\n\n"));
fprintf(output, _("General options:\n"));
/* Display default database */
env = getenv("PGDATABASE");
if (!env)
env = user;
fprintf(output, _(" -c, --command=COMMAND run only single command (SQL or internal) and exit\n"));
fprintf(output, _(" -d, --dbname=DBNAME database name to connect to (default: \"%s\")\n"), env);
fprintf(output, _(" -f, --file=FILENAME execute commands from file, then exit\n"));
fprintf(output, _(" -l, --list list available databases, then exit\n"));
fprintf(output, _(" -v, --set=, --variable=NAME=VALUE\n"
" set psql variable NAME to VALUE\n"
" (e.g., -v ON_ERROR_STOP=1)\n"));
fprintf(output, _(" -V, --version output version information, then exit\n"));
fprintf(output, _(" -X, --no-psqlrc do not read startup file (~/.psqlrc)\n"));
fprintf(output, _(" -1 (\"one\"), --single-transaction\n"
" execute as a single transaction (if non-interactive)\n"));
fprintf(output, _(" -?, --help[=options] show this help, then exit\n"));
fprintf(output, _(" --help=commands list backslash commands, then exit\n"));
fprintf(output, _(" --help=variables list special variables, then exit\n"));
fprintf(output, _("\nInput and output options:\n"));
fprintf(output, _(" -a, --echo-all echo all input from script\n"));
fprintf(output, _(" -b, --echo-errors echo failed commands\n"));
fprintf(output, _(" -e, --echo-queries echo commands sent to server\n"));
fprintf(output, _(" -E, --echo-hidden display queries that internal commands generate\n"));
fprintf(output, _(" -L, --log-file=FILENAME send session log to file\n"));
fprintf(output, _(" -n, --no-readline disable enhanced command line editing (readline)\n"));
fprintf(output, _(" -o, --output=FILENAME send query results to file (or |pipe)\n"));
fprintf(output, _(" -q, --quiet run quietly (no messages, only query output)\n"));
fprintf(output, _(" -s, --single-step single-step mode (confirm each query)\n"));
fprintf(output, _(" -S, --single-line single-line mode (end of line terminates SQL command)\n"));
fprintf(output, _("\nOutput format options:\n"));
fprintf(output, _(" -A, --no-align unaligned table output mode\n"));
fprintf(output, _(" -F, --field-separator=STRING\n"
" field separator for unaligned output (default: \"%s\")\n"),
DEFAULT_FIELD_SEP);
fprintf(output, _(" -H, --html HTML table output mode\n"));
fprintf(output, _(" -P, --pset=VAR[=ARG] set printing option VAR to ARG (see \\pset command)\n"));
fprintf(output, _(" -R, --record-separator=STRING\n"
" record separator for unaligned output (default: newline)\n"));
fprintf(output, _(" -t, --tuples-only print rows only\n"));
fprintf(output, _(" -T, --table-attr=TEXT set HTML table tag attributes (e.g., width, border)\n"));
fprintf(output, _(" -x, --expanded turn on expanded table output\n"));
fprintf(output, _(" -z, --field-separator-zero\n"
" set field separator for unaligned output to zero byte\n"));
fprintf(output, _(" -0, --record-separator-zero\n"
" set record separator for unaligned output to zero byte\n"));
fprintf(output, _("\nConnection options:\n"));
/* Display default host */
env = getenv("PGHOST");
fprintf(output, _(" -h, --host=HOSTNAME database server host or socket directory (default: \"%s\")\n"),
env ? env : _("local socket"));
/* Display default port */
env = getenv("PGPORT");
fprintf(output, _(" -p, --port=PORT database server port (default: \"%s\")\n"),
env ? env : DEF_PGPORT_STR);
/* Display default user */
env = getenv("PGUSER");
if (!env)
env = user;
fprintf(output, _(" -U, --username=USERNAME database user name (default: \"%s\")\n"), env);
fprintf(output, _(" -w, --no-password never prompt for password\n"));
fprintf(output, _(" -W, --password force password prompt (should happen automatically)\n"));
fprintf(output, _("\nFor more information, type \"\\?\" (for internal commands) or \"\\help\" (for SQL\n"
"commands) from within psql, or consult the psql section in the PostgreSQL\n"
"documentation.\n\n"));
fprintf(output, _("Report bugs to <pgsql-bugs@postgresql.org>.\n"));
ClosePager(output);
}
/*
* slashUsage
*
* print out help for the backslash commands
*/
void
slashUsage(unsigned short int pager)
{
FILE *output;
char *currdb;
currdb = PQdb(pset.db);
output = PageOutput(103, pager ? &(pset.popt.topt) : NULL);
/* if you add/remove a line here, change the row count above */
fprintf(output, _("General\n"));
fprintf(output, _(" \\copyright show PostgreSQL usage and distribution terms\n"));
fprintf(output, _(" \\g [FILE] or ; execute query (and send results to file or |pipe)\n"));
fprintf(output, _(" \\gset [PREFIX] execute query and store results in psql variables\n"));
fprintf(output, _(" \\q quit psql\n"));
fprintf(output, _(" \\watch [SEC] execute query every SEC seconds\n"));
fprintf(output, "\n");
fprintf(output, _("Help\n"));
fprintf(output, _(" \\? [commands] show help on backslash commands\n"));
fprintf(output, _(" \\? options show help on psql command-line options\n"));
fprintf(output, _(" \\? variables show help on special variables\n"));
fprintf(output, _(" \\h [NAME] help on syntax of SQL commands, * for all commands\n"));
fprintf(output, "\n");
fprintf(output, _("Query Buffer\n"));
fprintf(output, _(" \\e [FILE] [LINE] edit the query buffer (or file) with external editor\n"));
fprintf(output, _(" \\ef [FUNCNAME [LINE]] edit function definition with external editor\n"));
fprintf(output, _(" \\p show the contents of the query buffer\n"));
fprintf(output, _(" \\r reset (clear) the query buffer\n"));
#ifdef USE_READLINE
fprintf(output, _(" \\s [FILE] display history or save it to file\n"));
#endif
fprintf(output, _(" \\w FILE write query buffer to file\n"));
fprintf(output, "\n");
fprintf(output, _("Input/Output\n"));
fprintf(output, _(" \\copy ... perform SQL COPY with data stream to the client host\n"));
fprintf(output, _(" \\echo [STRING] write string to standard output\n"));
fprintf(output, _(" \\i FILE execute commands from file\n"));
fprintf(output, _(" \\ir FILE as \\i, but relative to location of current script\n"));
fprintf(output, _(" \\o [FILE] send all query results to file or |pipe\n"));
fprintf(output, _(" \\qecho [STRING] write string to query output stream (see \\o)\n"));
fprintf(output, "\n");
fprintf(output, _("Informational\n"));
fprintf(output, _(" (options: S = show system objects, + = additional detail)\n"));
fprintf(output, _(" \\d[S+] list tables, views, and sequences\n"));
fprintf(output, _(" \\d[S+] NAME describe table, view, sequence, or index\n"));
fprintf(output, _(" \\da[S] [PATTERN] list aggregates\n"));
fprintf(output, _(" \\db[+] [PATTERN] list tablespaces\n"));
fprintf(output, _(" \\dc[S+] [PATTERN] list conversions\n"));
fprintf(output, _(" \\dC[+] [PATTERN] list casts\n"));
fprintf(output, _(" \\dd[S] [PATTERN] show object descriptions not displayed elsewhere\n"));
fprintf(output, _(" \\ddp [PATTERN] list default privileges\n"));
fprintf(output, _(" \\dD[S+] [PATTERN] list domains\n"));
fprintf(output, _(" \\det[+] [PATTERN] list foreign tables\n"));
fprintf(output, _(" \\des[+] [PATTERN] list foreign servers\n"));
fprintf(output, _(" \\deu[+] [PATTERN] list user mappings\n"));
fprintf(output, _(" \\dew[+] [PATTERN] list foreign-data wrappers\n"));
fprintf(output, _(" \\df[antw][S+] [PATRN] list [only agg/normal/trigger/window] functions\n"));
fprintf(output, _(" \\dF[+] [PATTERN] list text search configurations\n"));
fprintf(output, _(" \\dFd[+] [PATTERN] list text search dictionaries\n"));
fprintf(output, _(" \\dFp[+] [PATTERN] list text search parsers\n"));
fprintf(output, _(" \\dFt[+] [PATTERN] list text search templates\n"));
fprintf(output, _(" \\dg[+] [PATTERN] list roles\n"));
fprintf(output, _(" \\di[S+] [PATTERN] list indexes\n"));
fprintf(output, _(" \\dl list large objects, same as \\lo_list\n"));
fprintf(output, _(" \\dL[S+] [PATTERN] list procedural languages\n"));
fprintf(output, _(" \\dm[S+] [PATTERN] list materialized views\n"));
fprintf(output, _(" \\dn[S+] [PATTERN] list schemas\n"));
fprintf(output, _(" \\do[S] [PATTERN] list operators\n"));
fprintf(output, _(" \\dO[S+] [PATTERN] list collations\n"));
fprintf(output, _(" \\dp [PATTERN] list table, view, and sequence access privileges\n"));
fprintf(output, _(" \\drds [PATRN1 [PATRN2]] list per-database role settings\n"));
fprintf(output, _(" \\ds[S+] [PATTERN] list sequences\n"));
fprintf(output, _(" \\dt[S+] [PATTERN] list tables\n"));
fprintf(output, _(" \\dT[S+] [PATTERN] list data types\n"));
fprintf(output, _(" \\du[+] [PATTERN] list roles\n"));
fprintf(output, _(" \\dv[S+] [PATTERN] list views\n"));
fprintf(output, _(" \\dE[S+] [PATTERN] list foreign tables\n"));
fprintf(output, _(" \\dx[+] [PATTERN] list extensions\n"));
fprintf(output, _(" \\dy [PATTERN] list event triggers\n"));
fprintf(output, _(" \\l[+] [PATTERN] list databases\n"));
fprintf(output, _(" \\sf[+] FUNCNAME show a function's definition\n"));
fprintf(output, _(" \\z [PATTERN] same as \\dp\n"));
fprintf(output, "\n");
fprintf(output, _("Formatting\n"));
fprintf(output, _(" \\a toggle between unaligned and aligned output mode\n"));
fprintf(output, _(" \\C [STRING] set table title, or unset if none\n"));
fprintf(output, _(" \\f [STRING] show or set field separator for unaligned query output\n"));
fprintf(output, _(" \\H toggle HTML output mode (currently %s)\n"),
ON(pset.popt.topt.format == PRINT_HTML));
fprintf(output, _(" \\pset [NAME [VALUE]] set table output option\n"
" (NAME := {format|border|expanded|fieldsep|fieldsep_zero|footer|null|\n"
" numericlocale|recordsep|recordsep_zero|tuples_only|title|tableattr|pager|\n"
" unicode_border_linestyle|unicode_column_linestyle|unicode_header_linestyle})\n"));
fprintf(output, _(" \\t [on|off] show only rows (currently %s)\n"),
ON(pset.popt.topt.tuples_only));
fprintf(output, _(" \\T [STRING] set HTML <table> tag attributes, or unset if none\n"));
fprintf(output, _(" \\x [on|off|auto] toggle expanded output (currently %s)\n"),
pset.popt.topt.expanded == 2 ? "auto" : ON(pset.popt.topt.expanded));
fprintf(output, "\n");
fprintf(output, _("Connection\n"));
if (currdb)
fprintf(output, _(" \\c[onnect] {[DBNAME|- USER|- HOST|- PORT|-] | conninfo}\n"
" connect to new database (currently \"%s\")\n"),
currdb);
else
fprintf(output, _(" \\c[onnect] {[DBNAME|- USER|- HOST|- PORT|-] | conninfo}\n"
" connect to new database (currently no connection)\n"));
fprintf(output, _(" \\encoding [ENCODING] show or set client encoding\n"));
fprintf(output, _(" \\password [USERNAME] securely change the password for a user\n"));
fprintf(output, _(" \\conninfo display information about current connection\n"));
fprintf(output, "\n");
fprintf(output, _("Operating System\n"));
fprintf(output, _(" \\cd [DIR] change the current working directory\n"));
fprintf(output, _(" \\setenv NAME [VALUE] set or unset environment variable\n"));
fprintf(output, _(" \\timing [on|off] toggle timing of commands (currently %s)\n"),
ON(pset.timing));
fprintf(output, _(" \\! [COMMAND] execute command in shell or start interactive shell\n"));
fprintf(output, "\n");
fprintf(output, _("Variables\n"));
fprintf(output, _(" \\prompt [TEXT] NAME prompt user to set internal variable\n"));
fprintf(output, _(" \\set [NAME [VALUE]] set internal variable, or list all if no parameters\n"));
fprintf(output, _(" \\unset NAME unset (delete) internal variable\n"));
fprintf(output, "\n");
fprintf(output, _("Large Objects\n"));
fprintf(output, _(" \\lo_export LOBOID FILE\n"
" \\lo_import FILE [COMMENT]\n"
" \\lo_list\n"
" \\lo_unlink LOBOID large object operations\n"));
ClosePager(output);
}
/*
* helpVariables
*
* show list of available variables (options) from command line
*/
void
helpVariables(unsigned short int pager)
{
FILE *output;
output = PageOutput(85, pager ? &(pset.popt.topt) : NULL);
fprintf(output, _("List of specially treated variables\n\n"));
fprintf(output, _("psql variables:\n"));
fprintf(output, _("Usage:\n"));
fprintf(output, _(" psql --set=NAME=VALUE\n or \\set NAME VALUE inside psql\n\n"));
fprintf(output, _(" AUTOCOMMIT if set, successful SQL commands are automatically committed\n"));
fprintf(output, _(" COMP_KEYWORD_CASE determines the case used to complete SQL key words\n"
" [lower, upper, preserve-lower, preserve-upper]\n"));
fprintf(output, _(" DBNAME the currently connected database name\n"));
fprintf(output, _(" ECHO controls what input is written to standard output\n"
" [all, errors, none, queries]\n"));
fprintf(output, _(" ECHO_HIDDEN if set, display internal queries executed by backslash commands;\n"
" if set to \"noexec\", just show without execution\n"));
fprintf(output, _(" ENCODING current client character set encoding\n"));
fprintf(output, _(" FETCH_COUNT the number of result rows to fetch and display at a time\n"
" (default: 0=unlimited)\n"));
fprintf(output, _(" HISTCONTROL controls command history [ignorespace, ignoredups, ignoreboth]\n"));
fprintf(output, _(" HISTFILE file name used to store the command history\n"));
fprintf(output, _(" HISTSIZE the number of commands to store in the command history\n"));
fprintf(output, _(" HOST the currently connected database server host\n"));
fprintf(output, _(" IGNOREEOF if unset, sending an EOF to interactive session terminates application\n"));
fprintf(output, _(" LASTOID value of the last affected OID\n"));
fprintf(output, _(" ON_ERROR_ROLLBACK if set, an error doesn't stop a transaction (uses implicit savepoints)\n"));
fprintf(output, _(" ON_ERROR_STOP stop batch execution after error\n"));
fprintf(output, _(" PORT server port of the current connection\n"));
fprintf(output, _(" PROMPT1 specifies the standard psql prompt\n"));
fprintf(output, _(" PROMPT2 specifies the prompt used when a statement continues from a previous line\n"));
fprintf(output, _(" PROMPT3 specifies the prompt used during COPY ... FROM STDIN\n"));
fprintf(output, _(" QUIET run quietly (same as -q option)\n"));
fprintf(output, _(" SINGLELINE end of line terminates SQL command mode (same as -S option)\n"));
fprintf(output, _(" SINGLESTEP single-step mode (same as -s option)\n"));
fprintf(output, _(" USER the currently connected database user\n"));
fprintf(output, _(" VERBOSITY controls verbosity of error reports [default, verbose, terse]\n"));
fprintf(output, _("\nDisplay settings:\n"));
fprintf(output, _("Usage:\n"));
fprintf(output, _(" psql --pset=NAME[=VALUE]\n or \\pset NAME [VALUE] inside psql\n\n"));
fprintf(output, _(" border border style (number)\n"));
fprintf(output, _(" columns target width for the wrapped format\n"));
fprintf(output, _(" expanded (or x) expanded output [on, off, auto]\n"));
fprintf(output, _(" fieldsep field separator for unaligned output (default \"%s\")\n"), DEFAULT_FIELD_SEP);
fprintf(output, _(" fieldsep_zero set field separator for unaligned output to zero byte\n"));
fprintf(output, _(" format set output format [unaligned, aligned, wrapped, html, asciidoc, ...]\n"));
fprintf(output, _(" footer enable or disable display of the table footer [on, off]\n"));
fprintf(output, _(" linestyle set the border line drawing style [ascii, old-ascii, unicode]\n"));
fprintf(output, _(" null set the string to be printed in place of a null value\n"));
fprintf(output, _(" numericlocale enable or disable display of a locale-specific character to separate\n"
" groups of digits [on, off]\n"));
fprintf(output, _(" pager control when an external pager is used [yes, no, always]\n"));
fprintf(output, _(" recordsep record (line) separator for unaligned output\n"));
fprintf(output, _(" recordsep_zero set record separator for unaligned output to zero byte\n"));
fprintf(output, _(" tableattr (or T) specify attributes for table tag in html format or proportional\n"
" column widths for left-aligned data types in latex-longtable format\n"));
fprintf(output, _(" title set the table title for any subsequently printed tables\n"));
fprintf(output, _(" tuples_only if set, only actual table data is shown\n"));
fprintf(output, _(" unicode_border_linestyle\n"
" unicode_column_linestyle\n"
" unicode_header_linestyle\n"
" set the style of Unicode line drawing [single, double]\n"));
fprintf(output, _("\nEnvironment variables:\n"));
fprintf(output, _("Usage:\n"));
#ifndef WIN32
fprintf(output, _(" NAME=VALUE [NAME=VALUE] psql ...\n or \\setenv NAME [VALUE] inside psql\n\n"));
#else
fprintf(output, _(" set NAME=VALUE\n psql ...\n or \\setenv NAME [VALUE] inside psql\n\n"));
#endif
fprintf(output, _(" COLUMNS number of columns for wrapped format\n"));
fprintf(output, _(" PAGER name of external pager program\n"));
fprintf(output, _(" PGAPPNAME same as the application_name connection parameter\n"));
fprintf(output, _(" PGDATABASE same as the dbname connection parameter\n"));
fprintf(output, _(" PGHOST same as the host connection parameter\n"));
fprintf(output, _(" PGPORT same as the port connection parameter\n"));
fprintf(output, _(" PGUSER same as the user connection parameter\n"));
fprintf(output, _(" PGPASSWORD connection password (not recommended)\n"));
fprintf(output, _(" PGPASSFILE password file name\n"));
fprintf(output, _(" PSQL_EDITOR, EDITOR, VISUAL\n"
" editor used by the \\e and \\ef commands\n"));
fprintf(output, _(" PSQL_EDITOR_LINENUMBER_ARG\n"
" how to specify a line number when invoking the editor\n"));
fprintf(output, _(" PSQL_HISTORY alternative location for the command history file\n"));
fprintf(output, _(" PSQLRC alternative location for the user's .psqlrc file\n"));
fprintf(output, _(" SHELL shell used by the \\! command\n"));
fprintf(output, _(" TMPDIR directory for temporary files\n"));
ClosePager(output);
}
/*
* helpSQL -- help with SQL commands
*
* Note: we assume caller removed any trailing spaces in "topic".
*/
void
helpSQL(const char *topic, unsigned short int pager)
{
#define VALUE_OR_NULL(a) ((a) ? (a) : "")
if (!topic || strlen(topic) == 0)
{
/* Print all the available command names */
int screen_width;
int ncolumns;
int nrows;
FILE *output;
int i;
int j;
#ifdef TIOCGWINSZ
struct winsize screen_size;
if (ioctl(fileno(stdout), TIOCGWINSZ, &screen_size) == -1)
screen_width = 80; /* ioctl failed, assume 80 */
else
screen_width = screen_size.ws_col;
#else
screen_width = 80; /* default assumption */
#endif
ncolumns = (screen_width - 3) / (QL_MAX_CMD_LEN + 1);
ncolumns = Max(ncolumns, 1);
nrows = (QL_HELP_COUNT + (ncolumns - 1)) / ncolumns;
output = PageOutput(nrows + 1, pager ? &(pset.popt.topt) : NULL);
fputs(_("Available help:\n"), output);
for (i = 0; i < nrows; i++)
{
fprintf(output, " ");
for (j = 0; j < ncolumns - 1; j++)
fprintf(output, "%-*s",
QL_MAX_CMD_LEN + 1,
VALUE_OR_NULL(QL_HELP[i + j * nrows].cmd));
if (i + j * nrows < QL_HELP_COUNT)
fprintf(output, "%s",
VALUE_OR_NULL(QL_HELP[i + j * nrows].cmd));
fputc('\n', output);
}
ClosePager(output);
}
else
{
int i,
j,
x = 0;
bool help_found = false;
FILE *output = NULL;
size_t len,
wordlen;
int nl_count = 0;
/*
* We first try exact match, then first + second words, then first
* word only.
*/
len = strlen(topic);
for (x = 1; x <= 3; x++)
{
if (x > 1) /* Nothing on first pass - try the opening
* word(s) */
{
wordlen = j = 1;
while (topic[j] != ' ' && j++ < len)
wordlen++;
if (x == 2)
{
j++;
while (topic[j] != ' ' && j++ <= len)
wordlen++;
}
if (wordlen >= len) /* Don't try again if the same word */
{
if (!output)
output = PageOutput(nl_count, pager ? &(pset.popt.topt) : NULL);
break;
}
len = wordlen;
}
/* Count newlines for pager */
for (i = 0; QL_HELP[i].cmd; i++)
{
if (pg_strncasecmp(topic, QL_HELP[i].cmd, len) == 0 ||
strcmp(topic, "*") == 0)
{
nl_count += 5 + QL_HELP[i].nl_count;
/* If we have an exact match, exit. Fixes \h SELECT */
if (pg_strcasecmp(topic, QL_HELP[i].cmd) == 0)
break;
}
}
if (!output)
output = PageOutput(nl_count, pager ? &(pset.popt.topt) : NULL);
for (i = 0; QL_HELP[i].cmd; i++)
{
if (pg_strncasecmp(topic, QL_HELP[i].cmd, len) == 0 ||
strcmp(topic, "*") == 0)
{
PQExpBufferData buffer;
initPQExpBuffer(&buffer);
QL_HELP[i].syntaxfunc(&buffer);
help_found = true;
fprintf(output, _("Command: %s\n"
"Description: %s\n"
"Syntax:\n%s\n\n"),
QL_HELP[i].cmd,
_(QL_HELP[i].help),
buffer.data);
/* If we have an exact match, exit. Fixes \h SELECT */
if (pg_strcasecmp(topic, QL_HELP[i].cmd) == 0)
break;
}
}
if (help_found) /* Don't keep trying if we got a match */
break;
}
if (!help_found)
fprintf(output, _("No help available for \"%s\".\nTry \\h with no arguments to see available help.\n"), topic);
ClosePager(output);
}
}
void
print_copyright(void)
{
puts(
"PostgreSQL Database Management System\n"
"(formerly known as Postgres, then as Postgres95)\n\n"
"Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group\n\n"
"Portions Copyright (c) 1994, The Regents of the University of California\n\n"
"Permission to use, copy, modify, and distribute this software and its\n"
"documentation for any purpose, without fee, and without a written agreement\n"
"is hereby granted, provided that the above copyright notice and this\n"
"paragraph and the following two paragraphs appear in all copies.\n\n"
"IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR\n"
"DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING\n"
"LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS\n"
"DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE\n"
"POSSIBILITY OF SUCH DAMAGE.\n\n"
"THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,\n"
"INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY\n"
"AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS\n"
"ON AN \"AS IS\" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO\n"
"PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.\n"
);
}

21
src/bin/csql/help.h Normal file

@ -0,0 +1,21 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/help.h
*/
#ifndef HELP_H
#define HELP_H
void usage(unsigned short int pager);
void slashUsage(unsigned short int pager);
void helpVariables(unsigned short int pager);
void helpSQL(const char *topic, unsigned short int pager);
void print_copyright(void);
#endif

539
src/bin/csql/input.c Normal file

@ -0,0 +1,539 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/input.c
*/
#include "postgres_fe.h"
#ifndef WIN32
#include <unistd.h>
#endif
#include <fcntl.h>
#include <limits.h>
#include "input.h"
#include "settings.h"
#include "tab-complete.h"
#include "common.h"
#ifndef WIN32
#define PSQLHISTORY ".psql_history"
#else
#define PSQLHISTORY "psql_history"
#endif
/* Runtime options for turning off readline and history */
/* (of course there is no runtime command for doing that :) */
#ifdef USE_READLINE
static bool useReadline;
static bool useHistory;
static char *psql_history;
static int history_lines_added;
/*
* Preserve newlines in saved queries by mapping '\n' to NL_IN_HISTORY
*
* It is assumed NL_IN_HISTORY will never be entered by the user
* nor appear inside a multi-byte string. 0x00 is not properly
* handled by the readline routines, so it cannot be used
* for this purpose.
*/
#define NL_IN_HISTORY 0x01
#endif
static void finishInput(void);
/*
* gets_interactive()
*
* Gets a line of interactive input, using readline if desired.
* The result is a malloc'd string.
*
* Caller *must* have set up sigint_interrupt_jmp before calling.
*/
char *
gets_interactive(const char *prompt)
{
#ifdef USE_READLINE
if (useReadline)
{
char *result;
/*
* Some versions of readline don't notice SIGWINCH signals that arrive
* when not actively reading input. The simplest fix is to always
* re-read the terminal size. This leaves a window for SIGWINCH to be
* missed between here and where readline() enables libreadline's
* signal handler, but that's probably short enough to be ignored.
*/
#ifdef HAVE_RL_RESET_SCREEN_SIZE
rl_reset_screen_size();
#endif
/* Enable SIGINT to longjmp to sigint_interrupt_jmp */
sigint_interrupt_enabled = true;
/* On some platforms, readline is declared as readline(char *) */
result = readline((char *) prompt);
/* Disable SIGINT again */
sigint_interrupt_enabled = false;
return result;
}
#endif
fputs(prompt, stdout);
fflush(stdout);
return gets_fromFile(stdin);
}
/*
* Append the line to the history buffer, making sure there is a trailing '\n'
*/
void
pg_append_history(const char *s, PQExpBuffer history_buf)
{
#ifdef USE_READLINE
if (useHistory && s)
{
appendPQExpBufferStr(history_buf, s);
if (!s[0] || s[strlen(s) - 1] != '\n')
appendPQExpBufferChar(history_buf, '\n');
}
#endif
}
/*
* Emit accumulated history entry to readline's history mechanism,
* then reset the buffer to empty.
*
* Note: we write nothing if history_buf is empty, so extra calls to this
* function don't hurt. There must have been at least one line added by
* pg_append_history before we'll do anything.
*/
void
pg_send_history(PQExpBuffer history_buf)
{
#ifdef USE_READLINE
static char *prev_hist = NULL;
char *s = history_buf->data;
int i;
/* Trim any trailing \n's (OK to scribble on history_buf) */
for (i = strlen(s) - 1; i >= 0 && s[i] == '\n'; i--)
;
s[i + 1] = '\0';
if (useHistory && s[0])
{
if (((pset.histcontrol & hctl_ignorespace) &&
s[0] == ' ') ||
((pset.histcontrol & hctl_ignoredups) &&
prev_hist && strcmp(s, prev_hist) == 0))
{
/* Ignore this line as far as history is concerned */
}
else
{
/* Save each previous line for ignoredups processing */
if (prev_hist)
free(prev_hist);
prev_hist = pg_strdup(s);
/* And send it to readline */
add_history(s);
/* Count lines added to history for use later */
history_lines_added++;
}
}
resetPQExpBuffer(history_buf);
#endif
}
/*
* gets_fromFile
*
* Gets a line of noninteractive input from a file (which could be stdin).
* The result is a malloc'd string, or NULL on EOF or input error.
*
* Caller *must* have set up sigint_interrupt_jmp before calling.
*
* Note: we re-use a static PQExpBuffer for each call. This is to avoid
* leaking memory if interrupted by SIGINT.
*/
char *
gets_fromFile(FILE *source)
{
static PQExpBuffer buffer = NULL;
char line[1024];
if (buffer == NULL) /* first time through? */
buffer = createPQExpBuffer();
else
resetPQExpBuffer(buffer);
for (;;)
{
char *result;
/* Enable SIGINT to longjmp to sigint_interrupt_jmp */
sigint_interrupt_enabled = true;
/* Get some data */
result = fgets(line, sizeof(line), source);
/* Disable SIGINT again */
sigint_interrupt_enabled = false;
/* EOF or error? */
if (result == NULL)
{
if (ferror(source))
{
psql_error("could not read from input file: %s\n",
strerror(errno));
return NULL;
}
break;
}
appendPQExpBufferStr(buffer, line);
if (PQExpBufferBroken(buffer))
{
psql_error("out of memory\n");
return NULL;
}
/* EOL? */
if (buffer->data[buffer->len - 1] == '\n')
{
buffer->data[buffer->len - 1] = '\0';
return pg_strdup(buffer->data);
}
}
if (buffer->len > 0) /* EOF after reading some bufferload(s) */
return pg_strdup(buffer->data);
/* EOF, so return null */
return NULL;
}
#ifdef USE_READLINE
/*
* Macros to iterate over each element of the history list in order
*
* You would think this would be simple enough, but in its inimitable fashion
* libedit has managed to break it: in libreadline we must use next_history()
* to go from oldest to newest, but in libedit we must use previous_history().
* To detect what to do, we make a trial call of previous_history(): if it
* fails, then either next_history() is what to use, or there's zero or one
* history entry so that it doesn't matter which direction we go.
*
* In case that wasn't disgusting enough: the code below is not as obvious as
* it might appear. In some libedit releases history_set_pos(0) fails until
* at least one add_history() call has been done. This is not an issue for
* printHistory() or encode_history(), which cannot be invoked before that has
* happened. In decode_history(), that's not so, and what actually happens is
* that we are sitting on the newest entry to start with, previous_history()
* fails, and we iterate over all the entries using next_history(). So the
* decode_history() loop iterates over the entries in the wrong order when
* using such a libedit release, and if there were another attempt to use
* BEGIN_ITERATE_HISTORY() before some add_history() call had happened, it
* wouldn't work. Fortunately we don't care about either of those things.
*
* Usage pattern is:
*
* BEGIN_ITERATE_HISTORY(varname);
* {
* loop body referencing varname->line;
* }
* END_ITERATE_HISTORY();
*/
#define BEGIN_ITERATE_HISTORY(VARNAME) \
do { \
HIST_ENTRY *VARNAME; \
bool use_prev_; \
\
history_set_pos(0); \
use_prev_ = (previous_history() != NULL); \
history_set_pos(0); \
for (VARNAME = current_history(); VARNAME != NULL; \
VARNAME = use_prev_ ? previous_history() : next_history()) \
{ \
(void) 0
#define END_ITERATE_HISTORY() \
} \
} while(0)
/*
* Convert newlines to NL_IN_HISTORY for safe saving in readline history file
*/
static void
encode_history(void)
{
BEGIN_ITERATE_HISTORY(cur_hist);
{
char *cur_ptr;
/* some platforms declare HIST_ENTRY.line as const char * */
for (cur_ptr = (char *) cur_hist->line; *cur_ptr; cur_ptr++)
{
if (*cur_ptr == '\n')
*cur_ptr = NL_IN_HISTORY;
}
}
END_ITERATE_HISTORY();
}
/*
* Reverse the above encoding
*/
static void
decode_history(void)
{
BEGIN_ITERATE_HISTORY(cur_hist);
{
char *cur_ptr;
/* some platforms declare HIST_ENTRY.line as const char * */
for (cur_ptr = (char *) cur_hist->line; *cur_ptr; cur_ptr++)
{
if (*cur_ptr == NL_IN_HISTORY)
*cur_ptr = '\n';
}
}
END_ITERATE_HISTORY();
}
#endif /* USE_READLINE */
/*
* Put any startup stuff related to input in here. It's good to maintain
* abstraction this way.
*
* The only "flag" right now is 1 for "use readline & history".
*/
void
initializeInput(int flags)
{
#ifdef USE_READLINE
if (flags & 1)
{
const char *histfile;
char home[MAXPGPATH];
useReadline = true;
/* these two things must be done in this order: */
initialize_readline();
rl_initialize();
useHistory = true;
using_history();
history_lines_added = 0;
histfile = GetVariable(pset.vars, "HISTFILE");
if (histfile == NULL)
{
char *envhist;
envhist = getenv("PSQL_HISTORY");
if (envhist != NULL && strlen(envhist) > 0)
histfile = envhist;
}
if (histfile == NULL)
{
if (get_home_path(home))
psql_history = psprintf("%s/%s", home, PSQLHISTORY);
}
else
{
psql_history = pg_strdup(histfile);
expand_tilde(&psql_history);
}
if (psql_history)
{
read_history(psql_history);
decode_history();
}
}
#endif
atexit(finishInput);
}
/*
* This function saves the readline history when psql exits.
*
* fname: pathname of history file. (Should really be "const char *",
* but some ancient versions of readline omit the const-decoration.)
*
* max_lines: if >= 0, limit history file to that many entries.
*/
#ifdef USE_READLINE
static bool
saveHistory(char *fname, int max_lines)
{
int errnum;
/*
* Suppressing the write attempt when HISTFILE is set to /dev/null may
* look like a negligible optimization, but it's necessary on e.g. Darwin,
* where write_history will fail because it tries to chmod the target
* file.
*/
if (strcmp(fname, DEVNULL) != 0)
{
/*
* Encode \n, since otherwise readline will reload multiline history
* entries as separate lines. (libedit doesn't really need this, but
* we do it anyway since it's too hard to tell which implementation we
* are using.)
*/
encode_history();
/*
* On newer versions of libreadline, truncate the history file as
* needed and then append what we've added. This avoids overwriting
* history from other concurrent sessions (although there are still
* race conditions when two sessions exit at about the same time). If
* we don't have those functions, fall back to write_history().
*/
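/*
 * Worked example (hypothetical numbers): with max_lines = 500 and
 * history_lines_added = 10, the existing file is first truncated to
 * 490 entries and then the 10 new lines are appended, keeping the
 * file at 500 entries.
 */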
#if defined(HAVE_HISTORY_TRUNCATE_FILE) && defined(HAVE_APPEND_HISTORY)
{
int nlines;
int fd;
/* truncate previous entries if needed */
if (max_lines >= 0)
{
nlines = Max(max_lines - history_lines_added, 0);
(void) history_truncate_file(fname, nlines);
}
/* append_history fails if file doesn't already exist :-( */
fd = open(fname, O_CREAT | O_WRONLY | PG_BINARY, 0600);
if (fd >= 0)
close(fd);
/* append the appropriate number of lines */
if (max_lines >= 0)
nlines = Min(max_lines, history_lines_added);
else
nlines = history_lines_added;
errnum = append_history(nlines, fname);
if (errnum == 0)
return true;
}
#else /* don't have append support */
{
/* truncate what we have ... */
if (max_lines >= 0)
stifle_history(max_lines);
/* ... and overwrite file. Tough luck for concurrent sessions. */
errnum = write_history(fname);
if (errnum == 0)
return true;
}
#endif
psql_error("could not save history to file \"%s\": %s\n",
fname, strerror(errnum));
}
return false;
}
#endif
/*
* Print history to the specified file, or to the console if fname is NULL
* (psql \s command)
*
* We used to use saveHistory() for this purpose, but that doesn't permit
* use of a pager; moreover libedit's implementation behaves incompatibly
* (preferring to encode its output) and may fail outright when the target
* file is specified as /dev/tty.
*/
bool
printHistory(const char *fname, unsigned short int pager)
{
#ifdef USE_READLINE
FILE *output;
bool is_pager;
if (!useHistory)
return false;
if (fname == NULL)
{
/* use pager, if enabled, when printing to console */
output = PageOutput(INT_MAX, pager ? &(pset.popt.topt) : NULL);
is_pager = true;
}
else
{
output = fopen(fname, "w");
if (output == NULL)
{
psql_error("could not save history to file \"%s\": %s\n",
fname, strerror(errno));
return false;
}
is_pager = false;
}
BEGIN_ITERATE_HISTORY(cur_hist);
{
fprintf(output, "%s\n", cur_hist->line);
}
END_ITERATE_HISTORY();
if (is_pager)
ClosePager(output);
else
fclose(output);
return true;
#else
psql_error("history is not supported by this installation\n");
return false;
#endif
}
static void
finishInput(void)
{
#ifdef USE_READLINE
if (useHistory && psql_history)
{
int hist_size;
hist_size = GetVariableNum(pset.vars, "HISTSIZE", 500, -1, true);
(void) saveHistory(psql_history, hist_size);
free(psql_history);
psql_history = NULL;
}
#endif
}

51
src/bin/csql/input.h Normal file

@ -0,0 +1,51 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/input.h
*/
#ifndef INPUT_H
#define INPUT_H
/*
* If some other file needs to have access to readline/history, include this
* file and save yourself all this work.
*
* USE_READLINE is the definitive indicator of whether readline support
* is available.
*/
#ifdef HAVE_LIBREADLINE
#define USE_READLINE 1
#if defined(HAVE_READLINE_READLINE_H)
#include <readline/readline.h>
#if defined(HAVE_READLINE_HISTORY_H)
#include <readline/history.h>
#endif
#elif defined(HAVE_EDITLINE_READLINE_H)
#include <editline/readline.h>
#if defined(HAVE_EDITLINE_HISTORY_H)
#include <editline/history.h>
#endif
#elif defined(HAVE_READLINE_H)
#include <readline.h>
#if defined(HAVE_HISTORY_H)
#include <history.h>
#endif
#endif /* HAVE_READLINE_READLINE_H, etc */
#endif /* HAVE_LIBREADLINE */
#include "pqexpbuffer.h"
char *gets_interactive(const char *prompt);
char *gets_fromFile(FILE *source);
void initializeInput(int flags);
bool printHistory(const char *fname, unsigned short int pager);
void pg_append_history(const char *s, PQExpBuffer history_buf);
void pg_send_history(PQExpBuffer history_buf);
#endif /* INPUT_H */

30
src/bin/csql/keywords.c Normal file

@ -0,0 +1,30 @@
/*-------------------------------------------------------------------------
*
* keywords.c
* lexical token lookup for key words in PostgreSQL
*
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/bin/pg_dump/keywords.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include "parser/keywords.h"
/*
* We don't need the token number, so leave it out to avoid requiring other
* backend headers.
*/
#define PG_KEYWORD(a,b,c) {a,0,c},
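/*
 * Illustrative expansion: a kwlist.h entry such as
 * PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD) becomes
 * {"abort", 0, UNRESERVED_KEYWORD} in the array below.
 */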
const ScanKeyword FEScanKeywords[] = {
#include "parser/kwlist.h"
};
const int NumFEScanKeywords = lengthof(FEScanKeywords);

89
src/bin/csql/kwlookup.c Normal file

@ -0,0 +1,89 @@
/*-------------------------------------------------------------------------
*
* kwlookup.c
* lexical token lookup for key words in PostgreSQL
*
* NB - this file is also used by ECPG and several frontend programs in
* src/bin/ including pg_dump and psql
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/parser/kwlookup.c
*
*-------------------------------------------------------------------------
*/
/* use c.h so this can be built as either frontend or backend */
#include "c.h"
#include <ctype.h>
#include "parser/keywords.h"
/*
* ScanKeywordLookup - see if a given word is a keyword
*
* Returns a pointer to the ScanKeyword table entry, or NULL if no match.
*
* The match is done case-insensitively. Note that we deliberately use a
* dumbed-down case conversion that will only translate 'A'-'Z' into 'a'-'z',
* even if we are in a locale where tolower() would produce more or different
* translations. This is to conform to the SQL99 spec, which says that
* keywords are to be matched in this way even though non-keyword identifiers
* receive a different case-normalization mapping.
*/
const ScanKeyword *
ScanKeywordLookup(const char *text,
const ScanKeyword *keywords,
int num_keywords)
{
int len,
i;
char word[NAMEDATALEN];
const ScanKeyword *low;
const ScanKeyword *high;
len = strlen(text);
/* We assume all keywords are shorter than NAMEDATALEN. */
if (len >= NAMEDATALEN)
return NULL;
/*
* Apply an ASCII-only downcasing. We must not use tolower() since it may
* produce the wrong translation in some locales (eg, Turkish).
*/
for (i = 0; i < len; i++)
{
char ch = text[i];
if (ch >= 'A' && ch <= 'Z')
ch += 'a' - 'A';
word[i] = ch;
}
word[len] = '\0';
/*
* Now do a binary search using plain strcmp() comparison.
*/
low = keywords;
high = keywords + (num_keywords - 1);
while (low <= high)
{
const ScanKeyword *middle;
int difference;
middle = low + (high - low) / 2;
difference = strcmp(middle->name, word);
if (difference == 0)
return middle;
else if (difference < 0)
low = middle + 1;
else
high = middle - 1;
}
return NULL;
}
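/*
 * Illustrative usage with the frontend keyword list from keywords.c:
 *
 *     if (ScanKeywordLookup("select", FEScanKeywords, NumFEScanKeywords))
 *         ... treat the word as a keyword, e.g. quote it ...
 *
 * The comparison is case-insensitive, so "SELECT" matches as well.
 */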

321
src/bin/csql/large_obj.c Normal file

@ -0,0 +1,321 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/large_obj.c
*/
#include "postgres_fe.h"
#include "large_obj.h"
#include "settings.h"
#include "common.h"
#if (PG_VERSION_NUM >= 90500)
static void print_lo_result(const char *fmt,...) pg_attribute_printf(1, 2);
#else
static void
print_lo_result(const char *fmt,...)
__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
#endif
static void
print_lo_result(const char *fmt,...)
{
va_list ap;
if (!pset.quiet)
{
if (pset.popt.topt.format == PRINT_HTML)
fputs("<p>", pset.queryFout);
va_start(ap, fmt);
vfprintf(pset.queryFout, fmt, ap);
va_end(ap);
if (pset.popt.topt.format == PRINT_HTML)
fputs("</p>\n", pset.queryFout);
else
fputs("\n", pset.queryFout);
}
if (pset.logfile)
{
va_start(ap, fmt);
vfprintf(pset.logfile, fmt, ap);
va_end(ap);
fputs("\n", pset.logfile);
}
}
/*
* Prepare to do a large-object operation. We *must* be inside a transaction
* block for all these operations, so start one if needed.
*
* Returns TRUE if okay, FALSE if failed. *own_transaction is set to indicate
* if we started our own transaction or not.
*/
static bool
start_lo_xact(const char *operation, bool *own_transaction)
{
PGTransactionStatusType tstatus;
PGresult *res;
*own_transaction = false;
if (!pset.db)
{
psql_error("%s: not connected to a database\n", operation);
return false;
}
tstatus = PQtransactionStatus(pset.db);
switch (tstatus)
{
case PQTRANS_IDLE:
/* need to start our own xact */
if (!(res = PSQLexec("BEGIN")))
return false;
PQclear(res);
*own_transaction = true;
break;
case PQTRANS_INTRANS:
/* use the existing xact */
break;
case PQTRANS_INERROR:
psql_error("%s: current transaction is aborted\n", operation);
return false;
default:
psql_error("%s: unknown transaction status\n", operation);
return false;
}
return true;
}
/*
* Clean up after a successful LO operation
*/
static bool
finish_lo_xact(const char *operation, bool own_transaction)
{
PGresult *res;
if (own_transaction && pset.autocommit)
{
/* close out our own xact */
if (!(res = PSQLexec("COMMIT")))
{
res = PSQLexec("ROLLBACK");
PQclear(res);
return false;
}
PQclear(res);
}
return true;
}
/*
* Clean up after a failed LO operation
*/
static bool
fail_lo_xact(const char *operation, bool own_transaction)
{
PGresult *res;
if (own_transaction && pset.autocommit)
{
/* close out our own xact */
res = PSQLexec("ROLLBACK");
PQclear(res);
}
return false; /* always */
}
/*
* do_lo_export()
*
* Write a large object to a file
*/
bool
do_lo_export(const char *loid_arg, const char *filename_arg)
{
int status;
bool own_transaction;
if (!start_lo_xact("\\lo_export", &own_transaction))
return false;
SetCancelConn();
status = lo_export(pset.db, atooid(loid_arg), filename_arg);
ResetCancelConn();
/* of course this status is documented nowhere :( */
if (status != 1)
{
psql_error("%s", PQerrorMessage(pset.db));
return fail_lo_xact("\\lo_export", own_transaction);
}
if (!finish_lo_xact("\\lo_export", own_transaction))
return false;
print_lo_result("lo_export");
return true;
}
/*
* do_lo_import()
*
* Copy large object from file to database
*/
bool
do_lo_import(const char *filename_arg, const char *comment_arg)
{
PGresult *res;
Oid loid;
char oidbuf[32];
bool own_transaction;
if (!start_lo_xact("\\lo_import", &own_transaction))
return false;
SetCancelConn();
loid = lo_import(pset.db, filename_arg);
ResetCancelConn();
if (loid == InvalidOid)
{
psql_error("%s", PQerrorMessage(pset.db));
return fail_lo_xact("\\lo_import", own_transaction);
}
/* insert description if given */
if (comment_arg)
{
char *cmdbuf;
char *bufptr;
size_t slen = strlen(comment_arg);
cmdbuf = malloc(slen * 2 + 256);
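/* worst case: PQescapeStringConn doubles every byte, plus room for the SQL wrapper */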
if (!cmdbuf)
return fail_lo_xact("\\lo_import", own_transaction);
sprintf(cmdbuf, "COMMENT ON LARGE OBJECT %u IS '", loid);
bufptr = cmdbuf + strlen(cmdbuf);
bufptr += PQescapeStringConn(pset.db, bufptr, comment_arg, slen, NULL);
strcpy(bufptr, "'");
if (!(res = PSQLexec(cmdbuf)))
{
free(cmdbuf);
return fail_lo_xact("\\lo_import", own_transaction);
}
PQclear(res);
free(cmdbuf);
}
if (!finish_lo_xact("\\lo_import", own_transaction))
return false;
print_lo_result("lo_import %u", loid);
sprintf(oidbuf, "%u", loid);
SetVariable(pset.vars, "LASTOID", oidbuf);
return true;
}
/*
* do_lo_unlink()
*
* removes a large object out of the database
*/
bool
do_lo_unlink(const char *loid_arg)
{
int status;
Oid loid = atooid(loid_arg);
bool own_transaction;
if (!start_lo_xact("\\lo_unlink", &own_transaction))
return false;
SetCancelConn();
status = lo_unlink(pset.db, loid);
ResetCancelConn();
if (status == -1)
{
psql_error("%s", PQerrorMessage(pset.db));
return fail_lo_xact("\\lo_unlink", own_transaction);
}
if (!finish_lo_xact("\\lo_unlink", own_transaction))
return false;
print_lo_result("lo_unlink %u", loid);
return true;
}
/*
* do_lo_list()
*
* Show all large objects in database with comments
*/
bool
do_lo_list(void)
{
PGresult *res;
char buf[1024];
printQueryOpt myopt = pset.popt;
if (pset.sversion >= 90000)
{
snprintf(buf, sizeof(buf),
"SELECT oid as \"%s\",\n"
" pg_catalog.pg_get_userbyid(lomowner) as \"%s\",\n"
" pg_catalog.obj_description(oid, 'pg_largeobject') as \"%s\"\n"
" FROM pg_catalog.pg_largeobject_metadata "
" ORDER BY oid",
gettext_noop("ID"),
gettext_noop("Owner"),
gettext_noop("Description"));
}
else
{
snprintf(buf, sizeof(buf),
"SELECT loid as \"%s\",\n"
" pg_catalog.obj_description(loid, 'pg_largeobject') as \"%s\"\n"
"FROM (SELECT DISTINCT loid FROM pg_catalog.pg_largeobject) x\n"
"ORDER BY 1",
gettext_noop("ID"),
gettext_noop("Description"));
}
res = PSQLexec(buf);
if (!res)
return false;
myopt.topt.tuples_only = false;
myopt.nullPrint = NULL;
myopt.title = _("Large objects");
myopt.translate_header = true;
printQuery(res, &myopt, pset.queryFout, false, pset.logfile);
PQclear(res);
return true;
}

16
src/bin/csql/large_obj.h Normal file

@ -0,0 +1,16 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/large_obj.h
*/
#ifndef LARGE_OBJ_H
#define LARGE_OBJ_H
bool do_lo_export(const char *loid_arg, const char *filename_arg);
bool do_lo_import(const char *filename_arg, const char *comment_arg);
bool do_lo_unlink(const char *loid_arg);
bool do_lo_list(void);
#endif /* LARGE_OBJ_H */

462
src/bin/csql/mainloop.c Normal file

@ -0,0 +1,462 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/mainloop.c
*/
#include "postgres_fe.h"
#include "mainloop.h"
#include "command.h"
#include "common.h"
#include "input.h"
#include "settings.h"
#include "mb/pg_wchar.h"
/*
* Main processing loop for reading lines of input
* and sending them to the backend.
*
* This loop is re-entrant. May be called by \i command
* which reads input from a file.
*/
int
MainLoop(FILE *source)
{
PsqlScanState scan_state; /* lexer working state */
volatile PQExpBuffer query_buf; /* buffer for query being accumulated */
volatile PQExpBuffer previous_buf; /* if there isn't anything in the new
* buffer yet, use this one for \e,
* etc. */
PQExpBuffer history_buf; /* earlier lines of a multi-line command, not
* yet saved to readline history */
char *line; /* current line of input */
int added_nl_pos;
bool success;
bool line_saved_in_history;
volatile int successResult = EXIT_SUCCESS;
volatile backslashResult slashCmdStatus = PSQL_CMD_UNKNOWN;
volatile promptStatus_t prompt_status = PROMPT_READY;
volatile int count_eof = 0;
volatile bool die_on_error = false;
/* Save the prior command source */
FILE *prev_cmd_source;
bool prev_cmd_interactive;
uint64 prev_lineno;
/* Save old settings */
prev_cmd_source = pset.cur_cmd_source;
prev_cmd_interactive = pset.cur_cmd_interactive;
prev_lineno = pset.lineno;
/* Establish new source */
pset.cur_cmd_source = source;
pset.cur_cmd_interactive = ((source == stdin) && !pset.notty);
pset.lineno = 0;
pset.stmt_lineno = 1;
/* Create working state */
scan_state = psql_scan_create();
query_buf = createPQExpBuffer();
previous_buf = createPQExpBuffer();
history_buf = createPQExpBuffer();
if (PQExpBufferBroken(query_buf) ||
PQExpBufferBroken(previous_buf) ||
PQExpBufferBroken(history_buf))
{
psql_error("out of memory\n");
exit(EXIT_FAILURE);
}
/* main loop to get queries and execute them */
while (successResult == EXIT_SUCCESS)
{
/*
* Clean up after a previous Control-C
*/
if (cancel_pressed)
{
if (!pset.cur_cmd_interactive)
{
/*
* You get here if you stopped a script with Ctrl-C.
*/
successResult = EXIT_USER;
break;
}
cancel_pressed = false;
}
/*
* Establish longjmp destination for exiting from wait-for-input. We
* must re-do this each time through the loop for safety, since the
* jmpbuf might get changed during command execution.
*/
if (sigsetjmp(sigint_interrupt_jmp, 1) != 0)
{
/* got here with longjmp */
/* reset parsing state */
psql_scan_finish(scan_state);
psql_scan_reset(scan_state);
resetPQExpBuffer(query_buf);
resetPQExpBuffer(history_buf);
count_eof = 0;
slashCmdStatus = PSQL_CMD_UNKNOWN;
prompt_status = PROMPT_READY;
pset.stmt_lineno = 1;
cancel_pressed = false;
if (pset.cur_cmd_interactive)
putc('\n', stdout);
else
{
successResult = EXIT_USER;
break;
}
}
fflush(stdout);
/*
* get another line
*/
if (pset.cur_cmd_interactive)
{
/* May need to reset prompt, eg after \r command */
if (query_buf->len == 0)
prompt_status = PROMPT_READY;
line = gets_interactive(get_prompt(prompt_status));
}
else
{
line = gets_fromFile(source);
if (!line && ferror(source))
successResult = EXIT_FAILURE;
}
/*
* query_buf holds query already accumulated. line is the malloc'd
* new line of input (note it must be freed before looping around!)
*/
/* No more input. Time to quit, or \i done */
if (line == NULL)
{
if (pset.cur_cmd_interactive)
{
/* This tries to mimic bash's IGNOREEOF feature. */
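/*
 * Illustrative: with IGNOREEOF set to 3, the first two Ctrl-D
 * presses print the "\q" hint below and only the third quits.
 */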
count_eof++;
if (count_eof < GetVariableNum(pset.vars, "IGNOREEOF", 0, 10, false))
{
if (!pset.quiet)
printf(_("Use \"\\q\" to leave %s.\n"), pset.progname);
continue;
}
puts(pset.quiet ? "" : "\\q");
}
break;
}
count_eof = 0;
pset.lineno++;
/* ignore UTF-8 Unicode byte-order mark */
if (pset.lineno == 1 && pset.encoding == PG_UTF8 && strncmp(line, "\xef\xbb\xbf", 3) == 0)
memmove(line, line + 3, strlen(line + 3) + 1);
/* Detect attempts to run custom-format dumps as SQL scripts */
if (pset.lineno == 1 && !pset.cur_cmd_interactive &&
strncmp(line, "PGDMP", 5) == 0)
{
free(line);
puts(_("The input is a PostgreSQL custom-format dump.\n"
"Use the pg_restore command-line client to restore this dump to a database.\n"));
fflush(stdout);
successResult = EXIT_FAILURE;
break;
}
/* no further processing of empty lines, unless within a literal */
if (line[0] == '\0' && !psql_scan_in_quote(scan_state))
{
free(line);
continue;
}
/* A request for help? Be friendly and give them some guidance */
if (pset.cur_cmd_interactive && query_buf->len == 0 &&
pg_strncasecmp(line, "help", 4) == 0 &&
(line[4] == '\0' || line[4] == ';' || isspace((unsigned char) line[4])))
{
free(line);
puts(_("You are using csql, the command-line interface to CitusDB."));
printf(_("Type: \\copyright for distribution terms\n"
" \\h for help with SQL commands\n"
" \\? for help with csql commands\n"
" \\g or terminate with semicolon to execute query\n"
" \\q to quit\n"));
fflush(stdout);
continue;
}
/* echo back if flag is set, unless interactive */
if (pset.echo == PSQL_ECHO_ALL && !pset.cur_cmd_interactive)
{
puts(line);
fflush(stdout);
}
/* insert newlines into query buffer between source lines */
if (query_buf->len > 0)
{
appendPQExpBufferChar(query_buf, '\n');
added_nl_pos = query_buf->len;
}
else
added_nl_pos = -1; /* flag we didn't add one */
/* Setting this will not have effect until next line. */
die_on_error = pset.on_error_stop;
/*
* Parse line, looking for command separators.
*/
psql_scan_setup(scan_state, line, strlen(line));
success = true;
line_saved_in_history = false;
while (success || !die_on_error)
{
PsqlScanResult scan_result;
promptStatus_t prompt_tmp = prompt_status;
size_t pos_in_query;
char *tmp_line;
pos_in_query = query_buf->len;
scan_result = psql_scan(scan_state, query_buf, &prompt_tmp);
prompt_status = prompt_tmp;
if (PQExpBufferBroken(query_buf))
{
psql_error("out of memory\n");
exit(EXIT_FAILURE);
}
/*
* Increase statement line number counter for each linebreak added
* to the query buffer by the last psql_scan() call. Linebreaks only
* need to be added when navigating to a statement in readline's
* history that contains newlines.
*/
tmp_line = query_buf->data + pos_in_query;
while (*tmp_line != '\0')
{
if (*(tmp_line++) == '\n')
pset.stmt_lineno++;
}
if (scan_result == PSCAN_EOL)
pset.stmt_lineno++;
/*
* Send command if semicolon found, or if end of line and we're in
* single-line mode.
*/
if (scan_result == PSCAN_SEMICOLON ||
(scan_result == PSCAN_EOL && pset.singleline))
{
/*
* Save query in history. We use history_buf to accumulate
* multi-line queries into a single history entry.
*/
if (pset.cur_cmd_interactive && !line_saved_in_history)
{
pg_append_history(line, history_buf);
pg_send_history(history_buf);
line_saved_in_history = true;
}
/* execute query */
success = SendQuery(query_buf->data);
slashCmdStatus = success ? PSQL_CMD_SEND : PSQL_CMD_ERROR;
pset.stmt_lineno = 1;
/* transfer query to previous_buf by pointer-swapping */
{
PQExpBuffer swap_buf = previous_buf;
previous_buf = query_buf;
query_buf = swap_buf;
}
resetPQExpBuffer(query_buf);
added_nl_pos = -1;
/* we need not do psql_scan_reset() here */
}
else if (scan_result == PSCAN_BACKSLASH)
{
/* handle backslash command */
/*
* If we added a newline to query_buf, and nothing else has
* been inserted in query_buf by the lexer, then strip off the
* newline again. This avoids any change to query_buf when a
* line contains only a backslash command. Also, in this
* situation we force out any previous lines as a separate
* history entry; we don't want SQL and backslash commands
* intermixed in history if at all possible.
*/
if (query_buf->len == added_nl_pos)
{
query_buf->data[--query_buf->len] = '\0';
pg_send_history(history_buf);
}
added_nl_pos = -1;
/* save backslash command in history */
if (pset.cur_cmd_interactive && !line_saved_in_history)
{
pg_append_history(line, history_buf);
pg_send_history(history_buf);
line_saved_in_history = true;
}
/* execute backslash command */
slashCmdStatus = HandleSlashCmds(scan_state,
query_buf->len > 0 ?
query_buf : previous_buf);
success = slashCmdStatus != PSQL_CMD_ERROR;
pset.stmt_lineno = 1;
if ((slashCmdStatus == PSQL_CMD_SEND || slashCmdStatus == PSQL_CMD_NEWEDIT) &&
query_buf->len == 0)
{
/* copy previous buffer to current for handling */
appendPQExpBufferStr(query_buf, previous_buf->data);
}
if (slashCmdStatus == PSQL_CMD_SEND)
{
success = SendQuery(query_buf->data);
/* transfer query to previous_buf by pointer-swapping */
{
PQExpBuffer swap_buf = previous_buf;
previous_buf = query_buf;
query_buf = swap_buf;
}
resetPQExpBuffer(query_buf);
/* flush any paren nesting info after forced send */
psql_scan_reset(scan_state);
}
else if (slashCmdStatus == PSQL_CMD_NEWEDIT)
{
/* rescan query_buf as new input */
psql_scan_finish(scan_state);
free(line);
line = pg_strdup(query_buf->data);
resetPQExpBuffer(query_buf);
/* reset parsing state since we are rescanning whole line */
psql_scan_reset(scan_state);
psql_scan_setup(scan_state, line, strlen(line));
line_saved_in_history = false;
prompt_status = PROMPT_READY;
}
else if (slashCmdStatus == PSQL_CMD_TERMINATE)
break;
}
/* fall out of loop if lexer reached EOL */
if (scan_result == PSCAN_INCOMPLETE ||
scan_result == PSCAN_EOL)
break;
}
/* Add line to pending history if we didn't execute anything yet */
if (pset.cur_cmd_interactive && !line_saved_in_history)
pg_append_history(line, history_buf);
psql_scan_finish(scan_state);
free(line);
if (slashCmdStatus == PSQL_CMD_TERMINATE)
{
successResult = EXIT_SUCCESS;
break;
}
if (!pset.cur_cmd_interactive)
{
if (!success && die_on_error)
successResult = EXIT_USER;
/* Have we lost the db connection? */
else if (!pset.db)
successResult = EXIT_BADCONN;
}
} /* while !endoffile/session */
/*
* Process query at the end of file without a semicolon
*/
if (query_buf->len > 0 && !pset.cur_cmd_interactive &&
successResult == EXIT_SUCCESS)
{
/* save query in history */
if (pset.cur_cmd_interactive)
pg_send_history(history_buf);
/* execute query */
success = SendQuery(query_buf->data);
if (!success && die_on_error)
successResult = EXIT_USER;
else if (pset.db == NULL)
successResult = EXIT_BADCONN;
}
/*
* Let's just make real sure the SIGINT handler won't try to use
* sigint_interrupt_jmp after we exit this routine. If there is an outer
* MainLoop instance, it will reset sigint_interrupt_jmp to point to
* itself at the top of its loop, before any further interactive input
* happens.
*/
sigint_interrupt_enabled = false;
destroyPQExpBuffer(query_buf);
destroyPQExpBuffer(previous_buf);
destroyPQExpBuffer(history_buf);
psql_scan_destroy(scan_state);
pset.cur_cmd_source = prev_cmd_source;
pset.cur_cmd_interactive = prev_cmd_interactive;
pset.lineno = prev_lineno;
return successResult;
} /* MainLoop() */
/*
* psqlscan.c is #include'd here instead of being compiled on its own.
* This is because we need postgres_fe.h to be read before any system
* include files, else things tend to break on platforms that have
* multiple infrastructures for stdio.h and so on. flex is absolutely
* uncooperative about that, so we can't compile psqlscan.c on its own.
*/
#include "psqlscan.c"

15
src/bin/csql/mainloop.h Normal file

@ -0,0 +1,15 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/mainloop.h
*/
#ifndef MAINLOOP_H
#define MAINLOOP_H
#include "postgres_fe.h"
int MainLoop(FILE *source);
#endif /* MAINLOOP_H */

398
src/bin/csql/mbprint.c Normal file

@ -0,0 +1,398 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/mbprint.c
*
* XXX this file does not really belong in psql/. Perhaps move to libpq?
* It also seems that the mbvalidate function is redundant with existing
* functionality.
*/
#include "postgres_fe.h"
#include "mbprint.h"
#ifndef PGSCRIPTS
#include "settings.h"
#endif
/*
* To avoid version-skew problems, this file must not use declarations
* from pg_wchar.h: the encoding IDs we are dealing with are determined
* by the libpq.so we are linked with, and that might not match the
* numbers we see at compile time. (If this file were inside libpq,
* the problem would go away...)
*
* Hence, we have our own definition of pg_wchar, and we get the values
* of any needed encoding IDs on-the-fly.
*/
typedef unsigned int pg_wchar;
static int
pg_get_utf8_id(void)
{
static int utf8_id = -1;
if (utf8_id < 0)
utf8_id = pg_char_to_encoding("utf8");
return utf8_id;
}
#define PG_UTF8 pg_get_utf8_id()
/*
* Convert a UTF-8 character to a Unicode code point.
* This is a one-character version of pg_utf2wchar_with_len.
*
* No error checks here, c must point to a long-enough string.
*/
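/*
 * Worked example: the Euro sign U+20AC is encoded in UTF-8 as the
 * bytes 0xE2 0x82 0xAC; the three-byte branch below reassembles it
 * as (0x2 << 12) | (0x02 << 6) | 0x2C = 0x20AC.
 */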
static pg_wchar
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
return (pg_wchar) c[0];
else if ((*c & 0xe0) == 0xc0)
return (pg_wchar) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
return (pg_wchar) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
return (pg_wchar) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
else
/* that is an invalid code on purpose */
return 0xffffffff;
}
/*
* Unicode 3.1 compliant validation: for each category, it checks the
* combination of each byte to make sure it maps to a valid range. It also
* returns -1 for the following UCS values:
*   ucs > 0x10ffff
*   (ucs & 0xfffe) == 0xfffe
*   0xfdd0 < ucs < 0xfdef
*   (ucs & 0xdb00) == 0xd800 (surrogates)
*/
static int
utf_charcheck(const unsigned char *c)
{
if ((*c & 0x80) == 0)
return 1;
else if ((*c & 0xe0) == 0xc0)
{
/* two-byte char */
if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
return 2;
return -1;
}
else if ((*c & 0xf0) == 0xe0)
{
/* three-byte char */
if (((c[1] & 0xc0) == 0x80) &&
(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
((c[2] & 0xc0) == 0x80))
{
int z = c[0] & 0x0f;
int yx = ((c[1] & 0x3f) << 6) | (c[0] & 0x3f);
int lx = yx & 0x7f;
/* check 0xfffe/0xffff, 0xfdd0..0xfdef range, surrogates */
if (((z == 0x0f) &&
(((yx & 0xffe) == 0xffe) ||
(((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) ||
((z == 0x0d) && ((yx & 0xb00) == 0x800)))
return -1;
return 3;
}
return -1;
}
else if ((*c & 0xf8) == 0xf0)
{
int u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);
/* four-byte char */
if (((c[1] & 0xc0) == 0x80) &&
(u > 0x00) && (u <= 0x10) &&
((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
{
/* test for 0xzzzzfffe/0xzzzzffff */
if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
((c[3] & 0x3e) == 0x3e))
return -1;
return 4;
}
return -1;
}
return -1;
}
static void
mb_utf_validate(unsigned char *pwcs)
{
unsigned char *p = pwcs;
while (*pwcs)
{
int len;
if ((len = utf_charcheck(pwcs)) > 0)
{
if (p != pwcs)
{
int i;
for (i = 0; i < len; i++)
*p++ = *pwcs++;
}
else
{
pwcs += len;
p += len;
}
}
else
/* we skip the char */
pwcs++;
}
if (p != pwcs)
*p = '\0';
}
/*
* public functions : wcswidth and mbvalidate
*/
/*
* pg_wcswidth is the dumb display-width function.
* It assumes that everything will appear on one line.
* OTOH it is easier to use than pg_wcssize if this applies to you.
*/
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
int width = 0;
while (len > 0)
{
int chlen,
chwidth;
chlen = PQmblen(pwcs, encoding);
if (len < (size_t) chlen)
break; /* Invalid string */
chwidth = PQdsplen(pwcs, encoding);
if (chwidth > 0)
width += chwidth;
pwcs += chlen;
len -= chlen;
}
return width;
}
/*
* pg_wcssize takes the given string in the given encoding and returns three
* values:
* result_width: Width in display characters of the longest line in string
* result_height: Number of lines in display output
* result_format_size: Number of bytes required to store formatted
* representation of string
*
* This MUST be kept in sync with pg_wcsformat!
*/
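/*
 * Worked example: for the UTF-8 input "ab\tc\ndef", the tab pads the
 * first line out to the next multiple of 8, so result_width is 9
 * ("ab" + padding + "c"), result_height is 2, and result_format_size
 * counts one byte per output character plus a NUL per line.
 */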
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
int *result_width, int *result_height, int *result_format_size)
{
int w,
chlen = 0,
linewidth = 0;
int width = 0;
int height = 1;
int format_size = 0;
for (; *pwcs && len > 0; pwcs += chlen)
{
chlen = PQmblen((const char *) pwcs, encoding);
if (len < (size_t) chlen)
break;
w = PQdsplen((const char *) pwcs, encoding);
if (chlen == 1) /* single-byte char */
{
if (*pwcs == '\n') /* Newline */
{
if (linewidth > width)
width = linewidth;
linewidth = 0;
height += 1;
format_size += 1; /* For NUL char */
}
else if (*pwcs == '\r') /* Carriage return */
{
linewidth += 2;
format_size += 2;
}
else if (*pwcs == '\t') /* Tab */
{
do
{
linewidth++;
format_size++;
} while (linewidth % 8 != 0);
}
else if (w < 0) /* Other control char */
{
linewidth += 4;
format_size += 4;
}
else /* Output it as-is */
{
linewidth += w;
format_size += 1;
}
}
else if (w < 0) /* Non-ascii control char */
{
linewidth += 6; /* \u0000 */
format_size += 6;
}
else /* All other chars */
{
linewidth += w;
format_size += chlen;
}
len -= chlen;
}
if (linewidth > width)
width = linewidth;
format_size += 1; /* For NUL char */
/* Set results */
if (result_width)
*result_width = width;
if (result_height)
*result_height = height;
if (result_format_size)
*result_format_size = format_size;
}
/*
* Format a string into one or more "struct lineptr" lines.
* lines[i].ptr == NULL indicates the end of the array.
*
* This MUST be kept in sync with pg_wcssize!
*/
void
pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
struct lineptr * lines, int count)
{
int w,
chlen = 0;
int linewidth = 0;
unsigned char *ptr = lines->ptr; /* Pointer to data area */
for (; *pwcs && len > 0; pwcs += chlen)
{
chlen = PQmblen((const char *) pwcs, encoding);
if (len < (size_t) chlen)
break;
w = PQdsplen((const char *) pwcs, encoding);
if (chlen == 1) /* single-byte char */
{
if (*pwcs == '\n') /* Newline */
{
*ptr++ = '\0';
lines->width = linewidth;
linewidth = 0;
lines++;
count--;
if (count <= 0)
exit(1); /* Screwup */
/* make next line point to remaining memory */
lines->ptr = ptr;
}
else if (*pwcs == '\r') /* Carriage return */
{
strcpy((char *) ptr, "\\r");
linewidth += 2;
ptr += 2;
}
else if (*pwcs == '\t') /* Tab */
{
do
{
*ptr++ = ' ';
linewidth++;
} while (linewidth % 8 != 0);
}
else if (w < 0) /* Other control char */
{
sprintf((char *) ptr, "\\x%02X", *pwcs);
linewidth += 4;
ptr += 4;
}
else /* Output it as-is */
{
linewidth += w;
*ptr++ = *pwcs;
}
}
else if (w < 0) /* Non-ascii control char */
{
if (encoding == PG_UTF8)
sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
else
{
/*
* This case cannot happen in the current code because only
* UTF-8 signals multibyte control characters. But we may need
* to support it at some stage.
*/
sprintf((char *) ptr, "\\u????");
}
ptr += 6;
linewidth += 6;
}
else /* All other chars */
{
int i;
for (i = 0; i < chlen; i++)
*ptr++ = pwcs[i];
linewidth += w;
}
len -= chlen;
}
lines->width = linewidth;
*ptr++ = '\0'; /* Terminate formatted string */
if (count <= 0)
exit(1); /* Screwup */
(lines + 1)->ptr = NULL; /* terminate line array */
}
unsigned char *
mbvalidate(unsigned char *pwcs, int encoding)
{
if (encoding == PG_UTF8)
mb_utf_validate(pwcs);
else
{
/*
* other encodings needing validation should add their own routines
* here
*/
}
return pwcs;
}

18
src/bin/csql/mbprint.h Normal file

@ -0,0 +1,18 @@
/* src/bin/psql/mbprint.h */
#ifndef MBPRINT_H
#define MBPRINT_H
struct lineptr
{
unsigned char *ptr;
int width;
};
extern unsigned char *mbvalidate(unsigned char *pwcs, int encoding);
extern int pg_wcswidth(const char *pwcs, size_t len, int encoding);
extern void pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, struct lineptr * lines, int count);
extern void pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
int *width, int *height, int *format_size);
#endif /* MBPRINT_H */

3495
src/bin/csql/print.c Normal file

File diff suppressed because it is too large

206
src/bin/csql/print.h Normal file

@ -0,0 +1,206 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/print.h
*/
#ifndef PRINT_H
#define PRINT_H
#include "libpq-fe.h"
enum printFormat
{
PRINT_NOTHING = 0, /* to make sure someone initializes this */
PRINT_UNALIGNED,
PRINT_ALIGNED,
PRINT_WRAPPED,
PRINT_HTML,
PRINT_ASCIIDOC,
PRINT_LATEX,
PRINT_LATEX_LONGTABLE,
PRINT_TROFF_MS
/* add your favourite output format here ... */
};
typedef struct printTextLineFormat
{
/* Line drawing characters to be used in various contexts */
const char *hrule; /* horizontal line character */
const char *leftvrule; /* left vertical line (+horizontal) */
const char *midvrule; /* intra-column vertical line (+horizontal) */
const char *rightvrule; /* right vertical line (+horizontal) */
} printTextLineFormat;
typedef enum printTextRule
{
/* Additional context for selecting line drawing characters */
PRINT_RULE_TOP, /* top horizontal line */
PRINT_RULE_MIDDLE, /* intra-data horizontal line */
PRINT_RULE_BOTTOM, /* bottom horizontal line */
PRINT_RULE_DATA /* data line (hrule is unused here) */
} printTextRule;
typedef enum printTextLineWrap
{
/* Line wrapping conditions */
PRINT_LINE_WRAP_NONE, /* No wrapping */
PRINT_LINE_WRAP_WRAP, /* Wraparound due to overlength line */
PRINT_LINE_WRAP_NEWLINE /* Newline in data */
} printTextLineWrap;
typedef struct printTextFormat
{
/* A complete line style */
const char *name; /* for display purposes */
printTextLineFormat lrule[4]; /* indexed by enum printTextRule */
const char *midvrule_nl; /* vertical line for continue after newline */
const char *midvrule_wrap; /* vertical line for wrapped data */
const char *midvrule_blank; /* vertical line for blank data */
const char *header_nl_left; /* left mark after newline */
const char *header_nl_right; /* right mark for newline */
const char *nl_left; /* left mark after newline */
const char *nl_right; /* right mark for newline */
const char *wrap_left; /* left mark after wrapped data */
const char *wrap_right; /* right mark for wrapped data */
bool wrap_right_border; /* use right-hand border for wrap
* marks when border=0? */
} printTextFormat;
typedef enum unicode_linestyle
{
UNICODE_LINESTYLE_SINGLE = 0,
UNICODE_LINESTYLE_DOUBLE
} unicode_linestyle;
struct separator
{
char *separator;
bool separator_zero;
};
typedef struct printTableOpt
{
enum printFormat format; /* see enum above */
unsigned short int expanded;/* expanded/vertical output (if supported by
* output format); 0=no, 1=yes, 2=auto */
unsigned short int border; /* Print a border around the table. 0=none,
* 1=dividing lines, 2=full */
unsigned short int pager; /* use pager for output (if to stdout and
* stdout is a tty) 0=off 1=on 2=always */
int pager_min_lines;/* don't use pager unless there are at least
* this many lines */
bool tuples_only; /* don't output headers, row counts, etc. */
bool start_table; /* print start decoration, eg <table> */
bool stop_table; /* print stop decoration, eg </table> */
bool default_footer; /* allow "(xx rows)" default footer */
unsigned long prior_records; /* start offset for record counters */
const printTextFormat *line_style; /* line style (NULL for default) */
struct separator fieldSep; /* field separator for unaligned text mode */
struct separator recordSep; /* record separator for unaligned text mode */
bool numericLocale; /* locale-aware numeric units separator and
* decimal marker */
char *tableAttr; /* attributes for HTML <table ...> */
int encoding; /* character encoding */
int env_columns; /* $COLUMNS on psql start, 0 is unset */
int columns; /* target width for wrapped format */
unicode_linestyle unicode_border_linestyle;
unicode_linestyle unicode_column_linestyle;
unicode_linestyle unicode_header_linestyle;
} printTableOpt;
/*
* Table footers are implemented as a singly-linked list.
*
* This is so that you don't need to know the number of footers in order to
* initialise the printTableContent struct, which is very convenient when
* preparing complex footers (as in describeOneTableDetails).
*/
typedef struct printTableFooter
{
char *data;
struct printTableFooter *next;
} printTableFooter;
/*
* The table content struct holds all the information which will be displayed
* by printTable().
*/
typedef struct printTableContent
{
const printTableOpt *opt;
const char *title; /* May be NULL */
int ncolumns; /* Specified in Init() */
int nrows; /* Specified in Init() */
const char **headers; /* NULL-terminated array of header strings */
const char **header; /* Pointer to the last added header */
const char **cells; /* NULL-terminated array of cell content
* strings */
const char **cell; /* Pointer to the last added cell */
long cellsadded; /* Number of cells added this far */
bool *cellmustfree; /* true for cells that need to be free()d */
printTableFooter *footers; /* Pointer to the first footer */
printTableFooter *footer; /* Pointer to the last added footer */
char *aligns; /* Array of alignment specifiers; 'l' or 'r',
* one per column */
char *align; /* Pointer to the last added alignment */
} printTableContent;
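/*
 * Illustrative call sequence for a hypothetical 2x1 table (error
 * handling omitted):
 *
 *     printTableContent cont;
 *     printTableInit(&cont, &pset.popt.topt, "title", 2, 1);
 *     printTableAddHeader(&cont, "a", false, 'l');
 *     printTableAddHeader(&cont, "b", false, 'r');
 *     printTableAddCell(&cont, "1", false, false);
 *     printTableAddCell(&cont, "2", false, false);
 *     printTable(&cont, stdout, false, NULL);
 *     printTableCleanup(&cont);
 */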
typedef struct printQueryOpt
{
printTableOpt topt; /* the options above */
char *nullPrint; /* how to print null entities */
bool quote; /* quote all values as much as possible */
char *title; /* override title */
char **footers; /* override footer (default is "(xx rows)") */
bool translate_header; /* do gettext on column headers */
const bool *translate_columns; /* translate_columns[i-1] => do
* gettext on col i */
int n_translate_columns; /* length of translate_columns[] */
} printQueryOpt;
extern const printTextFormat pg_asciiformat;
extern const printTextFormat pg_asciiformat_old;
extern const printTextFormat pg_utf8format;
extern void disable_sigpipe_trap(void);
extern void restore_sigpipe_trap(void);
extern void set_sigpipe_trap_state(bool ignore);
extern FILE *PageOutput(int lines, const printTableOpt *topt);
extern void ClosePager(FILE *pagerpipe);
extern void html_escaped_print(const char *in, FILE *fout);
extern void printTableInit(printTableContent *const content,
const printTableOpt *opt, const char *title,
const int ncolumns, const int nrows);
extern void printTableAddHeader(printTableContent *const content,
char *header, const bool translate, const char align);
extern void printTableAddCell(printTableContent *const content,
char *cell, const bool translate, const bool mustfree);
extern void printTableAddFooter(printTableContent *const content,
const char *footer);
extern void printTableSetFooter(printTableContent *const content,
const char *footer);
extern void printTableCleanup(printTableContent *const content);
extern void printTable(const printTableContent *cont,
FILE *fout, bool is_pager, FILE *flog);
extern void printQuery(const PGresult *result, const printQueryOpt *opt,
FILE *fout, bool is_pager, FILE *flog);
extern void setDecimalLocale(void);
extern const printTextFormat *get_line_style(const printTableOpt *opt);
extern void refresh_utf8format(const printTableOpt *opt);
#ifndef __CYGWIN__
#define DEFAULT_PAGER "more"
#else
#define DEFAULT_PAGER "less"
#endif
#endif /* PRINT_H */

325
src/bin/csql/prompt.c Normal file

@ -0,0 +1,325 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/prompt.c
*/
#include "postgres_fe.h"
#ifdef WIN32
#include <io.h>
#include <win32.h>
#endif
#ifdef HAVE_UNIX_SOCKETS
#include <unistd.h>
#include <netdb.h>
#endif
#include "common.h"
#include "input.h"
#include "prompt.h"
#include "settings.h"
/*--------------------------
* get_prompt
*
* Returns a statically allocated prompt made by interpolating certain
* tcsh style escape sequences into pset.vars "PROMPT1|2|3".
* (might not be completely multibyte safe)
*
* Defined interpolations are:
* %M - database server "hostname.domainname", "[local]" for AF_UNIX
* sockets, "[local:/dir/name]" if not default
* %m - like %M, but hostname only (before first dot), or always "[local]"
* %> - database server port number
* %n - database user name
* %/ - current database
* %~ - like %/ but "~" when database name equals user name
* %# - "#" if superuser, ">" otherwise
* %R - in prompt1 normally =, or ^ if single line mode,
* or a ! if session is not connected to a database;
* in prompt2 one of -, *, ', ", $, or (;
* in prompt3 nothing
* %x - transaction status: empty, *, !, ? (unknown or no connection)
* %l - The line number inside the current statement, starting from 1.
* %? - the error code of the last query (not yet implemented)
* %% - a percent sign
*
* %[0-9] - the character with the given decimal code
* %0[0-7] - the character with the given octal code
* %0x[0-9A-Fa-f] - the character with the given hexadecimal code
*
* %`command` - The result of executing command in /bin/sh with trailing
* newline stripped.
* %:name: - The value of the psql variable 'name'
* (those will not be rescanned for more escape sequences!)
*
* %[ ... %] - tell readline that the contained text is invisible
*
* If the application-wide prompts become NULL somehow, the returned string
* will be empty (not NULL!).
*--------------------------
*/
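/*
 * Illustrative: with PROMPT1 set to '%n@%m:%>%x%# ', a superuser named
 * alice connected to host localhost, port 5432, inside an open
 * transaction would see the prompt "alice@localhost:5432*# ".
 */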
char *
get_prompt(promptStatus_t status)
{
#define MAX_PROMPT_SIZE 256
static char destination[MAX_PROMPT_SIZE + 1];
char buf[MAX_PROMPT_SIZE + 1];
bool esc = false;
const char *p;
const char *prompt_string = "? ";
switch (status)
{
case PROMPT_READY:
prompt_string = pset.prompt1;
break;
case PROMPT_CONTINUE:
case PROMPT_SINGLEQUOTE:
case PROMPT_DOUBLEQUOTE:
case PROMPT_DOLLARQUOTE:
case PROMPT_COMMENT:
case PROMPT_PAREN:
prompt_string = pset.prompt2;
break;
case PROMPT_COPY:
prompt_string = pset.prompt3;
break;
}
destination[0] = '\0';
for (p = prompt_string;
*p && strlen(destination) < sizeof(destination) - 1;
p++)
{
memset(buf, 0, sizeof(buf));
if (esc)
{
switch (*p)
{
/* Current database */
case '/':
if (pset.db)
strlcpy(buf, PQdb(pset.db), sizeof(buf));
break;
case '~':
if (pset.db)
{
const char *var;
if (strcmp(PQdb(pset.db), PQuser(pset.db)) == 0 ||
((var = getenv("PGDATABASE")) && strcmp(var, PQdb(pset.db)) == 0))
strlcpy(buf, "~", sizeof(buf));
else
strlcpy(buf, PQdb(pset.db), sizeof(buf));
}
break;
/* DB server hostname (long/short) */
case 'M':
case 'm':
if (pset.db)
{
const char *host = PQhost(pset.db);
/* INET socket */
if (host && host[0] && !is_absolute_path(host))
{
strlcpy(buf, host, sizeof(buf));
if (*p == 'm')
buf[strcspn(buf, ".")] = '\0';
}
#ifdef HAVE_UNIX_SOCKETS
/* UNIX socket */
else
{
if (!host
|| strcmp(host, DEFAULT_PGSOCKET_DIR) == 0
|| *p == 'm')
strlcpy(buf, "[local]", sizeof(buf));
else
snprintf(buf, sizeof(buf), "[local:%s]", host);
}
#endif
}
break;
/* DB server port number */
case '>':
if (pset.db && PQport(pset.db))
strlcpy(buf, PQport(pset.db), sizeof(buf));
break;
/* DB server user name */
case 'n':
if (pset.db)
strlcpy(buf, session_username(), sizeof(buf));
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
*buf = (char) strtol(p, (char **) &p, 8);
--p;
break;
case 'R':
switch (status)
{
case PROMPT_READY:
if (!pset.db)
buf[0] = '!';
else if (!pset.singleline)
buf[0] = '=';
else
buf[0] = '^';
break;
case PROMPT_CONTINUE:
buf[0] = '-';
break;
case PROMPT_SINGLEQUOTE:
buf[0] = '\'';
break;
case PROMPT_DOUBLEQUOTE:
buf[0] = '"';
break;
case PROMPT_DOLLARQUOTE:
buf[0] = '$';
break;
case PROMPT_COMMENT:
buf[0] = '*';
break;
case PROMPT_PAREN:
buf[0] = '(';
break;
default:
buf[0] = '\0';
break;
}
break;
case 'x':
if (!pset.db)
buf[0] = '?';
else
switch (PQtransactionStatus(pset.db))
{
case PQTRANS_IDLE:
buf[0] = '\0';
break;
case PQTRANS_ACTIVE:
case PQTRANS_INTRANS:
buf[0] = '*';
break;
case PQTRANS_INERROR:
buf[0] = '!';
break;
default:
buf[0] = '?';
break;
}
break;
case 'l':
snprintf(buf, sizeof(buf), UINT64_FORMAT, pset.stmt_lineno);
break;
case '?':
/* not here yet */
break;
case '#':
if (is_superuser())
buf[0] = '#';
else
buf[0] = '>';
break;
/* execute command */
case '`':
{
FILE *fd;
char *file = pg_strdup(p + 1);
int cmdend;
cmdend = strcspn(file, "`");
file[cmdend] = '\0';
fd = popen(file, "r");
if (fd)
{
if (fgets(buf, sizeof(buf), fd) == NULL)
buf[0] = '\0';
pclose(fd);
}
if (strlen(buf) > 0 && buf[strlen(buf) - 1] == '\n')
buf[strlen(buf) - 1] = '\0';
free(file);
p += cmdend + 1;
break;
}
/* interpolate variable */
case ':':
{
char *name;
const char *val;
int nameend;
name = pg_strdup(p + 1);
nameend = strcspn(name, ":");
name[nameend] = '\0';
val = GetVariable(pset.vars, name);
if (val)
strlcpy(buf, val, sizeof(buf));
free(name);
p += nameend + 1;
break;
}
case '[':
case ']':
#if defined(USE_READLINE) && defined(RL_PROMPT_START_IGNORE)
/*
* readline >=4.0 undocumented feature: non-printing
* characters in prompt strings must be marked as such, in
* order to properly display the line during editing.
*/
buf[0] = (*p == '[') ? RL_PROMPT_START_IGNORE : RL_PROMPT_END_IGNORE;
buf[1] = '\0';
#endif /* USE_READLINE */
break;
default:
buf[0] = *p;
buf[1] = '\0';
break;
}
esc = false;
}
else if (*p == '%')
esc = true;
else
{
buf[0] = *p;
buf[1] = '\0';
esc = false;
}
if (!esc)
strlcat(destination, buf, sizeof(destination));
}
return destination;
}

25
src/bin/csql/prompt.h Normal file

@ -0,0 +1,25 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/prompt.h
*/
#ifndef PROMPT_H
#define PROMPT_H
typedef enum _promptStatus
{
PROMPT_READY,
PROMPT_CONTINUE,
PROMPT_COMMENT,
PROMPT_SINGLEQUOTE,
PROMPT_DOUBLEQUOTE,
PROMPT_DOLLARQUOTE,
PROMPT_PAREN,
PROMPT_COPY
} promptStatus_t;
char *get_prompt(promptStatus_t status);
#endif /* PROMPT_H */

8
src/bin/csql/psqlrc.sample Normal file

@ -0,0 +1,8 @@
--
-- system-wide psql configuration file
--
-- This file is read before the .psqlrc file in the user's home directory.
--
-- Copy this to your installation's sysconf directory and rename it psqlrc.
-- The sysconf directory can be identified via "pg_config --sysconfdir".
--

64
src/bin/csql/psqlscan.h Normal file

@ -0,0 +1,64 @@
/*
* psql - the PostgreSQL interactive terminal
*
* Copyright (c) 2000-2015, PostgreSQL Global Development Group
*
* src/bin/psql/psqlscan.h
*/
#ifndef PSQLSCAN_H
#define PSQLSCAN_H
#include "pqexpbuffer.h"
#include "prompt.h"
/* Abstract type for lexer's internal state */
typedef struct PsqlScanStateData *PsqlScanState;
/* Termination states for psql_scan() */
typedef enum
{
PSCAN_SEMICOLON, /* found command-ending semicolon */
PSCAN_BACKSLASH, /* found backslash command */
PSCAN_INCOMPLETE, /* end of line, SQL statement incomplete */
PSCAN_EOL /* end of line, SQL possibly complete */
} PsqlScanResult;
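/*
 * Illustrative scan loop (heavily simplified from mainloop.c; real
 * callers also act on PSCAN_SEMICOLON and PSCAN_BACKSLASH results):
 *
 *     psql_scan_setup(state, line, strlen(line));
 *     do
 *         result = psql_scan(state, query_buf, &prompt);
 *     while (result != PSCAN_INCOMPLETE && result != PSCAN_EOL);
 *     psql_scan_finish(state);
 */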
/* Different ways for scan_slash_option to handle parameter words */
enum slash_option_type
{
OT_NORMAL, /* normal case */
OT_SQLID, /* treat as SQL identifier */
OT_SQLIDHACK, /* SQL identifier, but don't downcase */
OT_FILEPIPE, /* it's a filename or pipe */
OT_WHOLE_LINE, /* just snarf the rest of the line */
OT_NO_EVAL /* no expansion of backticks or variables */
};
extern PsqlScanState psql_scan_create(void);
extern void psql_scan_destroy(PsqlScanState state);
extern void psql_scan_setup(PsqlScanState state,
const char *line, int line_len);
extern void psql_scan_finish(PsqlScanState state);
extern PsqlScanResult psql_scan(PsqlScanState state,
PQExpBuffer query_buf,
promptStatus_t *prompt);
extern void psql_scan_reset(PsqlScanState state);
extern bool psql_scan_in_quote(PsqlScanState state);
extern char *psql_scan_slash_command(PsqlScanState state);
extern char *psql_scan_slash_option(PsqlScanState state,
enum slash_option_type type,
char *quote,
bool semicolon);
extern void psql_scan_slash_command_end(PsqlScanState state);
#endif /* PSQLSCAN_H */

1988
src/bin/csql/psqlscan.l Normal file

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff