Merge branch 'main' into sqlancer-test-gha

pull/6697/head
Gokhan Gulbiz 2023-04-04 16:00:28 +03:00 committed by GitHub
commit 201d976a3b
290 changed files with 24462 additions and 5360 deletions

View File

@ -6,7 +6,7 @@ orbs:
parameters:
image_suffix:
type: string
default: '-vc4b1573'
default: '-v087ecd7'
pg13_version:
type: string
default: '13.10'
@ -201,6 +201,9 @@ jobs:
- run:
name: 'Check if all GUCs are sorted alphabetically'
command: ci/check_gucs_are_alphabetically_sorted.sh
- run:
name: 'Check for missing downgrade scripts'
command: ci/check_migration_files.sh
check-sql-snapshots:
docker:
@ -266,6 +269,41 @@ jobs:
- coverage:
flags: 'test_<< parameters.old_pg_major >>_<< parameters.new_pg_major >>,upgrade'
test-pytest:
description: Runs pytest-based tests
parameters:
pg_major:
description: 'postgres major version'
type: integer
image:
description: 'docker image to use for the tests'
type: string
default: citus/failtester
image_tag:
description: 'docker image tag to use'
type: string
docker:
- image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>'
working_directory: /home/circleci/project
steps:
- checkout
- attach_workspace:
at: .
- install_extension:
pg_major: << parameters.pg_major >>
- configure
- enable_core
- run:
name: 'Run pytest'
command: |
gosu circleci \
make -C src/test/regress check-pytest
no_output_timeout: 2m
- stack_trace
- coverage:
flags: 'test_<< parameters.pg_major >>,pytest'
test-arbitrary-configs:
description: Runs tests on arbitrary configs
parallelism: 6
@ -452,6 +490,10 @@ jobs:
pg_major: << parameters.pg_major >>
- configure
- enable_core
- run:
name: 'Install DBI.pm'
command: |
apt-get update && apt-get install libdbi-perl && apt-get install libdbd-pg-perl
- run:
name: 'Run Test'
command: |
@ -551,7 +593,7 @@ jobs:
testForDebugging="<< parameters.test >>"
if [ -z "$testForDebugging" ]; then
detected_changes=$(git diff origin/main... --name-only --diff-filter=AM | (grep 'src/test/regress/sql/.*.sql\|src/test/regress/spec/.*.spec' || true))
detected_changes=$(git diff origin/main... --name-only --diff-filter=AM | (grep 'src/test/regress/sql/.*\.sql\|src/test/regress/spec/.*\.spec\|src/test/regress/citus_tests/test/test_.*\.py' || true))
tests=${detected_changes}
else
tests=$testForDebugging;
@ -854,38 +896,30 @@ workflows:
image: citus/failtester
make: check-failure
- tap-test-citus: &tap-test-citus-13
name: 'test-13_tap-recovery'
suite: recovery
- test-pytest:
name: 'test-13_pytest'
pg_major: 13
image_tag: '<< pipeline.parameters.pg13_version >>'
requires: [build-13]
- tap-test-citus:
<<: *tap-test-citus-13
name: 'test-13_tap-columnar-freezing'
suite: columnar_freezing
- tap-test-citus: &tap-test-citus-14
name: 'test-14_tap-recovery'
suite: recovery
- test-pytest:
name: 'test-14_pytest'
pg_major: 14
image_tag: '<< pipeline.parameters.pg14_version >>'
requires: [build-14]
- tap-test-citus:
<<: *tap-test-citus-14
name: 'test-14_tap-columnar-freezing'
suite: columnar_freezing
- tap-test-citus: &tap-test-citus-15
name: 'test-15_tap-recovery'
suite: recovery
- test-pytest:
name: 'test-15_pytest'
pg_major: 15
image_tag: '<< pipeline.parameters.pg15_version >>'
requires: [build-15]
- tap-test-citus:
<<: *tap-test-citus-15
name: 'test-15_tap-columnar-freezing'
suite: columnar_freezing
name: 'test-15_tap-cdc'
suite: cdc
pg_major: 15
image_tag: '<< pipeline.parameters.pg15_version >>'
requires: [build-15]
- test-arbitrary-configs:
name: 'test-13_check-arbitrary-configs'
@ -936,8 +970,6 @@ workflows:
- test-13_check-follower-cluster
- test-13_check-columnar
- test-13_check-columnar-isolation
- test-13_tap-recovery
- test-13_tap-columnar-freezing
- test-13_check-failure
- test-13_check-enterprise
- test-13_check-enterprise-isolation
@ -956,8 +988,6 @@ workflows:
- test-14_check-follower-cluster
- test-14_check-columnar
- test-14_check-columnar-isolation
- test-14_tap-recovery
- test-14_tap-columnar-freezing
- test-14_check-failure
- test-14_check-enterprise
- test-14_check-enterprise-isolation
@ -976,8 +1006,6 @@ workflows:
- test-15_check-follower-cluster
- test-15_check-columnar
- test-15_check-columnar-isolation
- test-15_tap-recovery
- test-15_tap-columnar-freezing
- test-15_check-failure
- test-15_check-enterprise
- test-15_check-enterprise-isolation

View File

@ -17,7 +17,7 @@ trim_trailing_whitespace = true
insert_final_newline = unset
trim_trailing_whitespace = unset
[*.{sql,sh,py}]
[*.{sql,sh,py,toml}]
indent_style = space
indent_size = 4
tab_width = 4

View File

@ -1,7 +1,6 @@
[flake8]
# E203 is ignored for black
# E402 is ignored because of the way we do relative imports
extend-ignore = E203, E402
extend-ignore = E203
# black will truncate to 88 characters usually, but long string literals it
# might keep. That's fine in most cases unless it gets really excessive.
max-line-length = 150

View File

@ -157,7 +157,6 @@ jobs:
apt-get update -y
## Install required packages to execute packaging tools for deb based distros
apt install python3-dev python3-pip -y
sudo apt-get purge -y python3-yaml
python3 -m pip install --upgrade pip setuptools==57.5.0
apt-get install python3-dev python3-pip -y
apt-get purge -y python3-yaml
./.github/packaging/validate_build_output.sh "deb"

View File

@ -283,6 +283,14 @@ actually run in CI. This is most commonly forgotten for newly added CI tests
that the developer only ran locally. It also checks that all CI scripts have a
section in this `README.md` file and that they include `ci/ci_helpers.sh`.
## `check_migration_files.sh`
A branch that touches a set of upgrade scripts is expected to touch the
corresponding downgrade scripts as well. For example, a branch that adds
src/backend/distributed/sql/citus--X--Y.sql is expected to also add
src/backend/distributed/sql/downgrades/citus--Y--X.sql. If this script fails,
read the output and make sure you update the downgrade scripts in the printed
list. If you really don't need a downgrade to run any SQL, you can write a
comment in the file explaining why a downgrade step is not necessary.
## `disallow_c_comments_in_migrations.sh`
We do not use C-style comments in migration files as the stripped

ci/check_migration_files.sh (new executable file, 33 lines)
View File

@ -0,0 +1,33 @@
#! /bin/bash
set -euo pipefail
# shellcheck disable=SC1091
source ci/ci_helpers.sh
# This file checks for the existence of downgrade scripts for every upgrade script that is changed in the branch.
# create list of migration files for upgrades
upgrade_files=$(git diff --name-only origin/main | { grep "src/backend/distributed/sql/citus--.*sql" || exit 0 ; })
downgrade_files=$(git diff --name-only origin/main | { grep "src/backend/distributed/sql/downgrades/citus--.*sql" || exit 0 ; })
ret_value=0
for file in $upgrade_files
do
# There should always be 2 matches, and no need to avoid splitting here
# shellcheck disable=SC2207
versions=($(grep --only-matching --extended-regexp "[0-9]+\.[0-9]+[-.][0-9]+" <<< "$file"))
from_version=${versions[0]};
to_version=${versions[1]};
downgrade_migration_file="src/backend/distributed/sql/downgrades/citus--$to_version--$from_version.sql"
# check for the existence of migration scripts
if [[ $(grep --line-regexp --count "$downgrade_migration_file" <<< "$downgrade_files") == 0 ]]
then
echo "$file is updated, but $downgrade_migration_file is not updated in branch"
ret_value=1
fi
done
exit $ret_value;

View File

@ -3,3 +3,35 @@ profile = 'black'
[tool.black]
include = '(src/test/regress/bin/diff-filter|\.pyi?|\.ipynb)$'
[tool.pytest.ini_options]
addopts = [
"--import-mode=importlib",
"--showlocals",
"--tb=short",
]
pythonpath = 'src/test/regress/citus_tests'
asyncio_mode = 'auto'
# Make test discovery quicker from the root dir of the repo
testpaths = ['src/test/regress/citus_tests/test']
# Make test discovery quicker from other directories than root directory
norecursedirs = [
'*.egg',
'.*',
'build',
'venv',
'ci',
'vendor',
'backend',
'bin',
'include',
'tmp_*',
'results',
'expected',
'sql',
'spec',
'data',
'__pycache__',
]

src/backend/columnar/.gitignore (vendored normal file, 3 lines)
View File

@ -0,0 +1,3 @@
# The directory used to store columnar sql files after pre-processing them
# with 'cpp' in build-time, see src/backend/columnar/Makefile.
/build/

View File

@ -10,14 +10,51 @@ OBJS += \
MODULE_big = citus_columnar
EXTENSION = citus_columnar
columnar_sql_files = $(patsubst $(citus_abs_srcdir)/%,%,$(wildcard $(citus_abs_srcdir)/sql/*.sql))
columnar_downgrade_sql_files = $(patsubst $(citus_abs_srcdir)/%,%,$(wildcard $(citus_abs_srcdir)/sql/downgrades/*.sql))
DATA = $(columnar_sql_files) \
$(columnar_downgrade_sql_files)
template_sql_files = $(patsubst $(citus_abs_srcdir)/%,%,$(wildcard $(citus_abs_srcdir)/sql/*.sql))
template_downgrade_sql_files = $(patsubst $(citus_abs_srcdir)/sql/downgrades/%,%,$(wildcard $(citus_abs_srcdir)/sql/downgrades/*.sql))
generated_sql_files = $(patsubst %,$(citus_abs_srcdir)/build/%,$(template_sql_files))
generated_downgrade_sql_files += $(patsubst %,$(citus_abs_srcdir)/build/sql/%,$(template_downgrade_sql_files))
DATA_built = $(generated_sql_files)
PG_CPPFLAGS += -I$(libpq_srcdir) -I$(safestringlib_srcdir)/include
include $(citus_top_builddir)/Makefile.global
.PHONY: install-all
SQL_DEPDIR=.deps/sql
SQL_BUILDDIR=build/sql
$(generated_sql_files): $(citus_abs_srcdir)/build/%: %
@mkdir -p $(citus_abs_srcdir)/$(SQL_DEPDIR) $(citus_abs_srcdir)/$(SQL_BUILDDIR)
@# -MF is used to store dependency files(.Po) in another directory for separation
@# -MT is used to change the target of the rule emitted by dependency generation.
@# -P is used to inhibit generation of linemarkers in the output from the preprocessor.
@# -undef is used to not predefine any system-specific or GCC-specific macros.
@# `man cpp` for further information
cd $(citus_abs_srcdir) && cpp -undef -w -P -MMD -MP -MF$(SQL_DEPDIR)/$(*F).Po -MT$@ $< > $@
$(generated_downgrade_sql_files): $(citus_abs_srcdir)/build/sql/%: sql/downgrades/%
@mkdir -p $(citus_abs_srcdir)/$(SQL_DEPDIR) $(citus_abs_srcdir)/$(SQL_BUILDDIR)
@# -MF is used to store dependency files(.Po) in another directory for separation
@# -MT is used to change the target of the rule emitted by dependency generation.
@# -P is used to inhibit generation of linemarkers in the output from the preprocessor.
@# -undef is used to not predefine any system-specific or GCC-specific macros.
@# `man cpp` for further information
cd $(citus_abs_srcdir) && cpp -undef -w -P -MMD -MP -MF$(SQL_DEPDIR)/$(*F).Po -MT$@ $< > $@
.PHONY: install install-downgrades install-all
cleanup-before-install:
rm -f $(DESTDIR)$(datadir)/$(datamoduledir)/citus_columnar.control
rm -f $(DESTDIR)$(datadir)/$(datamoduledir)/columnar--*
rm -f $(DESTDIR)$(datadir)/$(datamoduledir)/citus_columnar--*
install: cleanup-before-install
# install and install-downgrades should be run sequentially
install-all: install
$(MAKE) install-downgrades
install-downgrades: $(generated_downgrade_sql_files)
$(INSTALL_DATA) $(generated_downgrade_sql_files) '$(DESTDIR)$(datadir)/$(datamoduledir)/'

View File

@ -1 +1,19 @@
-- citus_columnar--11.1-1--11.2-1
#include "udfs/columnar_ensure_am_depends_catalog/11.2-1.sql"
DELETE FROM pg_depend
WHERE classid = 'pg_am'::regclass::oid
AND objid IN (select oid from pg_am where amname = 'columnar')
AND objsubid = 0
AND refclassid = 'pg_class'::regclass::oid
AND refobjid IN (
'columnar_internal.stripe_first_row_number_idx'::regclass::oid,
'columnar_internal.chunk_group_pkey'::regclass::oid,
'columnar_internal.chunk_pkey'::regclass::oid,
'columnar_internal.options_pkey'::regclass::oid,
'columnar_internal.stripe_first_row_number_idx'::regclass::oid,
'columnar_internal.stripe_pkey'::regclass::oid
)
AND refobjsubid = 0
AND deptype = 'n';
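For readers following the migration, an illustrative query (not part of the migration itself) shows which dependency edges remain for the columnar access method once the index and primary key edges above are deleted:

-- Illustrative only: list the pg_depend edges that still tie pg_class
-- objects to the columnar access method after this upgrade script runs.
SELECT refobjid::regclass AS referenced_object, deptype
FROM pg_depend
WHERE classid = 'pg_am'::regclass
AND objid = (SELECT oid FROM pg_am WHERE amname = 'columnar')
AND refclassid = 'pg_class'::regclass;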

View File

@ -1 +1,4 @@
-- citus_columnar--11.2-1--11.1-1
-- Note that we intentionally do not re-insert the pg_depend records that we
-- deleted via citus_columnar--11.1-1--11.2-1.sql.

View File

@ -0,0 +1,43 @@
CREATE OR REPLACE FUNCTION columnar_internal.columnar_ensure_am_depends_catalog()
RETURNS void
LANGUAGE plpgsql
SET search_path = pg_catalog
AS $func$
BEGIN
INSERT INTO pg_depend
WITH columnar_schema_members(relid) AS (
SELECT pg_class.oid AS relid FROM pg_class
WHERE relnamespace =
COALESCE(
(SELECT pg_namespace.oid FROM pg_namespace WHERE nspname = 'columnar_internal'),
(SELECT pg_namespace.oid FROM pg_namespace WHERE nspname = 'columnar')
)
AND relname IN ('chunk',
'chunk_group',
'options',
'storageid_seq',
'stripe')
)
SELECT -- Define a dependency edge from "columnar table access method" ..
'pg_am'::regclass::oid as classid,
(select oid from pg_am where amname = 'columnar') as objid,
0 as objsubid,
-- ... to some objects registered as regclass that live in the
-- "columnar" schema: the catalog tables and the sequences
-- created in the "columnar" schema.
--
-- Since users might have created their own objects in the columnar
-- schema, we explicitly specify the list of objects that we
-- are interested in.
'pg_class'::regclass::oid as refclassid,
columnar_schema_members.relid as refobjid,
0 as refobjsubid,
'n' as deptype
FROM columnar_schema_members
-- Avoid inserting duplicate entries into pg_depend.
EXCEPT TABLE pg_depend;
END;
$func$;
COMMENT ON FUNCTION columnar_internal.columnar_ensure_am_depends_catalog()
IS 'internal function responsible for creating dependencies from columnar '
'table access method to the rel objects in columnar schema';
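A minimal usage sketch: because the EXCEPT TABLE pg_depend clause filters out rows that already exist, the function is idempotent and safe to call repeatedly.

-- Illustrative only: the second call inserts nothing, since every edge it
-- would add is already present in pg_depend.
SELECT columnar_internal.columnar_ensure_am_depends_catalog();
SELECT columnar_internal.columnar_ensure_am_depends_catalog();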

View File

@ -1,4 +1,4 @@
CREATE OR REPLACE FUNCTION citus_internal.columnar_ensure_am_depends_catalog()
CREATE OR REPLACE FUNCTION columnar_internal.columnar_ensure_am_depends_catalog()
RETURNS void
LANGUAGE plpgsql
SET search_path = pg_catalog
@ -14,22 +14,17 @@ BEGIN
)
AND relname IN ('chunk',
'chunk_group',
'chunk_group_pkey',
'chunk_pkey',
'options',
'options_pkey',
'storageid_seq',
'stripe',
'stripe_first_row_number_idx',
'stripe_pkey')
'stripe')
)
SELECT -- Define a dependency edge from "columnar table access method" ..
'pg_am'::regclass::oid as classid,
(select oid from pg_am where amname = 'columnar') as objid,
0 as objsubid,
-- ... to each object that is registered to pg_class and that lives
-- in "columnar" schema. That contains catalog tables, indexes
-- created on them and the sequences created in "columnar" schema.
-- ... to some objects registered as regclass that live in the
-- "columnar" schema: the catalog tables and the sequences
-- created in the "columnar" schema.
--
-- Since users might have created their own objects in the columnar
-- schema, we explicitly specify the list of objects that we
@ -43,6 +38,6 @@ BEGIN
EXCEPT TABLE pg_depend;
END;
$func$;
COMMENT ON FUNCTION citus_internal.columnar_ensure_am_depends_catalog()
COMMENT ON FUNCTION columnar_internal.columnar_ensure_am_depends_catalog()
IS 'internal function responsible for creating dependencies from columnar '
'table access method to the rel objects in columnar schema';

View File

@ -32,7 +32,13 @@ OBJS += \
$(patsubst $(citus_abs_srcdir)/%.c,%.o,$(foreach dir,$(SUBDIRS), $(sort $(wildcard $(citus_abs_srcdir)/$(dir)/*.c))))
# be explicit about the default target
all:
.PHONY: cdc
all: cdc
cdc:
echo "running cdc make"
$(MAKE) DECODER=pgoutput -C cdc all
NO_PGXS = 1
@ -81,11 +87,19 @@ endif
.PHONY: clean-full install install-downgrades install-all
clean: clean-cdc
clean-cdc:
$(MAKE) DECODER=pgoutput -C cdc clean
cleanup-before-install:
rm -f $(DESTDIR)$(datadir)/$(datamoduledir)/citus.control
rm -f $(DESTDIR)$(datadir)/$(datamoduledir)/citus--*
install: cleanup-before-install
install: cleanup-before-install install-cdc
install-cdc:
$(MAKE) DECODER=pgoutput -C cdc install
# install and install-downgrades should be run sequentially
install-all: install
@ -96,4 +110,5 @@ install-downgrades: $(generated_downgrade_sql_files)
clean-full:
$(MAKE) clean
$(MAKE) -C cdc clean-full
rm -rf $(safestringlib_builddir)

View File

@ -0,0 +1,26 @@
ifndef DECODER
DECODER = pgoutput
endif
MODULE_big = citus_$(DECODER)
citus_subdir = src/backend/distributed/cdc
citus_top_builddir = ../../../..
citus_decoders_dir = $(DESTDIR)$(pkglibdir)/citus_decoders
OBJS += cdc_decoder.o cdc_decoder_utils.o
include $(citus_top_builddir)/Makefile.global
override CFLAGS += -DDECODER=\"$(DECODER)\" -I$(citus_abs_top_srcdir)/include
override CPPFLAGS += -DDECODER=\"$(DECODER)\" -I$(citus_abs_top_srcdir)/include
install: install-cdc
clean: clean-cdc
install-cdc:
mkdir -p '$(citus_decoders_dir)'
$(INSTALL_SHLIB) citus_$(DECODER).so '$(citus_decoders_dir)/$(DECODER).so'
clean-cdc:
rm -f '$(DESTDIR)$(datadir)/$(datamoduledir)/citus_decoders/$(DECODER).so'

View File

@ -0,0 +1,500 @@
/*-------------------------------------------------------------------------
*
* cdc_decoder.c
* CDC Decoder plugin for Citus
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "cdc_decoder_utils.h"
#include "postgres.h"
#include "fmgr.h"
#include "access/genam.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_publication.h"
#include "commands/extension.h"
#include "common/hashfn.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/typcache.h"
PG_MODULE_MAGIC;
extern void _PG_output_plugin_init(OutputPluginCallbacks *cb);
static LogicalDecodeChangeCB outputPluginChangeCB;
static void InitShardToDistributedTableMap(void);
static void PublishDistributedTableChanges(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn,
Relation relation,
ReorderBufferChange *change);
static bool replication_origin_filter_cb(LogicalDecodingContext *ctx, RepOriginId
origin_id);
static void TranslateChangesIfSchemaChanged(Relation relation, Relation targetRelation,
ReorderBufferChange *change);
static void TranslateAndPublishRelationForCDC(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn,
Relation relation,
ReorderBufferChange *change, Oid shardId,
Oid targetRelationid);
typedef struct
{
uint64 shardId;
Oid distributedTableId;
bool isReferenceTable;
bool isNull;
} ShardIdHashEntry;
static HTAB *shardToDistributedTableMap = NULL;
static void cdc_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change);
/* build time macro for base decoder plugin name for CDC and Shard Split. */
#ifndef DECODER
#define DECODER "pgoutput"
#endif
#define DECODER_INIT_FUNCTION_NAME "_PG_output_plugin_init"
#define CITUS_SHARD_TRANSFER_SLOT_PREFIX "citus_shard_"
#define CITUS_SHARD_TRANSFER_SLOT_PREFIX_SIZE (sizeof(CITUS_SHARD_TRANSFER_SLOT_PREFIX) - \
1)
/*
* Postgres uses 'pgoutput' as default plugin for logical replication.
* We want to reuse Postgres pgoutput's functionality as much as possible.
* Hence we load all the functions of this plugin and override as required.
*/
void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
elog(LOG, "Initializing CDC decoder");
/*
* We build custom .so files whose name matches common decoders (pgoutput, wal2json)
* and place them in $libdir/citus_decoders/ such that administrators can configure
* dynamic_library_path to include this directory, and users can then use the
* regular decoder names when creating replications slots.
*
* To load the original decoder, we need to remove citus_decoders/ from the
* dynamic_library_path.
*/
char *originalDLP = Dynamic_library_path;
Dynamic_library_path = RemoveCitusDecodersFromPaths(Dynamic_library_path);
LogicalOutputPluginInit plugin_init =
(LogicalOutputPluginInit) (void *)
load_external_function(DECODER,
DECODER_INIT_FUNCTION_NAME,
false, NULL);
if (plugin_init == NULL)
{
elog(ERROR, "output plugins have to declare the _PG_output_plugin_init symbol");
}
/* in case this session is used for different replication slots */
Dynamic_library_path = originalDLP;
/* ask the output plugin to fill the callback struct */
plugin_init(cb);
/* Initialize the Shard Id to Distributed Table id mapping hash table.*/
InitShardToDistributedTableMap();
/* actual pgoutput callback function will be called */
outputPluginChangeCB = cb->change_cb;
cb->change_cb = cdc_change_cb;
cb->filter_by_origin_cb = replication_origin_filter_cb;
}
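Assuming dynamic_library_path has been configured to include $libdir/citus_decoders as the comment above describes, a CDC client creates a slot with the stock decoder name; the slot name below is hypothetical:

-- Hypothetical usage: 'pgoutput' resolves to the Citus wrapper in
-- citus_decoders, which loads and forwards to the real pgoutput plugin.
SELECT pg_create_logical_replication_slot('cdc_demo_slot', 'pgoutput');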
/*
* Check if the replication slot is for Shard transfer by checking for prefix.
*/
inline static
bool
IsShardTransferSlot(char *replicationSlotName)
{
return strncmp(replicationSlotName, CITUS_SHARD_TRANSFER_SLOT_PREFIX,
CITUS_SHARD_TRANSFER_SLOT_PREFIX_SIZE) == 0;
}
/*
* cdc_change_cb is the change callback of the CDC decoder; it translates
* incoming tuple changes on shard relations into changes on the distributed table.
*/
static void
cdc_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change)
{
/*
* If Citus has not been loaded yet, pass the changes
* through to the underlying decoder plugin.
*/
if (!CdcCitusHasBeenLoaded())
{
outputPluginChangeCB(ctx, txn, relation, change);
return;
}
/* check if the relation is publishable.*/
if (!is_publishable_relation(relation))
{
return;
}
char *replicationSlotName = ctx->slot->data.name.data;
if (replicationSlotName == NULL)
{
elog(ERROR, "Replication slot name is NULL!");
return;
}
/* If the slot is for internal shard operations, call the base plugin's call back. */
if (IsShardTransferSlot(replicationSlotName))
{
outputPluginChangeCB(ctx, txn, relation, change);
return;
}
/* Translate the changes from the shard to the distributed table and publish. */
PublishDistributedTableChanges(ctx, txn, relation, change);
}
/*
* InitShardToDistributedTableMap initializes the hash table that is used to
* translate the changes in the shard table to the changes in the distributed table.
*/
static void
InitShardToDistributedTableMap()
{
HASHCTL info;
memset(&info, 0, sizeof(info));
info.keysize = sizeof(uint64);
info.entrysize = sizeof(ShardIdHashEntry);
info.hash = tag_hash;
info.hcxt = CurrentMemoryContext;
int hashFlags = (HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION);
shardToDistributedTableMap = hash_create("CDC Decoder translation hash table", 1024,
&info, hashFlags);
}
/*
* AddShardIdToHashTable adds the shardId to the hash table.
*/
static Oid
AddShardIdToHashTable(uint64 shardId, ShardIdHashEntry *entry)
{
entry->shardId = shardId;
entry->distributedTableId = CdcLookupShardRelationFromCatalog(shardId, true);
entry->isReferenceTable = CdcPartitionMethodViaCatalog(entry->distributedTableId) ==
'n';
return entry->distributedTableId;
}
static Oid
LookupDistributedTableIdForShardId(uint64 shardId, bool *isReferenceTable)
{
bool found;
Oid distributedTableId = InvalidOid;
ShardIdHashEntry *entry = (ShardIdHashEntry *) hash_search(shardToDistributedTableMap,
&shardId,
HASH_ENTER,
&found);
if (found)
{
distributedTableId = entry->distributedTableId;
}
else
{
distributedTableId = AddShardIdToHashTable(shardId, entry);
}
*isReferenceTable = entry->isReferenceTable;
return distributedTableId;
}
/*
* replication_origin_filter_cb is a callback function that filters out publication
* of changes that originated from any node other than the current node. This is
* identified by the "origin_id" of the changes. The origin_id is set to
* a non-zero value in the origin node as part of WAL replication for internal
* operations like shard split/moves/create_distributed_table etc.
*/
static bool
replication_origin_filter_cb(LogicalDecodingContext *ctx, RepOriginId origin_id)
{
return (origin_id != InvalidRepOriginId);
}
/*
* This function is responsible for translating the changes in the shard table to
* the changes in the shell table and publishing the changes as a change to the
* distributed table so that CDC clients are not aware of the shard tables. It also
* handles schema changes to the distributed table.
*/
static void
TranslateAndPublishRelationForCDC(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change, Oid
shardId, Oid targetRelationid)
{
/* Get the distributed table's relation for this shard.*/
Relation targetRelation = RelationIdGetRelation(targetRelationid);
/*
* Check if there has been a schema change (such as a dropped column), by comparing
* the number of attributes in the shard table and the shell table.
*/
TranslateChangesIfSchemaChanged(relation, targetRelation, change);
/*
* Publish the change to the shard table as the change in the distributed table,
* so that the CDC client can see the change in the distributed table,
* instead of the shard table, by calling the pgoutput's callback function.
*/
outputPluginChangeCB(ctx, txn, targetRelation, change);
RelationClose(targetRelation);
}
/*
* PublishDistributedTableChanges checks if the relation is a shard of a
* distributed table. If so, it publishes the change as a change to the
* distributed table instead of the shard; otherwise it passes the change
* through to the base plugin. It also skips the Citus metadata tables.
*/
static void
PublishDistributedTableChanges(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change)
{
char *shardRelationName = RelationGetRelationName(relation);
/* Skip publishing CDC changes for any system relations in pg_catalog*/
if (relation->rd_rel->relnamespace == PG_CATALOG_NAMESPACE)
{
return;
}
/* Check if the relation is a distributed table by checking for shard name. */
uint64 shardId = CdcExtractShardIdFromTableName(shardRelationName, true);
/* If this relation is not distributed, call the pgoutput's callback and return. */
if (shardId == INVALID_SHARD_ID)
{
outputPluginChangeCB(ctx, txn, relation, change);
return;
}
bool isReferenceTable = false;
Oid distRelationId = LookupDistributedTableIdForShardId(shardId, &isReferenceTable);
if (distRelationId == InvalidOid)
{
outputPluginChangeCB(ctx, txn, relation, change);
return;
}
/* Publish changes for reference table only from the coordinator node. */
if (isReferenceTable && !CdcIsCoordinator())
{
return;
}
/* translate and publish from shard relation to distributed table relation for CDC. */
TranslateAndPublishRelationForCDC(ctx, txn, relation, change, shardId,
distRelationId);
}
/*
* GetTupleForTargetSchemaForCdc returns a heap tuple with the data from sourceRelationTuple
* to match the schema in targetRelDesc. Either or both source and target relations may have
* dropped columns. This function handles it by adding NULL values for dropped columns in
* target relation and skipping dropped columns in source relation. It returns a heap tuple
* adjusted to the current schema of the target relation.
*/
static HeapTuple
GetTupleForTargetSchemaForCdc(HeapTuple sourceRelationTuple,
TupleDesc sourceRelDesc,
TupleDesc targetRelDesc)
{
/* Allocate memory for sourceValues and sourceNulls arrays. */
Datum *sourceValues = (Datum *) palloc0(sourceRelDesc->natts * sizeof(Datum));
bool *sourceNulls = (bool *) palloc0(sourceRelDesc->natts * sizeof(bool));
/* Deform the source tuple to sourceValues and sourceNulls arrays. */
heap_deform_tuple(sourceRelationTuple, sourceRelDesc, sourceValues,
sourceNulls);
/* This is the next field to read in the source relation */
uint32 sourceIndex = 0;
uint32 targetIndex = 0;
/* Allocate memory for sourceValues and sourceNulls arrays. */
Datum *targetValues = (Datum *) palloc0(targetRelDesc->natts * sizeof(Datum));
bool *targetNulls = (bool *) palloc0(targetRelDesc->natts * sizeof(bool));
/* Loop through all source and target attributes one by one and handle any dropped attributes.*/
while (targetIndex < targetRelDesc->natts)
{
/* If this target attribute has been dropped, add a NULL attribute in targetValues and continue.*/
if (TupleDescAttr(targetRelDesc, targetIndex)->attisdropped)
{
Datum nullDatum = (Datum) 0;
targetValues[targetIndex] = nullDatum;
targetNulls[targetIndex] = true;
targetIndex++;
}
/* If this source attribute has been dropped, just skip this source attribute.*/
else if (TupleDescAttr(sourceRelDesc, sourceIndex)->attisdropped)
{
sourceIndex++;
continue;
}
/* If both source and target attributes are not dropped, add the attribute field to targetValues. */
else if (sourceIndex < sourceRelDesc->natts)
{
targetValues[targetIndex] = sourceValues[sourceIndex];
targetNulls[targetIndex] = sourceNulls[sourceIndex];
sourceIndex++;
targetIndex++;
}
else
{
/* If there are no more source fields, add a NULL field in targetValues. */
Datum nullDatum = (Datum) 0;
targetValues[targetIndex] = nullDatum;
targetNulls[targetIndex] = true;
targetIndex++;
}
}
/* Form a new tuple from the target values created by the above loop. */
HeapTuple targetRelationTuple = heap_form_tuple(targetRelDesc, targetValues,
targetNulls);
return targetRelationTuple;
}
/* HasSchemaChanged returns true if there are any schema changes between the source and target relations. */
static bool
HasSchemaChanged(TupleDesc sourceRelationDesc, TupleDesc targetRelationDesc)
{
bool hasSchemaChanged = (sourceRelationDesc->natts != targetRelationDesc->natts);
if (hasSchemaChanged)
{
return true;
}
for (uint32 i = 0; i < sourceRelationDesc->natts; i++)
{
if (TupleDescAttr(sourceRelationDesc, i)->attisdropped ||
TupleDescAttr(targetRelationDesc, i)->attisdropped)
{
hasSchemaChanged = true;
break;
}
}
return hasSchemaChanged;
}
/*
* TranslateChangesIfSchemaChanged translates the tuples ReorderBufferChange
* if there is a schema change between source and target relations.
*/
static void
TranslateChangesIfSchemaChanged(Relation sourceRelation, Relation targetRelation,
ReorderBufferChange *change)
{
TupleDesc sourceRelationDesc = RelationGetDescr(sourceRelation);
TupleDesc targetRelationDesc = RelationGetDescr(targetRelation);
/* if there are no changes between source and target relations, return. */
if (!HasSchemaChanged(sourceRelationDesc, targetRelationDesc))
{
return;
}
/* Check the ReorderBufferChange's action type and handle them accordingly.*/
switch (change->action)
{
case REORDER_BUFFER_CHANGE_INSERT:
{
/* For insert action, only the new tuple needs to be translated. */
HeapTuple sourceRelationNewTuple = &(change->data.tp.newtuple->tuple);
HeapTuple targetRelationNewTuple = GetTupleForTargetSchemaForCdc(
sourceRelationNewTuple, sourceRelationDesc, targetRelationDesc);
change->data.tp.newtuple->tuple = *targetRelationNewTuple;
break;
}
/*
* For update changes both old and new tuples need to be translated for target relation
* if the REPLICA IDENTITY is set to FULL. Otherwise, only the new tuple needs to be
* translated for target relation.
*/
case REORDER_BUFFER_CHANGE_UPDATE:
{
/* For update action, the new tuple should always be translated. */
/* Get the new tuple from the ReorderBufferChange, and translate it to target relation. */
HeapTuple sourceRelationNewTuple = &(change->data.tp.newtuple->tuple);
HeapTuple targetRelationNewTuple = GetTupleForTargetSchemaForCdc(
sourceRelationNewTuple, sourceRelationDesc, targetRelationDesc);
change->data.tp.newtuple->tuple = *targetRelationNewTuple;
/*
* Format oldtuple according to the target relation. If the column values of replica
* identity change, then the old tuple is non-null and needs to be formatted according
* to the target relation schema.
*/
if (change->data.tp.oldtuple != NULL)
{
HeapTuple sourceRelationOldTuple = &(change->data.tp.oldtuple->tuple);
HeapTuple targetRelationOldTuple = GetTupleForTargetSchemaForCdc(
sourceRelationOldTuple,
sourceRelationDesc,
targetRelationDesc);
change->data.tp.oldtuple->tuple = *targetRelationOldTuple;
}
break;
}
case REORDER_BUFFER_CHANGE_DELETE:
{
/* For delete action, only the old tuple needs to be translated. */
HeapTuple sourceRelationOldTuple = &(change->data.tp.oldtuple->tuple);
HeapTuple targetRelationOldTuple = GetTupleForTargetSchemaForCdc(
sourceRelationOldTuple,
sourceRelationDesc,
targetRelationDesc);
change->data.tp.oldtuple->tuple = *targetRelationOldTuple;
break;
}
default:
{
/* Do nothing for other action types. */
break;
}
}
}

View File

@ -0,0 +1,432 @@
/*-------------------------------------------------------------------------
*
* cdc_decoder_utils.c
* CDC Decoder plugin utility functions for Citus
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "commands/extension.h"
#include "fmgr.h"
#include "miscadmin.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "common/hashfn.h"
#include "common/string.h"
#include "utils/fmgroids.h"
#include "utils/typcache.h"
#include "utils/lsyscache.h"
#include "catalog/pg_namespace.h"
#include "cdc_decoder_utils.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/relay_utility.h"
static int32 LocalGroupId = -1;
static Oid PgDistLocalGroupRelationId = InvalidOid;
static Oid PgDistShardRelationId = InvalidOid;
static Oid PgDistShardShardidIndexId = InvalidOid;
static Oid PgDistPartitionRelationId = InvalidOid;
static Oid PgDistPartitionLogicalrelidIndexId = InvalidOid;
static bool IsCitusExtensionLoaded = false;
#define COORDINATOR_GROUP_ID 0
#define InvalidRepOriginId 0
#define Anum_pg_dist_local_groupid 1
#define GROUP_ID_UPGRADING -2
static Oid DistLocalGroupIdRelationId(void);
static int32 CdcGetLocalGroupId(void);
static HeapTuple CdcPgDistPartitionTupleViaCatalog(Oid relationId);
/*
* DistLocalGroupIdRelationId returns the relation id of the pg_dist_local_group
*/
static Oid
DistLocalGroupIdRelationId(void)
{
if (PgDistLocalGroupRelationId == InvalidOid)
{
PgDistLocalGroupRelationId = get_relname_relid("pg_dist_local_group",
PG_CATALOG_NAMESPACE);
}
return PgDistLocalGroupRelationId;
}
/*
* DistShardRelationId returns the relation id of the pg_dist_shard
*/
static Oid
DistShardRelationId(void)
{
if (PgDistShardRelationId == InvalidOid)
{
PgDistShardRelationId = get_relname_relid("pg_dist_shard", PG_CATALOG_NAMESPACE);
}
return PgDistShardRelationId;
}
/*
* DistShardShardidIndexId returns the relation id of the pg_dist_shard_shardid_index
*/
static Oid
DistShardShardidIndexId(void)
{
if (PgDistShardShardidIndexId == InvalidOid)
{
PgDistShardShardidIndexId = get_relname_relid("pg_dist_shard_shardid_index",
PG_CATALOG_NAMESPACE);
}
return PgDistShardShardidIndexId;
}
/*
* DistPartitionRelationId returns the relation id of the pg_dist_partition
*/
static Oid
DistPartitionRelationId(void)
{
if (PgDistPartitionRelationId == InvalidOid)
{
PgDistPartitionRelationId = get_relname_relid("pg_dist_partition",
PG_CATALOG_NAMESPACE);
}
return PgDistPartitionRelationId;
}
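/*
* DistPartitionLogicalRelidIndexId returns the relation id of
* pg_dist_partition_logicalrelid_index.
*/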
static Oid
DistPartitionLogicalRelidIndexId(void)
{
if (PgDistPartitionLogicalrelidIndexId == InvalidOid)
{
PgDistPartitionLogicalrelidIndexId = get_relname_relid(
"pg_dist_partition_logicalrelid_index", PG_CATALOG_NAMESPACE);
}
return PgDistPartitionLogicalrelidIndexId;
}
/*
* CdcIsCoordinator function returns true if this node is identified as the
* schema/coordinator/master node of the cluster.
*/
bool
CdcIsCoordinator(void)
{
return (CdcGetLocalGroupId() == COORDINATOR_GROUP_ID);
}
/*
* CdcCitusHasBeenLoaded function returns true if the citus extension has been loaded.
*/
bool
CdcCitusHasBeenLoaded()
{
if (!IsCitusExtensionLoaded)
{
IsCitusExtensionLoaded = (get_extension_oid("citus", true) != InvalidOid);
}
return IsCitusExtensionLoaded;
}
/*
* CdcExtractShardIdFromTableName tries to extract the shard id from the given
* table name, and returns the shard id if the table name is formatted as a
* shard name. Otherwise, the function returns INVALID_SHARD_ID.
*/
uint64
CdcExtractShardIdFromTableName(const char *tableName, bool missingOk)
{
char *shardIdStringEnd = NULL;
/* find the last underscore and increment for shardId string */
char *shardIdString = strrchr(tableName, SHARD_NAME_SEPARATOR);
if (shardIdString == NULL && !missingOk)
{
ereport(ERROR, (errmsg("could not extract shardId from table name \"%s\"",
tableName)));
}
else if (shardIdString == NULL && missingOk)
{
return INVALID_SHARD_ID;
}
shardIdString++;
errno = 0;
uint64 shardId = strtoull(shardIdString, &shardIdStringEnd, 0);
if (errno != 0 || (*shardIdStringEnd != '\0'))
{
if (!missingOk)
{
ereport(ERROR, (errmsg("could not extract shardId from table name \"%s\"",
tableName)));
}
else
{
return INVALID_SHARD_ID;
}
}
return shardId;
}
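For intuition, shard relations append a shard id suffix to the distributed table's name, which is the suffix this function parses; the table and shard names below are hypothetical:

-- Hypothetical: a distributed table 'events' has shard relations such as
-- 'events_102008'; pg_dist_shard maps the extracted shard id back to the
-- distributed table, which CdcLookupShardRelationFromCatalog relies on.
SELECT shardid, logicalrelid::regclass AS distributed_table
FROM pg_dist_shard
WHERE logicalrelid = 'events'::regclass;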
/*
* CdcGetLocalGroupId returns the group identifier of the local node, caching it
* after the first successful read. If pg_dist_local_group is temporarily empty,
* as happens during a PostgreSQL upgrade, it returns GROUP_ID_UPGRADING.
*/
static int32
CdcGetLocalGroupId(void)
{
ScanKeyData scanKey[1];
int scanKeyCount = 0;
int32 groupId = 0;
/*
* Already set the group id, no need to read the heap again.
*/
if (LocalGroupId != -1)
{
return LocalGroupId;
}
Oid localGroupTableOid = DistLocalGroupIdRelationId();
if (localGroupTableOid == InvalidOid)
{
return 0;
}
Relation pgDistLocalGroupId = table_open(localGroupTableOid, AccessShareLock);
SysScanDesc scanDescriptor = systable_beginscan(pgDistLocalGroupId,
InvalidOid, false,
NULL, scanKeyCount, scanKey);
TupleDesc tupleDescriptor = RelationGetDescr(pgDistLocalGroupId);
HeapTuple heapTuple = systable_getnext(scanDescriptor);
if (HeapTupleIsValid(heapTuple))
{
bool isNull = false;
Datum groupIdDatum = heap_getattr(heapTuple,
Anum_pg_dist_local_groupid,
tupleDescriptor, &isNull);
groupId = DatumGetInt32(groupIdDatum);
/* set the local cache variable */
LocalGroupId = groupId;
}
else
{
/*
* Upgrade is happening. When upgrading postgres, pg_dist_local_group is
* temporarily empty before citus_finish_pg_upgrade() finishes execution.
*/
groupId = GROUP_ID_UPGRADING;
}
systable_endscan(scanDescriptor);
table_close(pgDistLocalGroupId, AccessShareLock);
return groupId;
}
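The value read here is the same one exposed by the catalog table, which can be checked directly:

-- Illustrative only: groupid is 0 (COORDINATOR_GROUP_ID) on the
-- coordinator and a positive group id on worker nodes.
SELECT groupid FROM pg_dist_local_group;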
/*
* CdcLookupShardRelationFromCatalog returns the logical relation oid a shard belongs to.
*
* Errors out if the shardId does not exist and missingOk is false.
* Returns InvalidOid if the shardId does not exist and missingOk is true.
*/
Oid
CdcLookupShardRelationFromCatalog(int64 shardId, bool missingOk)
{
ScanKeyData scanKey[1];
int scanKeyCount = 1;
Form_pg_dist_shard shardForm = NULL;
Relation pgDistShard = table_open(DistShardRelationId(), AccessShareLock);
Oid relationId = InvalidOid;
ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));
SysScanDesc scanDescriptor = systable_beginscan(pgDistShard,
DistShardShardidIndexId(), true,
NULL, scanKeyCount, scanKey);
HeapTuple heapTuple = systable_getnext(scanDescriptor);
if (!HeapTupleIsValid(heapTuple) && !missingOk)
{
ereport(ERROR, (errmsg("could not find valid entry for shard "
UINT64_FORMAT, shardId)));
}
if (!HeapTupleIsValid(heapTuple))
{
relationId = InvalidOid;
}
else
{
shardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
relationId = shardForm->logicalrelid;
}
systable_endscan(scanDescriptor);
table_close(pgDistShard, NoLock);
return relationId;
}
/*
* CdcPgDistPartitionTupleViaCatalog is a helper function that searches
* pg_dist_partition for the given relationId. The caller is responsible
* for ensuring that the returned heap tuple is valid before accessing
* its fields.
*/
static HeapTuple
CdcPgDistPartitionTupleViaCatalog(Oid relationId)
{
const int scanKeyCount = 1;
ScanKeyData scanKey[1];
bool indexOK = true;
Relation pgDistPartition = table_open(DistPartitionRelationId(), AccessShareLock);
ScanKeyInit(&scanKey[0], Anum_pg_dist_partition_logicalrelid,
BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relationId));
SysScanDesc scanDescriptor = systable_beginscan(pgDistPartition,
DistPartitionLogicalRelidIndexId(),
indexOK, NULL, scanKeyCount, scanKey);
HeapTuple partitionTuple = systable_getnext(scanDescriptor);
if (HeapTupleIsValid(partitionTuple))
{
/* callers should have the tuple in their memory contexts */
partitionTuple = heap_copytuple(partitionTuple);
}
systable_endscan(scanDescriptor);
table_close(pgDistPartition, AccessShareLock);
return partitionTuple;
}
/*
* CdcPartitionMethodViaCatalog gets a relationId and returns the partition
* method column from pg_dist_partition via reading from catalog.
*/
char
CdcPartitionMethodViaCatalog(Oid relationId)
{
HeapTuple partitionTuple = CdcPgDistPartitionTupleViaCatalog(relationId);
if (!HeapTupleIsValid(partitionTuple))
{
return DISTRIBUTE_BY_INVALID;
}
Datum datumArray[Natts_pg_dist_partition];
bool isNullArray[Natts_pg_dist_partition];
Relation pgDistPartition = table_open(DistPartitionRelationId(), AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(pgDistPartition);
heap_deform_tuple(partitionTuple, tupleDescriptor, datumArray, isNullArray);
if (isNullArray[Anum_pg_dist_partition_partmethod - 1])
{
/* partition method cannot be NULL, still let's make sure */
heap_freetuple(partitionTuple);
table_close(pgDistPartition, NoLock);
return DISTRIBUTE_BY_INVALID;
}
Datum partitionMethodDatum = datumArray[Anum_pg_dist_partition_partmethod - 1];
char partitionMethodChar = DatumGetChar(partitionMethodDatum);
heap_freetuple(partitionTuple);
table_close(pgDistPartition, NoLock);
return partitionMethodChar;
}
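The partmethod character inspected here can also be checked from SQL; a value of 'n' (none) is what the decoder treats as a reference table:

-- Illustrative only: 'h' marks a hash-distributed table, 'n' a reference
-- table.
SELECT logicalrelid::regclass, partmethod FROM pg_dist_partition;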
/*
* RemoveCitusDecodersFromPaths removes a path ending in citus_decoders
* from the given input paths.
*/
char *
RemoveCitusDecodersFromPaths(char *paths)
{
if (strlen(paths) == 0)
{
/* dynamic_library_path is empty */
return paths;
}
StringInfo newPaths = makeStringInfo();
char *remainingPaths = paths;
for (;;)
{
int pathLength = 0;
char *pathStart = first_path_var_separator(remainingPaths);
if (pathStart == remainingPaths)
{
/*
* This will error out in find_in_dynamic_libpath, return
* original value here.
*/
return paths;
}
else if (pathStart == NULL)
{
/* final path */
pathLength = strlen(remainingPaths);
}
else
{
/* more paths remaining */
pathLength = pathStart - remainingPaths;
}
char *currentPath = palloc(pathLength + 1);
strlcpy(currentPath, remainingPaths, pathLength + 1);
canonicalize_path(currentPath);
if (!pg_str_endswith(currentPath, "/citus_decoders"))
{
appendStringInfo(newPaths, "%s%s", newPaths->len > 0 ? ":" : "", currentPath);
}
if (remainingPaths[pathLength] == '\0')
{
/* end of string */
break;
}
remainingPaths += pathLength + 1;
}
return newPaths->data;
}
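Illustratively, the stripping matters under a configuration like the following (a hypothetical but typical CDC setup), where citus_decoders is listed first so regular slot creation picks up the wrappers:

-- Hypothetical configuration: the wrapper removes the citus_decoders entry
-- from this path while loading the real plugin, to avoid loading itself.
ALTER SYSTEM SET dynamic_library_path = '$libdir/citus_decoders:$libdir';
SELECT pg_reload_conf();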

View File

@ -0,0 +1,34 @@
/*-------------------------------------------------------------------------
*
* cdc_decoder_utils.h
* Utility functions and declarations for the CDC decoder.
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#ifndef CITUS_CDC_DECODER_H
#define CITUS_CDC_DECODER_H
#include "postgres.h"
#include "fmgr.h"
#include "replication/logical.h"
#include "c.h"
#define InvalidRepOriginId 0
#define INVALID_SHARD_ID 0
bool CdcIsCoordinator(void);
uint64 CdcExtractShardIdFromTableName(const char *tableName, bool missingOk);
Oid CdcLookupShardRelationFromCatalog(int64 shardId, bool missingOk);
char CdcPartitionMethodViaCatalog(Oid relationId);
bool CdcCitusHasBeenLoaded(void);
char * RemoveCitusDecodersFromPaths(char *paths);
#endif /* CITUS_CDC_DECODER_H */

View File

@ -55,6 +55,7 @@
#include "distributed/multi_partitioning_utils.h"
#include "distributed/reference_table_utils.h"
#include "distributed/relation_access_tracking.h"
#include "distributed/replication_origin_session_utils.h"
#include "distributed/shared_library_init.h"
#include "distributed/shard_utils.h"
#include "distributed/worker_protocol.h"
@ -183,6 +184,7 @@ static TableConversionReturn * AlterDistributedTable(TableConversionParameters *
static TableConversionReturn * AlterTableSetAccessMethod(
TableConversionParameters *params);
static TableConversionReturn * ConvertTable(TableConversionState *con);
static TableConversionReturn * ConvertTableInternal(TableConversionState *con);
static bool SwitchToSequentialAndLocalExecutionIfShardNameTooLong(char *relationName,
char *longestShardName);
static void DropIndexesNotSupportedByColumnar(Oid relationId,
@ -215,7 +217,10 @@ static bool WillRecreateForeignKeyToReferenceTable(Oid relationId,
CascadeToColocatedOption cascadeOption);
static void WarningsForDroppingForeignKeysWithDistributedTables(Oid relationId);
static void ErrorIfUnsupportedCascadeObjects(Oid relationId);
static List * WrapTableDDLCommands(List *commandStrings);
static bool DoesCascadeDropUnsupportedObject(Oid classId, Oid id, HTAB *nodeMap);
static TableConversionReturn * CopyTableConversionReturnIntoCurrentContext(
TableConversionReturn *tableConversionReturn);
PG_FUNCTION_INFO_V1(undistribute_table);
PG_FUNCTION_INFO_V1(alter_distributed_table);
@ -402,7 +407,11 @@ UndistributeTable(TableConversionParameters *params)
params->conversionType = UNDISTRIBUTE_TABLE;
params->shardCountIsNull = true;
TableConversionState *con = CreateTableConversion(params);
return ConvertTable(con);
SetupReplicationOriginLocalSession();
TableConversionReturn *conv = ConvertTable(con);
ResetReplicationOriginLocalSession();
return conv;
}
@ -441,6 +450,7 @@ AlterDistributedTable(TableConversionParameters *params)
ereport(DEBUG1, (errmsg("setting multi shard modify mode to sequential")));
SetLocalMultiShardModifyModeToSequential();
}
return ConvertTable(con);
}
@ -511,9 +521,9 @@ AlterTableSetAccessMethod(TableConversionParameters *params)
/*
* ConvertTable is used for converting a table into a new table with different properties.
* The conversion is done by creating a new table, moving everything to the new table and
* dropping the old one. So the oid of the table is not preserved.
* ConvertTableInternal is used for converting a table into a new table with different
* properties. The conversion is done by creating a new table, moving everything to the
* new table and dropping the old one. So the oid of the table is not preserved.
*
* The new table will have the same name, columns and rows. It will also have partitions,
* views, sequences of the old table. Finally it will have everything created by
@ -532,7 +542,7 @@ AlterTableSetAccessMethod(TableConversionParameters *params)
* in case you add a new way to return from this function.
*/
TableConversionReturn *
ConvertTable(TableConversionState *con)
ConvertTableInternal(TableConversionState *con)
{
InTableTypeConversionFunctionCall = true;
@ -595,9 +605,18 @@ ConvertTable(TableConversionState *con)
List *justBeforeDropCommands = NIL;
List *attachPartitionCommands = NIL;
postLoadCommands =
list_concat(postLoadCommands,
GetViewCreationTableDDLCommandsOfTable(con->relationId));
List *createViewCommands = GetViewCreationCommandsOfTable(con->relationId);
postLoadCommands = list_concat(postLoadCommands,
WrapTableDDLCommands(createViewCommands));
/* need to add back to publications after dropping the original table */
bool isAdd = true;
List *alterPublicationCommands =
GetAlterPublicationDDLCommandsForTable(con->relationId, isAdd);
postLoadCommands = list_concat(postLoadCommands,
WrapTableDDLCommands(alterPublicationCommands));
List *foreignKeyCommands = NIL;
if (con->conversionType == ALTER_DISTRIBUTED_TABLE)
@ -800,9 +819,21 @@ ConvertTable(TableConversionState *con)
ExecuteQueryViaSPI(tableConstructionSQL, SPI_OK_UTILITY);
}
/*
* when there are many partitions, each call to ProcessUtilityParseTree
* accumulates used memory. Free context after each call.
*/
MemoryContext citusPerPartitionContext =
AllocSetContextCreate(CurrentMemoryContext,
"citus_per_partition_context",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(citusPerPartitionContext);
char *attachPartitionCommand = NULL;
foreach_ptr(attachPartitionCommand, attachPartitionCommands)
{
MemoryContextReset(citusPerPartitionContext);
Node *parseTree = ParseTreeNode(attachPartitionCommand);
ProcessUtilityParseTree(parseTree, attachPartitionCommand,
@ -810,6 +841,9 @@ ConvertTable(TableConversionState *con)
NULL, None_Receiver, NULL);
}
MemoryContextSwitchTo(oldContext);
MemoryContextDelete(citusPerPartitionContext);
if (isPartitionTable)
{
ExecuteQueryViaSPI(attachToParentCommand, SPI_OK_UTILITY);
@ -869,10 +903,77 @@ ConvertTable(TableConversionState *con)
SetLocalEnableLocalReferenceForeignKeys(oldEnableLocalReferenceForeignKeys);
InTableTypeConversionFunctionCall = false;
return ret;
}
/*
* CopyTableConversionReturnIntoCurrentContext copies given tableConversionReturn
* into CurrentMemoryContext.
*/
static TableConversionReturn *
CopyTableConversionReturnIntoCurrentContext(TableConversionReturn *tableConversionReturn)
{
TableConversionReturn *tableConversionReturnCopy = NULL;
if (tableConversionReturn)
{
tableConversionReturnCopy = palloc0(sizeof(TableConversionReturn));
List *copyForeignKeyCommands = NIL;
char *foreignKeyCommand = NULL;
foreach_ptr(foreignKeyCommand, tableConversionReturn->foreignKeyCommands)
{
char *copyForeignKeyCommand = MemoryContextStrdup(CurrentMemoryContext,
foreignKeyCommand);
copyForeignKeyCommands = lappend(copyForeignKeyCommands,
copyForeignKeyCommand);
}
tableConversionReturnCopy->foreignKeyCommands = copyForeignKeyCommands;
}
return tableConversionReturnCopy;
}
/*
* ConvertTable is a wrapper for ConvertTableInternal to persist only
* TableConversionReturn and delete all other allocations.
*/
static TableConversionReturn *
ConvertTable(TableConversionState *con)
{
/*
* We do not allow alter_distributed_table and undistribute_table operations
* for tables with identity columns. This is because we do not have a proper way
* of keeping sequence states consistent across the cluster.
*/
ErrorIfTableHasIdentityColumn(con->relationId);
/*
* when there are many partitions or colocated tables, memory usage is
* accumulated. Free context for each call to ConvertTable.
*/
MemoryContext convertTableContext =
AllocSetContextCreate(CurrentMemoryContext,
"citus_convert_table_context",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(convertTableContext);
TableConversionReturn *tableConversionReturn = ConvertTableInternal(con);
MemoryContextSwitchTo(oldContext);
/* persist TableConversionReturn in oldContext */
TableConversionReturn *tableConversionReturnCopy =
CopyTableConversionReturnIntoCurrentContext(tableConversionReturn);
/* delete convertTableContext */
MemoryContextDelete(convertTableContext);
return tableConversionReturnCopy;
}
/*
* DropIndexesNotSupportedByColumnar is a helper function used during access
* method conversion to drop the indexes that are not supported by columnarAM.
@ -1268,8 +1369,7 @@ CreateCitusTableLike(TableConversionState *con)
}
else if (IsCitusTableType(con->relationId, REFERENCE_TABLE))
{
CreateDistributedTable(con->newRelationId, NULL, DISTRIBUTE_BY_NONE, 0, false,
NULL);
CreateReferenceTable(con->newRelationId);
}
else if (IsCitusTableType(con->relationId, CITUS_LOCAL_TABLE))
{
@ -1410,17 +1510,16 @@ GetViewCreationCommandsOfTable(Oid relationId)
/*
* GetViewCreationTableDDLCommandsOfTable is the same as GetViewCreationCommandsOfTable,
* but the returned list includes objects of TableDDLCommand's, not strings.
* WrapTableDDLCommands takes a list of command strings and wraps them
* in TableDDLCommand structs.
*/
List *
GetViewCreationTableDDLCommandsOfTable(Oid relationId)
static List *
WrapTableDDLCommands(List *commandStrings)
{
List *commands = GetViewCreationCommandsOfTable(relationId);
List *tableDDLCommands = NIL;
char *command = NULL;
foreach_ptr(command, commands)
foreach_ptr(command, commandStrings)
{
tableDDLCommands = lappend(tableDDLCommands, makeTableDDLCommandString(command));
}
@ -1523,96 +1622,6 @@ CreateMaterializedViewDDLCommand(Oid matViewOid)
}
/*
* This function marks all the identity sequences as distributed on the given table.
*/
static void
MarkIdentitiesAsDistributed(Oid targetRelationId)
{
Relation relation = relation_open(targetRelationId, AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(relation);
relation_close(relation, NoLock);
bool missingSequenceOk = false;
for (int attributeIndex = 0; attributeIndex < tupleDescriptor->natts;
attributeIndex++)
{
Form_pg_attribute attributeForm = TupleDescAttr(tupleDescriptor, attributeIndex);
if (attributeForm->attidentity)
{
Oid seqOid = getIdentitySequence(targetRelationId, attributeForm->attnum,
missingSequenceOk);
ObjectAddress seqAddress = { 0 };
ObjectAddressSet(seqAddress, RelationRelationId, seqOid);
MarkObjectDistributed(&seqAddress);
}
}
}
/*
* This function returns SQL statements to rename identities on the given table
*/
static void
PrepareRenameIdentitiesCommands(Oid sourceRelationId, Oid targetRelationId,
List **outCoordinatorCommands, List **outWorkerCommands)
{
Relation targetRelation = relation_open(targetRelationId, AccessShareLock);
TupleDesc targetTupleDescriptor = RelationGetDescr(targetRelation);
relation_close(targetRelation, NoLock);
bool missingSequenceOk = false;
for (int attributeIndex = 0; attributeIndex < targetTupleDescriptor->natts;
attributeIndex++)
{
Form_pg_attribute attributeForm = TupleDescAttr(targetTupleDescriptor,
attributeIndex);
if (attributeForm->attidentity)
{
char *columnName = NameStr(attributeForm->attname);
Oid targetSequenceOid = getIdentitySequence(targetRelationId,
attributeForm->attnum,
missingSequenceOk);
char *targetSequenceName = generate_relation_name(targetSequenceOid, NIL);
Oid sourceSequenceOid = getIdentitySequence(sourceRelationId,
attributeForm->attnum,
missingSequenceOk);
char *sourceSequenceName = generate_relation_name(sourceSequenceOid, NIL);
/* to rename sequence on the coordinator */
*outCoordinatorCommands = lappend(*outCoordinatorCommands, psprintf(
"SET citus.enable_ddl_propagation TO OFF; ALTER SEQUENCE %s RENAME TO %s; RESET citus.enable_ddl_propagation;",
quote_identifier(
targetSequenceName),
quote_identifier(
sourceSequenceName)));
/* update workers to use existing sequence and drop the new one generated by PG */
bool missingTableOk = true;
*outWorkerCommands = lappend(*outWorkerCommands,
GetAlterColumnWithNextvalDefaultCmd(
sourceSequenceOid, sourceRelationId,
columnName,
missingTableOk));
/* drop the sequence generated by identity column */
*outWorkerCommands = lappend(*outWorkerCommands, psprintf(
"DROP SEQUENCE IF EXISTS %s",
quote_identifier(
targetSequenceName)));
}
}
}
/*
* ReplaceTable replaces the source table with the target table.
* It moves all the rows of the source table to target table with INSERT SELECT.
@ -1671,24 +1680,6 @@ ReplaceTable(Oid sourceId, Oid targetId, List *justBeforeDropCommands,
ExecuteQueryViaSPI(query->data, SPI_OK_INSERT);
}
/*
* Drop identity dependencies (sequences marked as DEPENDENCY_INTERNAL) on the workers
* to keep their states after the source table is dropped.
*/
List *ownedIdentitySequences = getOwnedSequences_internal(sourceId, 0,
DEPENDENCY_INTERNAL);
if (ownedIdentitySequences != NIL && ShouldSyncTableMetadata(sourceId))
{
char *qualifiedTableName = quote_qualified_identifier(schemaName, sourceName);
StringInfo command = makeStringInfo();
appendStringInfo(command,
"SELECT pg_catalog.worker_drop_sequence_dependency(%s);",
quote_literal_cstr(qualifiedTableName));
SendCommandToWorkersWithMetadata(command->data);
}
/*
* Modify regular sequence dependencies (sequences marked as DEPENDENCY_AUTO)
*/
@ -1748,23 +1739,6 @@ ReplaceTable(Oid sourceId, Oid targetId, List *justBeforeDropCommands,
quote_qualified_identifier(schemaName, sourceName))));
}
/*
* We need to prepare rename identities commands before dropping the original table,
* otherwise we can't find the original names of the identity sequences.
* We prepare separate commands for the coordinator and the workers because:
* In the coordinator, we simply need to rename the identity sequences
* to their names on the old table, because right now the identity
* sequences have default names generated by Postgres with the creation of the new table
* In the workers, we have not dropped the original identity sequences,
* so what we do is we alter the columns and set their default to the
* original identity sequences, and after that we drop the new sequences.
*/
List *coordinatorCommandsToRenameIdentites = NIL;
List *workerCommandsToRenameIdentites = NIL;
PrepareRenameIdentitiesCommands(sourceId, targetId,
&coordinatorCommandsToRenameIdentites,
&workerCommandsToRenameIdentites);
resetStringInfo(query);
appendStringInfo(query, "DROP %sTABLE %s CASCADE",
IsForeignTable(sourceId) ? "FOREIGN " : "",
@ -1782,27 +1756,6 @@ ReplaceTable(Oid sourceId, Oid targetId, List *justBeforeDropCommands,
quote_qualified_identifier(schemaName, targetName),
quote_identifier(sourceName));
ExecuteQueryViaSPI(query->data, SPI_OK_UTILITY);
char *coordinatorCommand = NULL;
foreach_ptr(coordinatorCommand, coordinatorCommandsToRenameIdentites)
{
ExecuteQueryViaSPI(coordinatorCommand, SPI_OK_UTILITY);
}
char *workerCommand = NULL;
foreach_ptr(workerCommand, workerCommandsToRenameIdentites)
{
SendCommandToWorkersWithMetadata(workerCommand);
}
/*
* To preserve identity sequences states in case of redistributing the table again,
* we don't drop them when we undistribute a table. To maintain consistency and
* avoid future problems if we redistribute the table, we want to apply all changes happening to
* the identity sequence in the coordinator to their corresponding sequences in the workers as well.
* That's why we have to mark identity sequences as distributed
*/
MarkIdentitiesAsDistributed(targetId);
}

View File

@ -85,6 +85,7 @@ static void DropRelationTruncateTriggers(Oid relationId);
static char * GetDropTriggerCommand(Oid relationId, char *triggerName);
static void DropViewsOnTable(Oid relationId);
static void DropIdentitiesOnTable(Oid relationId);
static void DropTableFromPublications(Oid relationId);
static List * GetRenameStatsCommandList(List *statsOidList, uint64 shardId);
static List * ReversedOidList(List *oidList);
static void AppendExplicitIndexIdsToList(Form_pg_index indexForm,
@ -338,6 +339,10 @@ CreateCitusLocalTable(Oid relationId, bool cascadeViaForeignKeys, bool autoConve
List *shellTableDDLEvents = GetShellTableDDLEventsForCitusLocalTable(relationId);
List *tableViewCreationCommands = GetViewCreationCommandsOfTable(relationId);
bool isAdd = true;
List *alterPublicationCommands =
GetAlterPublicationDDLCommandsForTable(relationId, isAdd);
char *relationName = get_rel_name(relationId);
Oid relationSchemaId = get_rel_namespace(relationId);
@ -347,6 +352,12 @@ CreateCitusLocalTable(Oid relationId, bool cascadeViaForeignKeys, bool autoConve
*/
DropIdentitiesOnTable(relationId);
/*
* We do not want the shard to be in the publication (subscribers are
* unlikely to recognize it).
*/
DropTableFromPublications(relationId);
/* below we convert relation with relationId to the shard relation */
uint64 shardId = ConvertLocalTableToShard(relationId);
@ -363,6 +374,11 @@ CreateCitusLocalTable(Oid relationId, bool cascadeViaForeignKeys, bool autoConve
*/
ExecuteAndLogUtilityCommandListInTableTypeConversionViaSPI(tableViewCreationCommands);
/*
* Execute the ALTER PUBLICATION commands to add the shell table back to the publications.
*/
ExecuteAndLogUtilityCommandListInTableTypeConversionViaSPI(alterPublicationCommands);
/*
* Set shellRelationId as the relation with relationId now points
* to the shard relation.
@ -1131,7 +1147,7 @@ DropIdentitiesOnTable(Oid relationId)
{
Relation relation = relation_open(relationId, AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(relation);
relation_close(relation, NoLock);
List *dropCommandList = NIL;
for (int attributeIndex = 0; attributeIndex < tupleDescriptor->natts;
attributeIndex++)
@ -1151,15 +1167,38 @@ DropIdentitiesOnTable(Oid relationId)
qualifiedTableName,
columnName);
dropCommandList = lappend(dropCommandList, dropCommand->data);
}
}
relation_close(relation, NoLock);
char *dropCommand = NULL;
foreach_ptr(dropCommand, dropCommandList)
{
/*
* We need to disable/enable DDL propagation around this command to prevent
* sending unnecessary ALTER COLUMN commands for partitions to the MX workers.
*/
ExecuteAndLogUtilityCommandList(list_make3(DISABLE_DDL_PROPAGATION,
dropCommand->data,
dropCommand,
ENABLE_DDL_PROPAGATION));
}
}
/*
* DropTableFromPublications drops the table from all of its publications.
*/
static void
DropTableFromPublications(Oid relationId)
{
bool isAdd = false;
List *alterPublicationCommands =
GetAlterPublicationDDLCommandsForTable(relationId, isAdd);
ExecuteAndLogUtilityCommandList(alterPublicationCommands);
}

View File

@ -94,6 +94,28 @@
#include "utils/syscache.h"
#include "utils/inval.h"
/* common params that apply to all Citus table types */
typedef struct
{
char distributionMethod;
char replicationModel;
} CitusTableParams;
/*
* Params that only apply to distributed tables, i.e., the ones that are
* known as DISTRIBUTED_TABLE by Citus metadata.
*/
typedef struct
{
int shardCount;
bool shardCountIsStrict;
char *colocateWithTableName;
char *distributionColumnName;
} DistributedTableParams;
/*
* once every LOG_PER_TUPLE_AMOUNT, the copy will be logged.
*/
@ -106,17 +128,22 @@ static void CreateDistributedTableConcurrently(Oid relationId,
char *colocateWithTableName,
int shardCount,
bool shardCountIsStrict);
static char DecideReplicationModel(char distributionMethod, char *colocateWithTableName);
static char DecideDistTableReplicationModel(char distributionMethod,
char *colocateWithTableName);
static List * HashSplitPointsForShardList(List *shardList);
static List * HashSplitPointsForShardCount(int shardCount);
static List * WorkerNodesForShardList(List *shardList);
static List * RoundRobinWorkerNodeList(List *workerNodeList, int listLength);
static CitusTableParams DecideCitusTableParams(CitusTableType tableType,
DistributedTableParams *
distributedTableParams);
static void CreateCitusTable(Oid relationId, CitusTableType tableType,
DistributedTableParams *distributedTableParams);
static void CreateHashDistributedTableShards(Oid relationId, int shardCount,
Oid colocatedTableId, bool localTableEmpty);
static uint32 ColocationIdForNewTable(Oid relationId, Var *distributionColumn,
char distributionMethod, char replicationModel,
int shardCount, bool shardCountIsStrict,
char *colocateWithTableName);
static uint32 ColocationIdForNewTable(Oid relationId, CitusTableType tableType,
DistributedTableParams *distributedTableParams,
Var *distributionColumn);
static void EnsureRelationCanBeDistributed(Oid relationId, Var *distributionColumn,
char distributionMethod, uint32 colocationId,
char replicationModel);
@ -377,7 +404,7 @@ CreateDistributedTableConcurrently(Oid relationId, char *distributionColumnName,
EnsureForeignKeysForDistributedTableConcurrently(relationId);
char replicationModel = DecideReplicationModel(distributionMethod,
char replicationModel = DecideDistTableReplicationModel(distributionMethod,
colocateWithTableName);
/*
@ -622,7 +649,7 @@ static void
EnsureColocateWithTableIsValid(Oid relationId, char distributionMethod,
char *distributionColumnName, char *colocateWithTableName)
{
char replicationModel = DecideReplicationModel(distributionMethod,
char replicationModel = DecideDistTableReplicationModel(distributionMethod,
colocateWithTableName);
/*
@ -860,9 +887,6 @@ create_reference_table(PG_FUNCTION_ARGS)
CheckCitusVersion(ERROR);
Oid relationId = PG_GETARG_OID(0);
char *colocateWithTableName = NULL;
char *distributionColumnName = NULL;
EnsureCitusTableCanBeCreated(relationId);
/* enable create_reference_table on an empty node */
@ -895,8 +919,7 @@ create_reference_table(PG_FUNCTION_ARGS)
errdetail("There are no active worker nodes.")));
}
CreateDistributedTable(relationId, distributionColumnName, DISTRIBUTE_BY_NONE,
ShardCount, false, colocateWithTableName);
CreateReferenceTable(relationId);
PG_RETURN_VOID();
}
@ -951,18 +974,90 @@ EnsureRelationExists(Oid relationId)
/*
* CreateDistributedTable creates distributed table in the given configuration.
* CreateDistributedTable is a wrapper around CreateCitusTable that creates a
* distributed table.
*/
void
CreateDistributedTable(Oid relationId, char *distributionColumnName,
char distributionMethod,
int shardCount, bool shardCountIsStrict,
char *colocateWithTableName)
{
CitusTableType tableType;
switch (distributionMethod)
{
case DISTRIBUTE_BY_HASH:
{
tableType = HASH_DISTRIBUTED;
break;
}
case DISTRIBUTE_BY_APPEND:
{
tableType = APPEND_DISTRIBUTED;
break;
}
case DISTRIBUTE_BY_RANGE:
{
tableType = RANGE_DISTRIBUTED;
break;
}
default:
{
ereport(ERROR, (errmsg("unexpected distribution method when "
"deciding Citus table type")));
break;
}
}
DistributedTableParams distributedTableParams = {
.colocateWithTableName = colocateWithTableName,
.shardCount = shardCount,
.shardCountIsStrict = shardCountIsStrict,
.distributionColumnName = distributionColumnName
};
CreateCitusTable(relationId, tableType, &distributedTableParams);
}
/*
* CreateReferenceTable is a wrapper around CreateCitusTable that creates a
* reference table.
*/
void
CreateReferenceTable(Oid relationId)
{
CreateCitusTable(relationId, REFERENCE_TABLE, NULL);
}
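/*
 * A minimal usage sketch (hypothetical relation and column names), showing how
 * the two public wrappers above funnel into CreateCitusTable:
 *
 *   RangeVar *rangeVar = makeRangeVar("public", "orders", -1);
 *   Oid relationId = RangeVarGetRelid(rangeVar, AccessShareLock, false);
 *   CreateDistributedTable(relationId, "customer_id", DISTRIBUTE_BY_HASH,
 *                          32, false, "default");
 *
 * or, for a reference table: CreateReferenceTable(relationId);
 */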
/*
* CreateCitusTable is the internal method that creates a Citus table in
* given configuration.
*
* DistributedTableParams should be non-null only if we're creating a distributed
* table.
*
* This function contains all the necessary logic to create Citus tables. It
* performs the necessary checks to ensure distributing the table is safe. If it
* is safe to distribute the table, this function creates distributed table
* metadata, creates shards, and copies local data to the shards. This function
* also handles partitioned tables by distributing their partitions as well.
*/
void
CreateDistributedTable(Oid relationId, char *distributionColumnName,
char distributionMethod, int shardCount,
bool shardCountIsStrict, char *colocateWithTableName)
static void
CreateCitusTable(Oid relationId, CitusTableType tableType,
DistributedTableParams *distributedTableParams)
{
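/*
 * Note that comparing the two booleans with != implements an exclusive-or:
 * the check errors out both when distributedTableParams is missing for a
 * distributed table type and when it is provided for any other table type.
 */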
if ((tableType == HASH_DISTRIBUTED || tableType == APPEND_DISTRIBUTED ||
tableType == RANGE_DISTRIBUTED) != (distributedTableParams != NULL))
{
ereport(ERROR, (errmsg("distributed table params must be provided "
"when creating a distributed table and must "
"not be otherwise")));
}
/*
* EnsureTableNotDistributed errors out when relation is a citus table but
* we don't want to ask user to first undistribute their citus local tables
@ -988,11 +1083,8 @@ CreateDistributedTable(Oid relationId, char *distributionColumnName,
* that ALTER TABLE hook does the necessary job, which means converting
* local tables to citus local tables to properly support such foreign
* keys.
*
* This function does not expect to create Citus local table, so we blindly
* create reference table when the method is DISTRIBUTE_BY_NONE.
*/
else if (distributionMethod == DISTRIBUTE_BY_NONE &&
else if (tableType == REFERENCE_TABLE &&
ShouldEnableLocalReferenceForeignKeys() &&
HasForeignKeyWithLocalTable(relationId))
{
@ -1022,24 +1114,29 @@ CreateDistributedTable(Oid relationId, char *distributionColumnName,
PropagatePrerequisiteObjectsForDistributedTable(relationId);
char replicationModel = DecideReplicationModel(distributionMethod,
colocateWithTableName);
Var *distributionColumn = BuildDistributionKeyFromColumnName(relationId,
Var *distributionColumn = NULL;
if (distributedTableParams)
{
distributionColumn = BuildDistributionKeyFromColumnName(relationId,
distributedTableParams->
distributionColumnName,
NoLock);
}
CitusTableParams citusTableParams = DecideCitusTableParams(tableType,
distributedTableParams);
/*
* ColocationIdForNewTable assumes caller acquires lock on relationId. In our case,
* our caller already acquired lock on relationId.
*/
uint32 colocationId = ColocationIdForNewTable(relationId, distributionColumn,
distributionMethod, replicationModel,
shardCount, shardCountIsStrict,
colocateWithTableName);
uint32 colocationId = ColocationIdForNewTable(relationId, tableType,
distributedTableParams,
distributionColumn);
EnsureRelationCanBeDistributed(relationId, distributionColumn, distributionMethod,
colocationId, replicationModel);
EnsureRelationCanBeDistributed(relationId, distributionColumn,
citusTableParams.distributionMethod,
colocationId, citusTableParams.replicationModel);
/*
* Make sure that existing reference tables have been replicated to all the nodes
@ -1068,8 +1165,10 @@ CreateDistributedTable(Oid relationId, char *distributionColumnName,
bool autoConverted = false;
/* create an entry for distributed table in pg_dist_partition */
InsertIntoPgDistPartition(relationId, distributionMethod, distributionColumn,
colocationId, replicationModel, autoConverted);
InsertIntoPgDistPartition(relationId, citusTableParams.distributionMethod,
distributionColumn,
colocationId, citusTableParams.replicationModel,
autoConverted);
/* foreign tables do not support TRUNCATE trigger */
if (RegularTable(relationId))
@ -1078,17 +1177,14 @@ CreateDistributedTable(Oid relationId, char *distributionColumnName,
}
/* create shards for hash distributed and reference tables */
if (distributionMethod == DISTRIBUTE_BY_HASH)
if (tableType == HASH_DISTRIBUTED)
{
CreateHashDistributedTableShards(relationId, shardCount, colocatedTableId,
CreateHashDistributedTableShards(relationId, distributedTableParams->shardCount,
colocatedTableId,
localTableEmpty);
}
else if (distributionMethod == DISTRIBUTE_BY_NONE)
else if (tableType == REFERENCE_TABLE)
{
/*
* This function does not expect to create Citus local table, so we blindly
* create reference table when the method is DISTRIBUTE_BY_NONE.
*/
CreateReferenceTableShard(relationId);
}
@ -1116,17 +1212,36 @@ CreateDistributedTable(Oid relationId, char *distributionColumnName,
char *relationName = get_rel_name(relationId);
char *parentRelationName = quote_qualified_identifier(schemaName, relationName);
/*
* When there are many partitions, each call to CreateCitusTable accumulates
* used memory. Create and free a memory context for each call.
*/
MemoryContext citusPartitionContext =
AllocSetContextCreate(CurrentMemoryContext,
"citus_per_partition_context",
ALLOCSET_DEFAULT_SIZES);
MemoryContext oldContext = MemoryContextSwitchTo(citusPartitionContext);
foreach_oid(partitionRelationId, partitionList)
{
CreateDistributedTable(partitionRelationId, distributionColumnName,
distributionMethod, shardCount, false,
parentRelationName);
MemoryContextReset(citusPartitionContext);
DistributedTableParams childDistributedTableParams = {
.colocateWithTableName = parentRelationName,
.shardCount = distributedTableParams->shardCount,
.shardCountIsStrict = false,
.distributionColumnName = distributedTableParams->distributionColumnName,
};
CreateCitusTable(partitionRelationId, tableType,
&childDistributedTableParams);
}
MemoryContextSwitchTo(oldContext);
MemoryContextDelete(citusPartitionContext);
}
/* copy over data for hash distributed and reference tables */
if (distributionMethod == DISTRIBUTE_BY_HASH ||
distributionMethod == DISTRIBUTE_BY_NONE)
if (tableType == HASH_DISTRIBUTED || tableType == REFERENCE_TABLE)
{
if (RegularTable(relationId))
{
@ -1145,6 +1260,70 @@ CreateDistributedTable(Oid relationId, char *distributionColumnName,
}
/*
* DecideCitusTableParams decides CitusTableParams based on given CitusTableType
* and DistributedTableParams if it's a distributed table.
*
* DistributedTableParams should be non-null only if CitusTableType corresponds
* to a distributed table.
*/
static
CitusTableParams
DecideCitusTableParams(CitusTableType tableType,
DistributedTableParams *distributedTableParams)
{
CitusTableParams citusTableParams = { 0 };
switch (tableType)
{
case HASH_DISTRIBUTED:
{
citusTableParams.distributionMethod = DISTRIBUTE_BY_HASH;
citusTableParams.replicationModel =
DecideDistTableReplicationModel(DISTRIBUTE_BY_HASH,
distributedTableParams->
colocateWithTableName);
break;
}
case APPEND_DISTRIBUTED:
{
citusTableParams.distributionMethod = DISTRIBUTE_BY_APPEND;
citusTableParams.replicationModel =
DecideDistTableReplicationModel(DISTRIBUTE_BY_APPEND,
distributedTableParams->
colocateWithTableName);
break;
}
case RANGE_DISTRIBUTED:
{
citusTableParams.distributionMethod = DISTRIBUTE_BY_RANGE;
citusTableParams.replicationModel =
DecideDistTableReplicationModel(DISTRIBUTE_BY_RANGE,
distributedTableParams->
colocateWithTableName);
break;
}
case REFERENCE_TABLE:
{
citusTableParams.distributionMethod = DISTRIBUTE_BY_NONE;
citusTableParams.replicationModel = REPLICATION_MODEL_2PC;
break;
}
default:
{
ereport(ERROR, (errmsg("unexpected table type when deciding Citus "
"table params")));
break;
}
}
return citusTableParams;
}
/*
* PropagatePrerequisiteObjectsForDistributedTable ensures we can create shards
* on all nodes by ensuring all dependent objects exist on all node.
@ -1190,7 +1369,7 @@ EnsureSequenceTypeSupported(Oid seqOid, Oid attributeTypeId, Oid ownerRelationId
foreach_oid(citusTableId, citusTableIdList)
{
List *seqInfoList = NIL;
GetDependentSequencesWithRelation(citusTableId, &seqInfoList, 0);
GetDependentSequencesWithRelation(citusTableId, &seqInfoList, 0, DEPENDENCY_AUTO);
SequenceInfo *seqInfo = NULL;
foreach_ptr(seqInfo, seqInfoList)
@ -1267,7 +1446,7 @@ EnsureRelationHasCompatibleSequenceTypes(Oid relationId)
{
List *seqInfoList = NIL;
GetDependentSequencesWithRelation(relationId, &seqInfoList, 0);
GetDependentSequencesWithRelation(relationId, &seqInfoList, 0, DEPENDENCY_AUTO);
EnsureDistributedSequencesHaveOneType(relationId, seqInfoList);
}
@ -1405,17 +1584,15 @@ DropFKeysRelationInvolvedWithTableType(Oid relationId, int tableTypeFlag)
/*
* DecideReplicationModel function decides which replication model should be
* used depending on given distribution configuration.
* DecideDistTableReplicationModel decides which replication model should be
* used for a distributed table depending on the given distribution configuration.
*/
static char
DecideReplicationModel(char distributionMethod, char *colocateWithTableName)
DecideDistTableReplicationModel(char distributionMethod, char *colocateWithTableName)
{
if (distributionMethod == DISTRIBUTE_BY_NONE)
{
return REPLICATION_MODEL_2PC;
}
else if (pg_strncasecmp(colocateWithTableName, "default", NAMEDATALEN) != 0 &&
Assert(distributionMethod != DISTRIBUTE_BY_NONE);
if (!IsColocateWithDefault(colocateWithTableName) &&
!IsColocateWithNone(colocateWithTableName))
{
text *colocateWithTableNameText = cstring_to_text(colocateWithTableName);
@ -1491,28 +1668,34 @@ CreateHashDistributedTableShards(Oid relationId, int shardCount,
/*
* ColocationIdForNewTable returns a colocation id for hash-distributed table
* ColocationIdForNewTable returns a colocation id for the given table
* according to the given configuration. If there is no such configuration, it
* creates one and returns the colocation id of the newly created colocation
* group. Note that DistributedTableParams and the distribution column Var
* should be non-null only if CitusTableType corresponds to a distributed table.
*
* For append- and range-distributed tables, this function errors out if the
* colocateWithTableName parameter is set to anything other than the default,
* otherwise it directly returns INVALID_COLOCATION_ID.
*
* For reference tables, it returns the common reference table colocation id.
*
* This function assumes its caller takes the necessary lock on relationId to
* prevent possible changes to it.
*/
static uint32
ColocationIdForNewTable(Oid relationId, Var *distributionColumn,
char distributionMethod, char replicationModel,
int shardCount, bool shardCountIsStrict,
char *colocateWithTableName)
ColocationIdForNewTable(Oid relationId, CitusTableType tableType,
DistributedTableParams *distributedTableParams,
Var *distributionColumn)
{
CitusTableParams citusTableParams = DecideCitusTableParams(tableType,
distributedTableParams);
uint32 colocationId = INVALID_COLOCATION_ID;
if (distributionMethod == DISTRIBUTE_BY_APPEND ||
distributionMethod == DISTRIBUTE_BY_RANGE)
if (tableType == APPEND_DISTRIBUTED || tableType == RANGE_DISTRIBUTED)
{
if (pg_strncasecmp(colocateWithTableName, "default", NAMEDATALEN) != 0)
if (!IsColocateWithDefault(distributedTableParams->colocateWithTableName))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot distribute relation"),
@ -1522,7 +1705,7 @@ ColocationIdForNewTable(Oid relationId, Var *distributionColumn,
return colocationId;
}
else if (distributionMethod == DISTRIBUTE_BY_NONE)
else if (tableType == REFERENCE_TABLE)
{
return CreateReferenceTableColocationId();
}
@ -1533,27 +1716,29 @@ ColocationIdForNewTable(Oid relationId, Var *distributionColumn,
* can be sure that there will be no modifications on the colocation table
* until this transaction is committed.
*/
Assert(distributionMethod == DISTRIBUTE_BY_HASH);
Assert(citusTableParams.distributionMethod == DISTRIBUTE_BY_HASH);
Oid distributionColumnType = distributionColumn->vartype;
Oid distributionColumnCollation = get_typcollation(distributionColumnType);
/* get an advisory lock to serialize concurrent default group creations */
if (IsColocateWithDefault(colocateWithTableName))
if (IsColocateWithDefault(distributedTableParams->colocateWithTableName))
{
AcquireColocationDefaultLock();
}
colocationId = FindColocateWithColocationId(relationId,
replicationModel,
citusTableParams.replicationModel,
distributionColumnType,
distributionColumnCollation,
shardCount,
distributedTableParams->shardCount,
distributedTableParams->
shardCountIsStrict,
distributedTableParams->
colocateWithTableName);
if (IsColocateWithDefault(colocateWithTableName) && (colocationId !=
INVALID_COLOCATION_ID))
if (IsColocateWithDefault(distributedTableParams->colocateWithTableName) &&
(colocationId != INVALID_COLOCATION_ID))
{
/*
* we can release advisory lock if there is already a default entry for given params;
@ -1565,23 +1750,25 @@ ColocationIdForNewTable(Oid relationId, Var *distributionColumn,
if (colocationId == INVALID_COLOCATION_ID)
{
if (IsColocateWithDefault(colocateWithTableName))
if (IsColocateWithDefault(distributedTableParams->colocateWithTableName))
{
/*
* Generate a new colocation ID and insert a pg_dist_colocation
* record.
*/
colocationId = CreateColocationGroup(shardCount, ShardReplicationFactor,
colocationId = CreateColocationGroup(distributedTableParams->shardCount,
ShardReplicationFactor,
distributionColumnType,
distributionColumnCollation);
}
else if (IsColocateWithNone(colocateWithTableName))
else if (IsColocateWithNone(distributedTableParams->colocateWithTableName))
{
/*
* Generate a new colocation ID and insert a pg_dist_colocation
* record.
*/
colocationId = CreateColocationGroup(shardCount, ShardReplicationFactor,
colocationId = CreateColocationGroup(distributedTableParams->shardCount,
ShardReplicationFactor,
distributionColumnType,
distributionColumnCollation);
}
@ -1608,6 +1795,8 @@ EnsureRelationCanBeDistributed(Oid relationId, Var *distributionColumn,
{
Oid parentRelationId = InvalidOid;
ErrorIfTableHasUnsupportedIdentityColumn(relationId);
EnsureLocalTableEmptyIfNecessary(relationId, distributionMethod);
/* user really wants triggers? */
@ -2219,12 +2408,12 @@ CopyLocalDataIntoShards(Oid distributedRelationId)
EState *estate = CreateExecutorState();
ExprContext *econtext = GetPerTupleExprContext(estate);
econtext->ecxt_scantuple = slot;
const bool nonPublishableData = false;
DestReceiver *copyDest =
(DestReceiver *) CreateCitusCopyDestReceiver(distributedRelationId,
columnNameList,
partitionColumnIndex,
estate, NULL);
estate, NULL, nonPublishableData);
/* initialise state for writing to shards, we'll open connections on demand */
copyDest->rStartup(copyDest, 0, tupleDescriptor);

View File

@ -29,16 +29,14 @@
#include "storage/lmgr.h"
#include "utils/lsyscache.h"
typedef bool (*AddressPredicate)(const ObjectAddress *);
static void EnsureDependenciesCanBeDistributed(const ObjectAddress *relationAddress);
static void ErrorIfCircularDependencyExists(const ObjectAddress *objectAddress);
static int ObjectAddressComparator(const void *a, const void *b);
static List * FilterObjectAddressListByPredicate(List *objectAddressList,
AddressPredicate predicate);
static void EnsureDependenciesExistOnAllNodes(const ObjectAddress *target);
static List * GetDependencyCreateDDLCommands(const ObjectAddress *dependency);
static bool ShouldPropagateObject(const ObjectAddress *address);
static char * DropTableIfExistsCommand(Oid relationId);
/*
* EnsureDependenciesExistOnAllNodes finds all the dependencies that we support and makes
@ -325,6 +323,21 @@ GetDistributableDependenciesForObject(const ObjectAddress *target)
}
/*
* DropTableIfExistsCommand returns the command to drop the given table if it exists.
*/
static char *
DropTableIfExistsCommand(Oid relationId)
{
char *qualifiedRelationName = generate_qualified_relation_name(relationId);
StringInfo dropTableCommand = makeStringInfo();
appendStringInfo(dropTableCommand, "DROP TABLE IF EXISTS %s CASCADE",
qualifiedRelationName);
return dropTableCommand->data;
}
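/*
 * For example, for a (hypothetical) relation public.orders this produces
 * "DROP TABLE IF EXISTS public.orders CASCADE".
 */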
/*
* GetDependencyCreateDDLCommands returns a list (potentially empty or NIL) of ddl
* commands to execute on a worker to create the object.
@ -370,7 +383,7 @@ GetDependencyCreateDDLCommands(const ObjectAddress *dependency)
bool creatingShellTableOnRemoteNode = true;
List *tableDDLCommands = GetFullTableCreationCommands(relationId,
WORKER_NEXTVAL_SEQUENCE_DEFAULTS,
INCLUDE_IDENTITY_AS_SEQUENCE_DEFAULTS,
INCLUDE_IDENTITY,
creatingShellTableOnRemoteNode);
TableDDLCommand *tableDDLCommand = NULL;
foreach_ptr(tableDDLCommand, tableDDLCommands)
@ -379,6 +392,10 @@ GetDependencyCreateDDLCommands(const ObjectAddress *dependency)
commandList = lappend(commandList, GetTableDDLCommand(
tableDDLCommand));
}
/* we need to drop the table first, if it exists, to make the table creation idempotent */
commandList = lcons(DropTableIfExistsCommand(relationId),
commandList);
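/*
 * lcons prepends, so the resulting list starts with the DROP TABLE IF EXISTS
 * command followed by the CREATE TABLE commands, ensuring the drop runs
 * before the create on the worker.
 */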
}
return commandList;
@ -438,6 +455,11 @@ GetDependencyCreateDDLCommands(const ObjectAddress *dependency)
return DDLCommands;
}
case OCLASS_PUBLICATION:
{
return CreatePublicationDDLCommandsIdempotent(dependency);
}
case OCLASS_ROLE:
{
return GenerateCreateOrAlterRoleCommand(dependency->objectId);
@ -527,68 +549,6 @@ GetAllDependencyCreateDDLCommands(const List *dependencies)
}
/*
* ReplicateAllObjectsToNodeCommandList returns commands to replicate all
* previously marked objects to a worker node. The function also sets
* clusterHasDistributedFunction if there are any distributed functions.
*/
List *
ReplicateAllObjectsToNodeCommandList(const char *nodeName, int nodePort)
{
/* since we are executing ddl commands disable propagation first, primarily for mx */
List *ddlCommands = list_make1(DISABLE_DDL_PROPAGATION);
/*
* collect all dependencies in creation order and get their ddl commands
*/
List *dependencies = GetDistributedObjectAddressList();
/*
* Depending on changes in the environment, such as the enable_metadata_sync guc
* there might be objects in the distributed object address list that should currently
* not be propagated by citus as they are 'not supported'.
*/
dependencies = FilterObjectAddressListByPredicate(dependencies,
&SupportedDependencyByCitus);
/*
* When dependency lists are getting longer we see a delay in the creation time on the
* workers. We would like to inform the user. Currently we warn for lists greater than
* 100 items, where 100 is an arbitrarily chosen number. If we find it too high or too
* low we can adjust this based on experience.
*/
if (list_length(dependencies) > 100)
{
ereport(NOTICE, (errmsg("Replicating postgres objects to node %s:%d", nodeName,
nodePort),
errdetail("There are %d objects to replicate, depending on your "
"environment this might take a while",
list_length(dependencies))));
}
dependencies = OrderObjectAddressListInDependencyOrder(dependencies);
ObjectAddress *dependency = NULL;
foreach_ptr(dependency, dependencies)
{
if (IsAnyObjectAddressOwnedByExtension(list_make1(dependency), NULL))
{
/*
* we expect extension-owned objects to be created as a result
* of the extension being created.
*/
continue;
}
ddlCommands = list_concat(ddlCommands,
GetDependencyCreateDDLCommands(dependency));
}
ddlCommands = lappend(ddlCommands, ENABLE_DDL_PROPAGATION);
return ddlCommands;
}
/*
* ShouldPropagate determines if we should be propagating anything
*/
@ -744,7 +704,7 @@ ShouldPropagateAnyObject(List *addresses)
* FilterObjectAddressListByPredicate takes a list of ObjectAddress *'s and returns a list
* only containing the ObjectAddress *'s for which the predicate returned true.
*/
static List *
List *
FilterObjectAddressListByPredicate(List *objectAddressList, AddressPredicate predicate)
{
List *result = NIL;

View File

@ -245,6 +245,15 @@ static DistributeObjectOps Any_CreatePolicy = {
.address = NULL,
.markDistributed = false,
};
static DistributeObjectOps Any_CreatePublication = {
.deparse = DeparseCreatePublicationStmt,
.qualify = QualifyCreatePublicationStmt,
.preprocess = NULL,
.postprocess = PostProcessCreatePublicationStmt,
.operationType = DIST_OPS_CREATE,
.address = CreatePublicationStmtObjectAddress,
.markDistributed = true,
};
static DistributeObjectOps Any_CreateRole = {
.deparse = DeparseCreateRoleStmt,
.qualify = NULL,
@ -707,6 +716,45 @@ static DistributeObjectOps Procedure_Rename = {
.address = RenameFunctionStmtObjectAddress,
.markDistributed = false,
};
static DistributeObjectOps Publication_Alter = {
.deparse = DeparseAlterPublicationStmt,
.qualify = QualifyAlterPublicationStmt,
.preprocess = PreprocessAlterPublicationStmt,
.postprocess = PostprocessAlterDistributedObjectStmt,
.objectType = OBJECT_PUBLICATION,
.operationType = DIST_OPS_ALTER,
.address = AlterPublicationStmtObjectAddress,
.markDistributed = false,
};
static DistributeObjectOps Publication_AlterOwner = {
.deparse = DeparseAlterPublicationOwnerStmt,
.qualify = NULL,
.preprocess = PreprocessAlterDistributedObjectStmt,
.postprocess = PostprocessAlterDistributedObjectStmt,
.objectType = OBJECT_PUBLICATION,
.operationType = DIST_OPS_ALTER,
.address = AlterPublicationOwnerStmtObjectAddress,
.markDistributed = false,
};
static DistributeObjectOps Publication_Drop = {
.deparse = DeparseDropPublicationStmt,
.qualify = NULL,
.preprocess = PreprocessDropDistributedObjectStmt,
.postprocess = NULL,
.operationType = DIST_OPS_DROP,
.address = NULL,
.markDistributed = false,
};
static DistributeObjectOps Publication_Rename = {
.deparse = DeparseRenamePublicationStmt,
.qualify = NULL,
.preprocess = PreprocessAlterDistributedObjectStmt,
.postprocess = NULL,
.objectType = OBJECT_PUBLICATION,
.operationType = DIST_OPS_ALTER,
.address = RenamePublicationStmtObjectAddress,
.markDistributed = false,
};
static DistributeObjectOps Routine_AlterObjectDepends = {
.deparse = DeparseAlterFunctionDependsStmt,
.qualify = QualifyAlterFunctionDependsStmt,
@ -1399,6 +1447,11 @@ GetDistributeObjectOps(Node *node)
return &Procedure_AlterOwner;
}
case OBJECT_PUBLICATION:
{
return &Publication_AlterOwner;
}
case OBJECT_ROUTINE:
{
return &Routine_AlterOwner;
@ -1436,6 +1489,11 @@ GetDistributeObjectOps(Node *node)
return &Any_AlterPolicy;
}
case T_AlterPublicationStmt:
{
return &Publication_Alter;
}
case T_AlterRoleStmt:
{
return &Any_AlterRole;
@ -1610,6 +1668,11 @@ GetDistributeObjectOps(Node *node)
return &Any_CreatePolicy;
}
case T_CreatePublicationStmt:
{
return &Any_CreatePublication;
}
case T_CreateRoleStmt:
{
return &Any_CreateRole;
@ -1722,6 +1785,11 @@ GetDistributeObjectOps(Node *node)
return &Procedure_Drop;
}
case OBJECT_PUBLICATION:
{
return &Publication_Drop;
}
case OBJECT_ROUTINE:
{
return &Routine_Drop;
@ -1901,6 +1969,11 @@ GetDistributeObjectOps(Node *node)
return &Procedure_Rename;
}
case OBJECT_PUBLICATION:
{
return &Publication_Rename;
}
case OBJECT_ROUTINE:
{
return &Routine_Rename;

View File

@ -221,7 +221,8 @@ ErrorIfUnsupportedForeignConstraintExists(Relation relation, char referencingDis
if (!referencedIsCitus && !selfReferencingTable)
{
if (IsCitusLocalTableByDistParams(referencingDistMethod,
referencingReplicationModel))
referencingReplicationModel,
referencingColocationId))
{
ErrorOutForFKeyBetweenPostgresAndCitusLocalTable(referencedTableId);
}
@ -245,8 +246,7 @@ ErrorIfUnsupportedForeignConstraintExists(Relation relation, char referencingDis
if (!selfReferencingTable)
{
referencedDistMethod = PartitionMethod(referencedTableId);
referencedDistKey = IsCitusTableType(referencedTableId,
CITUS_TABLE_WITH_NO_DIST_KEY) ?
referencedDistKey = !HasDistributionKey(referencedTableId) ?
NULL :
DistPartitionKey(referencedTableId);
referencedColocationId = TableColocationId(referencedTableId);
@ -278,9 +278,17 @@ ErrorIfUnsupportedForeignConstraintExists(Relation relation, char referencingDis
}
bool referencingIsCitusLocalOrRefTable =
(referencingDistMethod == DISTRIBUTE_BY_NONE);
IsCitusLocalTableByDistParams(referencingDistMethod,
referencingReplicationModel,
referencingColocationId) ||
IsReferenceTableByDistParams(referencingDistMethod,
referencingReplicationModel);
bool referencedIsCitusLocalOrRefTable =
(referencedDistMethod == DISTRIBUTE_BY_NONE);
IsCitusLocalTableByDistParams(referencedDistMethod,
referencedReplicationModel,
referencedColocationId) ||
IsReferenceTableByDistParams(referencedDistMethod,
referencedReplicationModel);
if (referencingIsCitusLocalOrRefTable && referencedIsCitusLocalOrRefTable)
{
EnsureSupportedFKeyBetweenCitusLocalAndRefTable(constraintForm,
@ -313,7 +321,8 @@ ErrorIfUnsupportedForeignConstraintExists(Relation relation, char referencingDis
* reference table is referenced.
*/
bool referencedIsReferenceTable =
(referencedReplicationModel == REPLICATION_MODEL_2PC);
IsReferenceTableByDistParams(referencedDistMethod,
referencedReplicationModel);
if (!referencedIsReferenceTable && (
referencingColocationId == INVALID_COLOCATION_ID ||
referencingColocationId != referencedColocationId))

View File

@ -1190,7 +1190,7 @@ ErrorIfUnsupportedIndexStmt(IndexStmt *createIndexStatement)
* Non-distributed tables do not have a partition key, and unique constraints
* are allowed for them. Thus, we added a short-circuit for non-distributed tables.
*/
if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKey(relationId))
{
return;
}

View File

@ -36,6 +36,7 @@
#include "distributed/local_multi_copy.h"
#include "distributed/shard_utils.h"
#include "distributed/version_compat.h"
#include "distributed/replication_origin_session_utils.h"
/* managed via GUC, default is 512 kB */
int LocalCopyFlushThresholdByte = 512 * 1024;
@ -46,7 +47,7 @@ static void AddSlotToBuffer(TupleTableSlot *slot, CitusCopyDestReceiver *copyDes
static bool ShouldAddBinaryHeaders(StringInfo buffer, bool isBinary);
static bool ShouldSendCopyNow(StringInfo buffer);
static void DoLocalCopy(StringInfo buffer, Oid relationId, int64 shardId,
CopyStmt *copyStatement, bool isEndOfCopy);
CopyStmt *copyStatement, bool isEndOfCopy, bool isPublishable);
static int ReadFromLocalBufferCallback(void *outBuf, int minRead, int maxRead);
@ -94,7 +95,7 @@ WriteTupleToLocalShard(TupleTableSlot *slot, CitusCopyDestReceiver *copyDest, in
bool isEndOfCopy = false;
DoLocalCopy(localCopyOutState->fe_msgbuf, copyDest->distributedRelationId,
shardId,
copyDest->copyStatement, isEndOfCopy);
copyDest->copyStatement, isEndOfCopy, copyDest->isPublishable);
resetStringInfo(localCopyOutState->fe_msgbuf);
}
}
@ -133,7 +134,7 @@ FinishLocalCopyToShard(CitusCopyDestReceiver *copyDest, int64 shardId,
}
bool isEndOfCopy = true;
DoLocalCopy(localCopyOutState->fe_msgbuf, copyDest->distributedRelationId, shardId,
copyDest->copyStatement, isEndOfCopy);
copyDest->copyStatement, isEndOfCopy, copyDest->isPublishable);
}
@ -197,7 +198,7 @@ ShouldSendCopyNow(StringInfo buffer)
*/
static void
DoLocalCopy(StringInfo buffer, Oid relationId, int64 shardId, CopyStmt *copyStatement,
bool isEndOfCopy)
bool isEndOfCopy, bool isPublishable)
{
/*
* Set the buffer as a global variable to allow ReadFromLocalBufferCallback
@ -205,6 +206,10 @@ DoLocalCopy(StringInfo buffer, Oid relationId, int64 shardId, CopyStmt *copyStat
* ReadFromLocalBufferCallback.
*/
LocalCopyBuffer = buffer;
if (!isPublishable)
{
SetupReplicationOriginLocalSession();
}
Oid shardOid = GetTableLocalShardOid(relationId, shardId);
Relation shard = table_open(shardOid, RowExclusiveLock);
@ -219,6 +224,10 @@ DoLocalCopy(StringInfo buffer, Oid relationId, int64 shardId, CopyStmt *copyStat
EndCopyFrom(cstate);
table_close(shard, NoLock);
if (!isPublishable)
{
ResetReplicationOriginLocalSession();
}
free_parsestate(pState);
}
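/*
 * A note on the mechanism (a hedged sketch of the intent): setting up a
 * replication origin for the session tags the copied rows with a non-default
 * origin, which logical decoding output plugins can filter out. That way,
 * non-publishable copies (e.g. rows moved into shards during a table
 * conversion) are not forwarded to subscribers a second time.
 */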

View File

@ -85,6 +85,7 @@
#include "distributed/relation_access_tracking.h"
#include "distributed/remote_commands.h"
#include "distributed/remote_transaction.h"
#include "distributed/replication_origin_session_utils.h"
#include "distributed/resource_lock.h"
#include "distributed/shard_pruning.h"
#include "distributed/shared_connection_stats.h"
@ -270,7 +271,8 @@ static CopyConnectionState * GetConnectionState(HTAB *connectionStateHash,
static CopyShardState * GetShardState(uint64 shardId, HTAB *shardStateHash,
HTAB *connectionStateHash,
bool *found, bool shouldUseLocalCopy, CopyOutState
copyOutState, bool isColocatedIntermediateResult);
copyOutState, bool isColocatedIntermediateResult,
bool isPublishable);
static MultiConnection * CopyGetPlacementConnection(HTAB *connectionStateHash,
ShardPlacement *placement,
bool colocatedIntermediateResult);
@ -285,7 +287,8 @@ static void InitializeCopyShardState(CopyShardState *shardState,
uint64 shardId,
bool canUseLocalCopy,
CopyOutState copyOutState,
bool colocatedIntermediateResult);
bool colocatedIntermediateResult, bool
isPublishable);
static void StartPlacementStateCopyCommand(CopyPlacementState *placementState,
CopyStmt *copyStatement,
CopyOutState copyOutState);
@ -393,7 +396,7 @@ CitusCopyFrom(CopyStmt *copyStatement, QueryCompletion *completionTag)
if (IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED) ||
IsCitusTableTypeCacheEntry(cacheEntry, RANGE_DISTRIBUTED) ||
IsCitusTableTypeCacheEntry(cacheEntry, APPEND_DISTRIBUTED) ||
IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
!HasDistributionKeyCacheEntry(cacheEntry))
{
CopyToExistingShards(copyStatement, completionTag);
}
@ -492,9 +495,11 @@ CopyToExistingShards(CopyStmt *copyStatement, QueryCompletion *completionTag)
ExprContext *executorExpressionContext = GetPerTupleExprContext(executorState);
/* set up the destination for the COPY */
const bool publishableData = true;
CitusCopyDestReceiver *copyDest = CreateCitusCopyDestReceiver(tableId, columnNameList,
partitionColumnIndex,
executorState, NULL);
executorState, NULL,
publishableData);
/* if the user specified an explicit append-to_shard option, write to it */
uint64 appendShardId = ProcessAppendToShardOption(tableId, copyStatement);
@ -1934,7 +1939,7 @@ CopyFlushOutput(CopyOutState cstate, char *start, char *pointer)
CitusCopyDestReceiver *
CreateCitusCopyDestReceiver(Oid tableId, List *columnNameList, int partitionColumnIndex,
EState *executorState,
char *intermediateResultIdPrefix)
char *intermediateResultIdPrefix, bool isPublishable)
{
CitusCopyDestReceiver *copyDest = (CitusCopyDestReceiver *) palloc0(
sizeof(CitusCopyDestReceiver));
@ -1953,6 +1958,7 @@ CreateCitusCopyDestReceiver(Oid tableId, List *columnNameList, int partitionColu
copyDest->executorState = executorState;
copyDest->colocatedIntermediateResultIdPrefix = intermediateResultIdPrefix;
copyDest->memoryContext = CurrentMemoryContext;
copyDest->isPublishable = isPublishable;
return copyDest;
}
@ -2318,7 +2324,9 @@ CitusSendTupleToPlacements(TupleTableSlot *slot, CitusCopyDestReceiver *copyDest
&cachedShardStateFound,
copyDest->shouldUseLocalCopy,
copyDest->copyOutState,
isColocatedIntermediateResult);
isColocatedIntermediateResult,
copyDest->isPublishable);
if (!cachedShardStateFound)
{
firstTupleInShard = true;
@ -2751,6 +2759,11 @@ ShutdownCopyConnectionState(CopyConnectionState *connectionState,
if (activePlacementState != NULL)
{
EndPlacementStateCopyCommand(activePlacementState, copyOutState);
if (!copyDest->isPublishable)
{
ResetReplicationOriginRemoteSession(
activePlacementState->connectionState->connection);
}
}
dlist_foreach(iter, &connectionState->bufferedPlacementList)
@ -2764,6 +2777,10 @@ ShutdownCopyConnectionState(CopyConnectionState *connectionState,
SendCopyDataToPlacement(placementState->data, shardId,
connectionState->connection);
EndPlacementStateCopyCommand(placementState, copyOutState);
if (!copyDest->isPublishable)
{
ResetReplicationOriginRemoteSession(connectionState->connection);
}
}
}
@ -3436,7 +3453,7 @@ static CopyShardState *
GetShardState(uint64 shardId, HTAB *shardStateHash,
HTAB *connectionStateHash, bool *found, bool
shouldUseLocalCopy, CopyOutState copyOutState,
bool isColocatedIntermediateResult)
bool isColocatedIntermediateResult, bool isPublishable)
{
CopyShardState *shardState = (CopyShardState *) hash_search(shardStateHash, &shardId,
HASH_ENTER, found);
@ -3444,7 +3461,8 @@ GetShardState(uint64 shardId, HTAB *shardStateHash,
{
InitializeCopyShardState(shardState, connectionStateHash,
shardId, shouldUseLocalCopy,
copyOutState, isColocatedIntermediateResult);
copyOutState, isColocatedIntermediateResult,
isPublishable);
}
return shardState;
@ -3461,7 +3479,8 @@ InitializeCopyShardState(CopyShardState *shardState,
HTAB *connectionStateHash, uint64 shardId,
bool shouldUseLocalCopy,
CopyOutState copyOutState,
bool colocatedIntermediateResult)
bool colocatedIntermediateResult,
bool isPublishable)
{
ListCell *placementCell = NULL;
int failedPlacementCount = 0;
@ -3532,6 +3551,11 @@ InitializeCopyShardState(CopyShardState *shardState,
RemoteTransactionBeginIfNecessary(connection);
}
if (!isPublishable)
{
SetupReplicationOriginRemoteSession(connection);
}
CopyPlacementState *placementState = palloc0(sizeof(CopyPlacementState));
placementState->shardState = shardState;
placementState->data = makeStringInfo();

View File

@ -0,0 +1,634 @@
/*-------------------------------------------------------------------------
*
* publication.c
* Commands for creating publications
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "catalog/pg_publication.h"
#include "catalog/pg_publication_rel.h"
#include "distributed/commands.h"
#include "distributed/deparser.h"
#include "distributed/listutils.h"
#include "distributed/metadata_utility.h"
#include "distributed/metadata_sync.h"
#include "distributed/metadata/distobject.h"
#include "distributed/reference_table_utils.h"
#include "distributed/worker_create_or_replace.h"
#include "nodes/makefuncs.h"
#include "nodes/parsenodes.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "pg_version_compat.h"
static CreatePublicationStmt * BuildCreatePublicationStmt(Oid publicationId);
#if (PG_VERSION_NUM >= PG_VERSION_15)
static PublicationObjSpec * BuildPublicationRelationObjSpec(Oid relationId,
Oid publicationId,
bool tableOnly);
#endif
static void AppendPublishOptionList(StringInfo str, List *strings);
static char * AlterPublicationOwnerCommand(Oid publicationId);
static bool ShouldPropagateCreatePublication(CreatePublicationStmt *stmt);
static List * ObjectAddressForPublicationName(char *publicationName, bool missingOk);
/*
* PostProcessCreatePublicationStmt handles CREATE PUBLICATION statements
* that contain distributed tables.
*/
List *
PostProcessCreatePublicationStmt(Node *node, const char *queryString)
{
CreatePublicationStmt *stmt = castNode(CreatePublicationStmt, node);
if (!ShouldPropagateCreatePublication(stmt))
{
/* should not propagate right now */
return NIL;
}
/* call into CreatePublicationStmtObjectAddress */
List *publicationAddresses = GetObjectAddressListFromParseTree(node, false, true);
/* the code-path only supports a single object */
Assert(list_length(publicationAddresses) == 1);
if (IsAnyObjectAddressOwnedByExtension(publicationAddresses, NULL))
{
/* should not propagate publications owned by extensions */
return NIL;
}
EnsureAllObjectDependenciesExistOnAllNodes(publicationAddresses);
const ObjectAddress *pubAddress = linitial(publicationAddresses);
List *commands = NIL;
commands = lappend(commands, DISABLE_DDL_PROPAGATION);
commands = lappend(commands, CreatePublicationDDLCommand(pubAddress->objectId));
commands = lappend(commands, ENABLE_DDL_PROPAGATION);
return NodeDDLTaskList(NON_COORDINATOR_NODES, commands);
}
/*
* CreatePublicationDDLCommandsIdempotent returns a list of DDL statements to be
* executed on a node to recreate the publication addressed by the publicationAddress.
*/
List *
CreatePublicationDDLCommandsIdempotent(const ObjectAddress *publicationAddress)
{
Assert(publicationAddress->classId == PublicationRelationId);
char *ddlCommand =
CreatePublicationDDLCommand(publicationAddress->objectId);
char *alterPublicationOwnerSQL =
AlterPublicationOwnerCommand(publicationAddress->objectId);
return list_make2(
WrapCreateOrReplace(ddlCommand),
alterPublicationOwnerSQL);
}
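/*
 * For illustration (hypothetical publication "my_pub" owned by "alice"), and
 * assuming WrapCreateOrReplace wraps the DDL in a
 * worker_create_or_replace_object() call, the resulting list is roughly:
 *   SELECT worker_create_or_replace_object('CREATE PUBLICATION my_pub ...');
 *   ALTER PUBLICATION my_pub OWNER TO alice;
 */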
/*
* CreatePublicationDDLCommand returns the CREATE PUBLICATION string that
* can be used to recreate a given publication.
*/
char *
CreatePublicationDDLCommand(Oid publicationId)
{
CreatePublicationStmt *createPubStmt = BuildCreatePublicationStmt(publicationId);
/* we took the WHERE clause from the catalog where it is already transformed */
bool whereClauseRequiresTransform = false;
/* only propagate Citus tables in publication */
bool includeLocalTables = false;
return DeparseCreatePublicationStmtExtended((Node *) createPubStmt,
whereClauseRequiresTransform,
includeLocalTables);
}
/*
* BuildCreatePublicationStmt constructs a CreatePublicationStmt struct for the
* given publication.
*/
static CreatePublicationStmt *
BuildCreatePublicationStmt(Oid publicationId)
{
CreatePublicationStmt *createPubStmt = makeNode(CreatePublicationStmt);
HeapTuple publicationTuple =
SearchSysCache1(PUBLICATIONOID, ObjectIdGetDatum(publicationId));
if (!HeapTupleIsValid(publicationTuple))
{
ereport(ERROR, (errmsg("cannot find publication with oid: %d", publicationId)));
}
Form_pg_publication publicationForm =
(Form_pg_publication) GETSTRUCT(publicationTuple);
/* CREATE PUBLICATION <name> */
createPubStmt->pubname = pstrdup(NameStr(publicationForm->pubname));
/* FOR ALL TABLES */
createPubStmt->for_all_tables = publicationForm->puballtables;
ReleaseSysCache(publicationTuple);
#if (PG_VERSION_NUM >= PG_VERSION_15)
List *schemaIds = GetPublicationSchemas(publicationId);
Oid schemaId = InvalidOid;
foreach_oid(schemaId, schemaIds)
{
char *schemaName = get_namespace_name(schemaId);
PublicationObjSpec *publicationObject = makeNode(PublicationObjSpec);
publicationObject->pubobjtype = PUBLICATIONOBJ_TABLES_IN_SCHEMA;
publicationObject->pubtable = NULL;
publicationObject->name = schemaName;
publicationObject->location = -1;
createPubStmt->pubobjects = lappend(createPubStmt->pubobjects, publicationObject);
}
#endif
List *relationIds = GetPublicationRelations(publicationId,
publicationForm->pubviaroot ?
PUBLICATION_PART_ROOT :
PUBLICATION_PART_LEAF);
Oid relationId = InvalidOid;
int citusTableCount PG_USED_FOR_ASSERTS_ONLY = 0;
/* mainly for consistent ordering in test output */
relationIds = SortList(relationIds, CompareOids);
foreach_oid(relationId, relationIds)
{
#if (PG_VERSION_NUM >= PG_VERSION_15)
bool tableOnly = false;
/* since postgres 15, tables can have a column list and filter */
PublicationObjSpec *publicationObject =
BuildPublicationRelationObjSpec(relationId, publicationId, tableOnly);
createPubStmt->pubobjects = lappend(createPubStmt->pubobjects, publicationObject);
#else
/* before postgres 15, only full tables are supported */
char *schemaName = get_namespace_name(get_rel_namespace(relationId));
char *tableName = get_rel_name(relationId);
RangeVar *rangeVar = makeRangeVar(schemaName, tableName, -1);
createPubStmt->tables = lappend(createPubStmt->tables, rangeVar);
#endif
if (IsCitusTable(relationId))
{
citusTableCount++;
}
}
/* WITH (publish_via_partition_root = true) option */
bool publishViaRoot = publicationForm->pubviaroot;
char *publishViaRootString = publishViaRoot ? "true" : "false";
DefElem *pubViaRootOption = makeDefElem("publish_via_partition_root",
(Node *) makeString(publishViaRootString),
-1);
createPubStmt->options = lappend(createPubStmt->options, pubViaRootOption);
/* WITH (publish = 'insert, update, delete, truncate') option */
List *publishList = NIL;
if (publicationForm->pubinsert)
{
publishList = lappend(publishList, makeString("insert"));
}
if (publicationForm->pubupdate)
{
publishList = lappend(publishList, makeString("update"));
}
if (publicationForm->pubdelete)
{
publishList = lappend(publishList, makeString("delete"));
}
if (publicationForm->pubtruncate)
{
publishList = lappend(publishList, makeString("truncate"));
}
if (list_length(publishList) > 0)
{
StringInfo optionValue = makeStringInfo();
AppendPublishOptionList(optionValue, publishList);
DefElem *publishOption = makeDefElem("publish",
(Node *) makeString(optionValue->data), -1);
createPubStmt->options = lappend(createPubStmt->options, publishOption);
}
return createPubStmt;
}
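/*
 * As a rough example (hypothetical names), the statement built above deparses
 * to something like:
 *   CREATE PUBLICATION my_pub FOR TABLE public.dist_table
 *   WITH (publish_via_partition_root = 'false', publish = 'insert, update');
 */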
/*
* AppendPublishOptionList appends a list of publication options in
* comma-separated form.
*/
static void
AppendPublishOptionList(StringInfo str, List *options)
{
ListCell *stringCell = NULL;
foreach(stringCell, options)
{
const char *string = strVal(lfirst(stringCell));
if (stringCell != list_head(options))
{
appendStringInfoString(str, ", ");
}
/* we cannot escape these strings */
appendStringInfoString(str, string);
}
}
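/*
 * For example, given the options list ["insert", "update", "truncate"], the
 * buffer ends up containing "insert, update, truncate".
 */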
#if (PG_VERSION_NUM >= PG_VERSION_15)
/*
* BuildPublicationRelationObjSpec returns a PublicationObjSpec that
* can be included in a CREATE or ALTER PUBLICATION statement.
*/
static PublicationObjSpec *
BuildPublicationRelationObjSpec(Oid relationId, Oid publicationId,
bool tableOnly)
{
HeapTuple pubRelationTuple = SearchSysCache2(PUBLICATIONRELMAP,
ObjectIdGetDatum(relationId),
ObjectIdGetDatum(publicationId));
if (!HeapTupleIsValid(pubRelationTuple))
{
ereport(ERROR, (errmsg("cannot find relation with oid %d in publication "
"with oid %d", relationId, publicationId)));
}
List *columnNameList = NIL;
Node *whereClause = NULL;
/* build the column list */
if (!tableOnly)
{
bool isNull = false;
Datum attributesDatum = SysCacheGetAttr(PUBLICATIONRELMAP, pubRelationTuple,
Anum_pg_publication_rel_prattrs,
&isNull);
if (!isNull)
{
ArrayType *attributesArray = DatumGetArrayTypeP(attributesDatum);
int attributeCount = ARR_DIMS(attributesArray)[0];
int16 *elems = (int16 *) ARR_DATA_PTR(attributesArray);
for (int attNumIndex = 0; attNumIndex < attributeCount; attNumIndex++)
{
AttrNumber attributeNumber = elems[attNumIndex];
char *columnName = get_attname(relationId, attributeNumber, false);
columnNameList = lappend(columnNameList, makeString(columnName));
}
}
/* build the WHERE clause */
Datum whereClauseDatum = SysCacheGetAttr(PUBLICATIONRELMAP, pubRelationTuple,
Anum_pg_publication_rel_prqual,
&isNull);
if (!isNull)
{
/*
* We use the already-transformed parse tree form here, which does
* not match regular CreatePublicationStmt
*/
whereClause = stringToNode(TextDatumGetCString(whereClauseDatum));
}
}
ReleaseSysCache(pubRelationTuple);
char *schemaName = get_namespace_name(get_rel_namespace(relationId));
char *tableName = get_rel_name(relationId);
RangeVar *rangeVar = makeRangeVar(schemaName, tableName, -1);
/* build the FOR TABLE */
PublicationTable *publicationTable =
makeNode(PublicationTable);
publicationTable->relation = rangeVar;
publicationTable->whereClause = whereClause;
publicationTable->columns = columnNameList;
PublicationObjSpec *publicationObject = makeNode(PublicationObjSpec);
publicationObject->pubobjtype = PUBLICATIONOBJ_TABLE;
publicationObject->pubtable = publicationTable;
publicationObject->name = NULL;
publicationObject->location = -1;
return publicationObject;
}
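/*
 * For illustration (hypothetical table): with a column list and a row filter,
 * the object spec built above deparses to something like
 *   TABLE public.orders (order_id, status) WHERE (status <> 'draft')
 * within the surrounding CREATE/ALTER PUBLICATION statement.
 */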
#endif
/*
* PreprocessAlterPublicationStmt handles ALTER PUBLICATION statements
* in a way that is mostly similar to PreprocessAlterDistributedObjectStmt,
* except we do not ensure sequential mode (publications do not interact with
* shards) and can handle NULL deparse commands for ALTER PUBLICATION commands
* that only involve local tables.
*/
List *
PreprocessAlterPublicationStmt(Node *stmt, const char *queryString,
ProcessUtilityContext processUtilityContext)
{
List *addresses = GetObjectAddressListFromParseTree(stmt, false, false);
/* the code-path only supports a single object */
Assert(list_length(addresses) == 1);
if (!ShouldPropagateAnyObject(addresses))
{
return NIL;
}
EnsureCoordinator();
QualifyTreeNode(stmt);
const char *sql = DeparseTreeNode((Node *) stmt);
if (sql == NULL)
{
/*
* Deparsing logic decided that there is nothing to propagate, e.g.
* because the command only concerns local tables.
*/
return NIL;
}
List *commands = list_make3(DISABLE_DDL_PROPAGATION,
(void *) sql,
ENABLE_DDL_PROPAGATION);
return NodeDDLTaskList(NON_COORDINATOR_NODES, commands);
}
/*
* GetAlterPublicationDDLCommandsForTable gets a list of ALTER PUBLICATION .. ADD/DROP
* commands for the given table.
*
* If isAdd is true, it returns ALTER PUBLICATION .. ADD TABLE commands for all
* publications.
*
* Otherwise, it returns ALTER PUBLICATION .. DROP TABLE commands for all
* publications.
*/
List *
GetAlterPublicationDDLCommandsForTable(Oid relationId, bool isAdd)
{
List *commands = NIL;
List *publicationIds = GetRelationPublications(relationId);
Oid publicationId = InvalidOid;
foreach_oid(publicationId, publicationIds)
{
char *command = GetAlterPublicationTableDDLCommand(publicationId,
relationId, isAdd);
commands = lappend(commands, command);
}
return commands;
}
/*
* GetAlterPublicationTableDDLCommand generates an ALTER PUBLICATION .. ADD/DROP TABLE
* command for the given publication and relation ID.
*
* If isAdd is true, it returns an ALTER PUBLICATION .. ADD TABLE command.
* Otherwise, it returns an ALTER PUBLICATION .. DROP TABLE command.
*/
char *
GetAlterPublicationTableDDLCommand(Oid publicationId, Oid relationId,
bool isAdd)
{
HeapTuple pubTuple = SearchSysCache1(PUBLICATIONOID,
ObjectIdGetDatum(publicationId));
if (!HeapTupleIsValid(pubTuple))
{
ereport(ERROR, (errmsg("cannot find publication with oid: %d",
publicationId)));
}
Form_pg_publication pubForm = (Form_pg_publication) GETSTRUCT(pubTuple);
AlterPublicationStmt *alterPubStmt = makeNode(AlterPublicationStmt);
alterPubStmt->pubname = pstrdup(NameStr(pubForm->pubname));
ReleaseSysCache(pubTuple);
#if (PG_VERSION_NUM >= PG_VERSION_15)
bool tableOnly = !isAdd;
/* since postgres 15, tables can have a column list and filter */
PublicationObjSpec *publicationObject =
BuildPublicationRelationObjSpec(relationId, publicationId, tableOnly);
alterPubStmt->pubobjects = lappend(alterPubStmt->pubobjects, publicationObject);
alterPubStmt->action = isAdd ? AP_AddObjects : AP_DropObjects;
#else
/* before postgres 15, only full tables are supported */
char *schemaName = get_namespace_name(get_rel_namespace(relationId));
char *tableName = get_rel_name(relationId);
RangeVar *rangeVar = makeRangeVar(schemaName, tableName, -1);
alterPubStmt->tables = lappend(alterPubStmt->tables, rangeVar);
alterPubStmt->tableAction = isAdd ? DEFELEM_ADD : DEFELEM_DROP;
#endif
/* we take the WHERE clause from the catalog where it is already transformed */
bool whereClauseNeedsTransform = false;
/*
* We use these commands to restore publications before/after transforming a
* table, including transformations to/from local tables.
*/
bool includeLocalTables = true;
char *command = DeparseAlterPublicationStmtExtended((Node *) alterPubStmt,
whereClauseNeedsTransform,
includeLocalTables);
return command;
}
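/*
 * Example output (hypothetical names): for isAdd = true this yields something
 * like "ALTER PUBLICATION my_pub ADD TABLE public.orders"; for isAdd = false,
 * the corresponding DROP TABLE form.
 */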
/*
* AlterPublicationOwnerCommand returns "ALTER PUBLICATION .. OWNER TO .."
* statement for the specified publication.
*/
static char *
AlterPublicationOwnerCommand(Oid publicationId)
{
HeapTuple publicationTuple =
SearchSysCache1(PUBLICATIONOID, ObjectIdGetDatum(publicationId));
if (!HeapTupleIsValid(publicationTuple))
{
ereport(ERROR, (errmsg("cannot find publication with oid: %d",
publicationId)));
}
Form_pg_publication publicationForm =
(Form_pg_publication) GETSTRUCT(publicationTuple);
char *publicationName = NameStr(publicationForm->pubname);
Oid publicationOwnerId = publicationForm->pubowner;
char *publicationOwnerName = GetUserNameFromId(publicationOwnerId, false);
StringInfo alterCommand = makeStringInfo();
appendStringInfo(alterCommand, "ALTER PUBLICATION %s OWNER TO %s",
quote_identifier(publicationName),
quote_identifier(publicationOwnerName));
ReleaseSysCache(publicationTuple);
return alterCommand->data;
}
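/*
 * For example (hypothetical owner), this produces
 * "ALTER PUBLICATION my_pub OWNER TO alice".
 */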
/*
* ShouldPropagateCreatePublication tests if we need to propagate a CREATE PUBLICATION
* statement.
*/
static bool
ShouldPropagateCreatePublication(CreatePublicationStmt *stmt)
{
if (!ShouldPropagate())
{
return false;
}
if (!ShouldPropagateCreateInCoordinatedTransction())
{
return false;
}
return true;
}
/*
* AlterPublicationStmtObjectAddress generates the object address for the
* publication altered by a regular ALTER PUBLICATION .. statement.
*/
List *
AlterPublicationStmtObjectAddress(Node *node, bool missingOk, bool isPostProcess)
{
AlterPublicationStmt *stmt = castNode(AlterPublicationStmt, node);
return ObjectAddressForPublicationName(stmt->pubname, missingOk);
}
/*
* AlterPublicationOwnerStmtObjectAddress generates the object address for the
* publication altered by the given ALTER PUBLICATION .. OWNER TO statement.
*/
List *
AlterPublicationOwnerStmtObjectAddress(Node *node, bool missingOk, bool isPostProcess)
{
AlterOwnerStmt *stmt = castNode(AlterOwnerStmt, node);
return ObjectAddressForPublicationName(strVal(stmt->object), missingOk);
}
/*
* CreatePublicationStmtObjectAddress generates the object address for the
* publication created by the given CREATE PUBLICATION statement.
*/
List *
CreatePublicationStmtObjectAddress(Node *node, bool missingOk, bool isPostProcess)
{
CreatePublicationStmt *stmt = castNode(CreatePublicationStmt, node);
return ObjectAddressForPublicationName(stmt->pubname, missingOk);
}
/*
* RenamePublicationStmtObjectAddress generates the object address for the
* publication altered by the given ALTER PUBLICATION .. RENAME TO statement.
*/
List *
RenamePublicationStmtObjectAddress(Node *node, bool missingOk, bool isPostprocess)
{
RenameStmt *stmt = castNode(RenameStmt, node);
return ObjectAddressForPublicationName(strVal(stmt->object), missingOk);
}
/*
* ObjectAddressForPublicationName returns the object address for a given publication
* name.
*/
static List *
ObjectAddressForPublicationName(char *publicationName, bool missingOk)
{
Oid publicationId = InvalidOid;
HeapTuple publicationTuple =
SearchSysCache1(PUBLICATIONNAME, CStringGetDatum(publicationName));
if (HeapTupleIsValid(publicationTuple))
{
Form_pg_publication publicationForm =
(Form_pg_publication) GETSTRUCT(publicationTuple);
publicationId = publicationForm->oid;
ReleaseSysCache(publicationTuple);
}
else if (!missingOk)
{
/* it should have just been created */
ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("publication \"%s\" does not exist", publicationName)));
}
ObjectAddress *address = palloc0(sizeof(ObjectAddress));
ObjectAddressSet(*address, PublicationRelationId, publicationId);
return list_make1(address);
}
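/*
 * A minimal usage sketch (not part of this patch; the publication name and
 * the guard macro are illustrative): resolving a publication name into the
 * single-element ObjectAddress list returned above, tolerating a missing
 * publication.
 */
#ifdef CITUS_USAGE_SKETCH
static void
ExampleResolvePublicationAddress(void)
{
	bool missingOk = true;
	List *addresses = ObjectAddressForPublicationName("my_pub", missingOk);

	/* the helper always returns a single-element list */
	ObjectAddress *address = linitial(addresses);
	if (!OidIsValid(address->objectId))
	{
		elog(NOTICE, "publication \"my_pub\" does not exist");
	}
}
#endif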

View File

@ -33,7 +33,8 @@
/* Local functions forward declarations for helper functions */
static bool OptionsSpecifyOwnedBy(List *optionList, Oid *ownedByTableId);
static Oid SequenceUsedInDistributedTable(const ObjectAddress *sequenceAddress);
static Oid SequenceUsedInDistributedTable(const ObjectAddress *sequenceAddress, char
depType);
static List * FilterDistributedSequences(GrantStmt *stmt);
@ -183,7 +184,7 @@ ExtractDefaultColumnsAndOwnedSequences(Oid relationId, List **columnNameList,
char *columnName = NameStr(attributeForm->attname);
List *columnOwnedSequences =
getOwnedSequences_internal(relationId, attributeIndex + 1, 0);
getOwnedSequences_internal(relationId, attributeIndex + 1, DEPENDENCY_AUTO);
if (attributeForm->atthasdef && list_length(columnOwnedSequences) == 0)
{
@ -453,21 +454,22 @@ PreprocessAlterSequenceStmt(Node *node, const char *queryString,
/* the code-path only supports a single object */
Assert(list_length(addresses) == 1);
/* We have already asserted that we have exactly 1 address in the addresses. */
ObjectAddress *address = linitial(addresses);
/* error out if the sequence is distributed */
if (IsAnyObjectDistributed(addresses))
if (IsAnyObjectDistributed(addresses) || SequenceUsedInDistributedTable(address,
DEPENDENCY_INTERNAL))
{
ereport(ERROR, (errmsg(
"Altering a distributed sequence is currently not supported.")));
}
/* We have already asserted that we have exactly 1 address in the addresses. */
ObjectAddress *address = linitial(addresses);
/*
* error out if the sequence is used in a distributed table
* and this is an ALTER SEQUENCE .. AS .. statement
*/
Oid citusTableId = SequenceUsedInDistributedTable(address);
Oid citusTableId = SequenceUsedInDistributedTable(address, DEPENDENCY_AUTO);
if (citusTableId != InvalidOid)
{
List *options = stmt->options;
@ -497,16 +499,19 @@ PreprocessAlterSequenceStmt(Node *node, const char *queryString,
* SequenceUsedInDistributedTable returns the ID of the Citus table that uses
* the argument sequence as the default value of a column, or InvalidOid if
* no distributed table uses the sequence.
* See DependencyType for the possible values of depType.
* We use DEPENDENCY_INTERNAL for sequences created by identity columns and
* DEPENDENCY_AUTO for regular sequences.
*/
static Oid
SequenceUsedInDistributedTable(const ObjectAddress *sequenceAddress)
SequenceUsedInDistributedTable(const ObjectAddress *sequenceAddress, char depType)
{
List *citusTableIdList = CitusTableTypeIdList(ANY_CITUS_TABLE_TYPE);
Oid citusTableId = InvalidOid;
foreach_oid(citusTableId, citusTableIdList)
{
List *seqInfoList = NIL;
GetDependentSequencesWithRelation(citusTableId, &seqInfoList, 0);
GetDependentSequencesWithRelation(citusTableId, &seqInfoList, 0, depType);
SequenceInfo *seqInfo = NULL;
foreach_ptr(seqInfo, seqInfoList)
{

View File

@ -75,7 +75,7 @@ static void DistributePartitionUsingParent(Oid parentRelationId,
static void ErrorIfMultiLevelPartitioning(Oid parentRelationId, Oid partitionRelationId);
static void ErrorIfAttachCitusTableToPgLocalTable(Oid parentRelationId,
Oid partitionRelationId);
static bool AlterTableDefinesFKeyBetweenPostgresAndNonDistTable(
static bool ATDefinesFKeyBetweenPostgresAndCitusLocalOrRef(
AlterTableStmt *alterTableStatement);
static bool ShouldMarkConnectedRelationsNotAutoConverted(Oid leftRelationId,
Oid rightRelationId);
@ -1119,7 +1119,7 @@ PreprocessAlterTableStmt(Node *node, const char *alterTableCommand,
if (ShouldEnableLocalReferenceForeignKeys() &&
processUtilityContext != PROCESS_UTILITY_SUBCOMMAND &&
AlterTableDefinesFKeyBetweenPostgresAndNonDistTable(alterTableStatement))
ATDefinesFKeyBetweenPostgresAndCitusLocalOrRef(alterTableStatement))
{
/*
* We don't process subcommands generated by postgres.
@ -1378,29 +1378,6 @@ PreprocessAlterTableStmt(Node *node, const char *alterTableCommand,
}
}
/*
* We check for ADD COLUMN .. GENERATED .. AS IDENTITY expr
* since it uses a sequence as an internal dependency
* we should deparse the statement
*/
constraint = NULL;
foreach_ptr(constraint, columnConstraints)
{
if (constraint->contype == CONSTR_IDENTITY)
{
deparseAT = true;
useInitialDDLCommandString = false;
/*
* Since we don't support constraints for AT_AddColumn
* we have to set is_not_null to true explicitly for identity columns
*/
ColumnDef *newColDef = copyObject(columnDefinition);
newColDef->constraints = NULL;
newColDef->is_not_null = true;
newCmd->def = (Node *) newColDef;
}
}
/*
* We check for ADD COLUMN .. SERIAL pseudo-type
@ -1584,12 +1561,12 @@ PreprocessAlterTableStmt(Node *node, const char *alterTableCommand,
/*
* AlterTableDefinesFKeyBetweenPostgresAndNonDistTable returns true if given
* ATDefinesFKeyBetweenPostgresAndCitusLocalOrRef returns true if given
* alter table command defines foreign key between a postgres table and a
* reference or citus local table.
*/
static bool
AlterTableDefinesFKeyBetweenPostgresAndNonDistTable(AlterTableStmt *alterTableStatement)
ATDefinesFKeyBetweenPostgresAndCitusLocalOrRef(AlterTableStmt *alterTableStatement)
{
List *foreignKeyConstraintList =
GetAlterTableAddFKeyConstraintList(alterTableStatement);
@ -1607,9 +1584,12 @@ AlterTableDefinesFKeyBetweenPostgresAndNonDistTable(AlterTableStmt *alterTableSt
if (!IsCitusTable(leftRelationId))
{
return RelationIdListContainsCitusTableType(rightRelationIdList,
CITUS_TABLE_WITH_NO_DIST_KEY);
CITUS_LOCAL_TABLE) ||
RelationIdListContainsCitusTableType(rightRelationIdList,
REFERENCE_TABLE);
}
else if (IsCitusTableType(leftRelationId, CITUS_TABLE_WITH_NO_DIST_KEY))
else if (IsCitusTableType(leftRelationId, CITUS_LOCAL_TABLE) ||
IsCitusTableType(leftRelationId, REFERENCE_TABLE))
{
return RelationIdListContainsPostgresTable(rightRelationIdList);
}
@ -2539,34 +2519,6 @@ PostprocessAlterTableStmt(AlterTableStmt *alterTableStatement)
}
}
}
/*
* We check for ADD COLUMN .. GENERATED AS IDENTITY expr
* since it uses a seqeunce as an internal dependency
*/
constraint = NULL;
foreach_ptr(constraint, columnConstraints)
{
if (constraint->contype == CONSTR_IDENTITY)
{
AttrNumber attnum = get_attnum(relationId,
columnDefinition->colname);
bool missing_ok = false;
Oid seqOid = getIdentitySequence(relationId, attnum, missing_ok);
if (ShouldSyncTableMetadata(relationId))
{
needMetadataSyncForNewSequences = true;
alterTableDefaultNextvalCmd =
GetAddColumnWithNextvalDefaultCmd(seqOid,
relationId,
columnDefinition
->colname,
columnDefinition
->typeName);
}
}
}
}
/*
* We check for ALTER COLUMN .. SET DEFAULT nextval('user_defined_seq')
@ -3222,6 +3174,17 @@ ErrorIfUnsupportedAlterTableStmt(AlterTableStmt *alterTableStatement)
{
if (columnConstraint->contype == CONSTR_IDENTITY)
{
/*
* We currently don't support adding an identity column for an MX table
*/
if (ShouldSyncTableMetadata(relationId))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg(
"cannot execute ADD COLUMN commands involving identity"
" columns when metadata is synchronized to workers")));
}
/*
* Currently we don't support backfilling the new identity column with default values
* if the table is not empty
@ -3352,7 +3315,8 @@ ErrorIfUnsupportedAlterTableStmt(AlterTableStmt *alterTableStatement)
*/
AttrNumber attnum = get_attnum(relationId, command->name);
List *seqInfoList = NIL;
GetDependentSequencesWithRelation(relationId, &seqInfoList, attnum);
GetDependentSequencesWithRelation(relationId, &seqInfoList, attnum,
DEPENDENCY_AUTO);
if (seqInfoList != NIL)
{
ereport(ERROR, (errmsg("cannot execute ALTER COLUMN TYPE .. command "
@ -3666,7 +3630,7 @@ SetupExecutionModeForAlterTable(Oid relationId, AlterTableCmd *command)
* sequential mode.
*/
if (executeSequentially &&
!IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY) &&
HasDistributionKey(relationId) &&
ParallelQueryExecutedInTransaction())
{
char *relationName = get_rel_name(relationId);
@ -4011,3 +3975,59 @@ MakeNameListFromRangeVar(const RangeVar *rel)
return list_make1(makeString(rel->relname));
}
}
/*
* ErrorIfTableHasUnsupportedIdentityColumn errors out if the given table has any identity column other than a bigint identity column.
*/
void
ErrorIfTableHasUnsupportedIdentityColumn(Oid relationId)
{
Relation relation = relation_open(relationId, AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(relation);
for (int attributeIndex = 0; attributeIndex < tupleDescriptor->natts;
attributeIndex++)
{
Form_pg_attribute attributeForm = TupleDescAttr(tupleDescriptor, attributeIndex);
if (attributeForm->attidentity && attributeForm->atttypid != INT8OID)
{
char *qualifiedRelationName = generate_qualified_relation_name(relationId);
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg(
"cannot complete operation on %s with smallint/int identity column",
qualifiedRelationName),
errhint(
"Use bigint identity column instead.")));
}
}
relation_close(relation, NoLock);
}
/*
* ErrorIfTableHasIdentityColumn errors out if the given table has an identity column
*/
void
ErrorIfTableHasIdentityColumn(Oid relationId)
{
Relation relation = relation_open(relationId, AccessShareLock);
TupleDesc tupleDescriptor = RelationGetDescr(relation);
for (int attributeIndex = 0; attributeIndex < tupleDescriptor->natts;
attributeIndex++)
{
Form_pg_attribute attributeForm = TupleDescAttr(tupleDescriptor, attributeIndex);
if (attributeForm->attidentity)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg(
"cannot complete operation on a table with identity column")));
}
}
relation_close(relation, NoLock);
}
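/*
 * Illustrative sketch (not part of this patch; assumes relationId refers to
 * an existing table and that the caller chooses which check applies): code
 * paths that are about to distribute or alter a table can combine the two
 * checks above.
 */
#ifdef CITUS_USAGE_SKETCH
static void
ExampleIdentityColumnChecks(Oid relationId, bool allowBigintIdentity)
{
	if (allowBigintIdentity)
	{
		/* errors out only for smallint/int identity columns */
		ErrorIfTableHasUnsupportedIdentityColumn(relationId);
	}
	else
	{
		/* errors out for any identity column */
		ErrorIfTableHasIdentityColumn(relationId);
	}
}
#endif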

View File

@ -324,7 +324,7 @@ ExecuteTruncateStmtSequentialIfNecessary(TruncateStmt *command)
{
Oid relationId = RangeVarGetRelid(rangeVar, NoLock, failOK);
if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY) &&
if (IsCitusTable(relationId) && !HasDistributionKey(relationId) &&
TableReferenced(relationId))
{
char *relationName = get_rel_name(relationId);

View File

@ -53,6 +53,7 @@
#include "distributed/coordinator_protocol.h"
#include "distributed/deparser.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/executor_util.h"
#include "distributed/foreign_key_relationship.h"
#include "distributed/listutils.h"
#include "distributed/local_executor.h"

View File

@ -1202,6 +1202,17 @@ FinishConnectionEstablishment(MultiConnection *connection)
}
/*
* ForceConnectionCloseAtTransactionEnd marks connection to be closed at the end of the
* transaction.
*/
void
ForceConnectionCloseAtTransactionEnd(MultiConnection *connection)
{
connection->forceCloseAtTransactionEnd = true;
}
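/*
 * Usage sketch (hypothetical caller, not part of this patch): after changing
 * session-level state on a worker connection that should not leak into later
 * transactions, mark the connection so ShouldShutdownConnection discards it
 * instead of returning it to the connection cache.
 */
#ifdef CITUS_USAGE_SKETCH
static void
ExampleTaintConnection(MultiConnection *connection)
{
	ForceConnectionCloseAtTransactionEnd(connection);
}
#endif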
/*
* ClaimConnectionExclusively signals that this connection is actively being
* used. That means it'll not be, again, returned by
@ -1484,6 +1495,7 @@ AfterXactHostConnectionHandling(ConnectionHashEntry *entry, bool isCommit)
* - Current cached connections is already at MaxCachedConnectionsPerWorker
* - Connection is forced to close at the end of transaction
* - Connection is not in OK state
* - Connection has a replication origin setup
* - A transaction is still in progress (usually because we are cancelling a distributed transaction)
* - A connection reached its maximum lifetime
*/
@ -1503,6 +1515,7 @@ ShouldShutdownConnection(MultiConnection *connection, const int cachedConnection
PQstatus(connection->pgConn) != CONNECTION_OK ||
!RemoteTransactionIdle(connection) ||
connection->requiresReplication ||
connection->isReplicationOriginSessionSetup ||
(MaxCachedConnectionLifetime >= 0 &&
MillisecondsToTimeout(connection->connectionEstablishmentStart,
MaxCachedConnectionLifetime) <= 0);

View File

@ -573,6 +573,47 @@ SendRemoteCommand(MultiConnection *connection, const char *command)
}
/*
* ExecuteRemoteCommandAndCheckResult executes the given command on the remote
* node and checks whether the result matches the expected value, returning
* true on a match and false otherwise.
*/
bool
ExecuteRemoteCommandAndCheckResult(MultiConnection *connection, char *command,
char *expected)
{
if (!SendRemoteCommand(connection, command))
{
/* if we cannot connect, we warn and report false */
ReportConnectionError(connection, WARNING);
return false;
}
bool raiseInterrupts = true;
PGresult *queryResult = GetRemoteCommandResult(connection, raiseInterrupts);
/* if remote node throws an error, we also throw an error */
if (!IsResponseOK(queryResult))
{
ReportResultError(connection, queryResult, ERROR);
}
StringInfo queryResultString = makeStringInfo();
/* Evaluate the queryResult and store it into the queryResultString */
bool success = EvaluateSingleQueryResult(connection, queryResult, queryResultString);
bool result = false;
if (success && strcmp(queryResultString->data, expected) == 0)
{
result = true;
}
PQclear(queryResult);
ForgetResults(connection);
return result;
}
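/*
 * Usage sketch (assumptions: an established connection, and a query and
 * expected value that are purely illustrative): verify a remote session
 * setting and discard the connection at transaction end when it differs.
 */
#ifdef CITUS_USAGE_SKETCH
static void
ExampleVerifyRemoteSetting(MultiConnection *connection)
{
	if (!ExecuteRemoteCommandAndCheckResult(connection,
											"SHOW is_superuser;", "on"))
	{
		/* defined in connection_management.c above */
		ForceConnectionCloseAtTransactionEnd(connection);
	}
}
#endif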
/*
* ReadFirstColumnAsText reads the first column of result tuples from the given
* PGresult struct and returns them in a StringInfo list.

View File

@ -304,10 +304,7 @@ pg_get_sequencedef(Oid sequenceRelationId)
* When it's WORKER_NEXTVAL_SEQUENCE_DEFAULTS, the function creates the DEFAULT
* clause using worker_nextval('sequence') and not nextval('sequence')
* When IncludeIdentities is NO_IDENTITY, the function does not include identity column
* specifications. When it's INCLUDE_IDENTITY_AS_SEQUENCE_DEFAULTS, the function
* uses sequences and set them as default values for identity columns by using exactly
* the same approach with worker_nextval('sequence') & nextval('sequence') logic
* desribed above. When it's INCLUDE_IDENTITY it creates GENERATED .. AS IDENTIY clauses.
* specifications. When it's INCLUDE_IDENTITY it creates GENERATED .. AS IDENTITY clauses.
*/
char *
pg_get_tableschemadef_string(Oid tableRelationId, IncludeSequenceDefaults
@ -403,26 +400,9 @@ pg_get_tableschemadef_string(Oid tableRelationId, IncludeSequenceDefaults
Oid seqOid = getIdentitySequence(RelationGetRelid(relation),
attributeForm->attnum, missing_ok);
char *sequenceName = generate_qualified_relation_name(seqOid);
if (includeIdentityDefaults == INCLUDE_IDENTITY_AS_SEQUENCE_DEFAULTS)
{
if (pg_get_sequencedef(seqOid)->seqtypid != INT8OID)
{
appendStringInfo(&buffer,
" DEFAULT worker_nextval(%s::regclass)",
quote_literal_cstr(sequenceName));
}
else
{
appendStringInfo(&buffer, " DEFAULT nextval(%s::regclass)",
quote_literal_cstr(sequenceName));
}
}
else if (includeIdentityDefaults == INCLUDE_IDENTITY)
if (includeIdentityDefaults == INCLUDE_IDENTITY)
{
Form_pg_sequence pgSequenceForm = pg_get_sequencedef(seqOid);
uint64 sequenceStart = nextval_internal(seqOid, false);
char *sequenceDef = psprintf(
" GENERATED %s AS IDENTITY (INCREMENT BY " INT64_FORMAT \
" MINVALUE " INT64_FORMAT " MAXVALUE "
@ -433,7 +413,8 @@ pg_get_tableschemadef_string(Oid tableRelationId, IncludeSequenceDefaults
"ALWAYS" : "BY DEFAULT",
pgSequenceForm->seqincrement,
pgSequenceForm->seqmin,
pgSequenceForm->seqmax, sequenceStart,
pgSequenceForm->seqmax,
pgSequenceForm->seqstart,
pgSequenceForm->seqcache,
pgSequenceForm->seqcycle ? "" : "NO ");
@ -1391,7 +1372,7 @@ convert_aclright_to_string(int aclright)
/*
* contain_nextval_expression_walker walks over expression tree and returns
* true if it contains call to 'nextval' function.
* true if it contains a call to the 'nextval' function or an identity column.
*/
bool
contain_nextval_expression_walker(Node *node, void *context)
@ -1401,6 +1382,13 @@ contain_nextval_expression_walker(Node *node, void *context)
return false;
}
/* identity columns are represented as NextValueExpr nodes */
if (IsA(node, NextValueExpr))
{
return true;
}
/* check if the node contains call to 'nextval' */
if (IsA(node, FuncExpr))
{
FuncExpr *funcExpr = (FuncExpr *) node;

View File

@ -0,0 +1,690 @@
/*-------------------------------------------------------------------------
*
* deparse_publication_stmts.c
* All routines to deparse publication statements.
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/relation.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/deparser.h"
#include "distributed/listutils.h"
#include "distributed/namespace_utils.h"
#include "lib/stringinfo.h"
#include "parser/parse_clause.h"
#include "parser/parse_collate.h"
#include "parser/parse_node.h"
#include "parser/parse_relation.h"
#include "nodes/value.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/ruleutils.h"
static void AppendCreatePublicationStmt(StringInfo buf, CreatePublicationStmt *stmt,
bool whereClauseNeedsTransform,
bool includeLocalTables);
#if (PG_VERSION_NUM >= PG_VERSION_15)
static bool AppendPublicationObjects(StringInfo buf, List *publicationObjects,
bool whereClauseNeedsTransform,
bool includeLocalTables);
static void AppendWhereClauseExpression(StringInfo buf, RangeVar *tableName,
Node *whereClause,
bool whereClauseNeedsTransform);
static void AppendAlterPublicationAction(StringInfo buf, AlterPublicationAction action);
#else
static bool AppendTables(StringInfo buf, List *tables, bool includeLocalTables);
static void AppendDefElemAction(StringInfo buf, DefElemAction action);
#endif
static bool AppendAlterPublicationStmt(StringInfo buf, AlterPublicationStmt *stmt,
bool whereClauseNeedsTransform,
bool includeLocalTables);
static void AppendDropPublicationStmt(StringInfo buf, DropStmt *stmt);
static void AppendRenamePublicationStmt(StringInfo buf, RenameStmt *stmt);
static void AppendAlterPublicationOwnerStmt(StringInfo buf, AlterOwnerStmt *stmt);
static void AppendPublicationOptions(StringInfo stringBuffer, List *optionList);
static void AppendIdentifierList(StringInfo buf, List *objects);
/*
* DeparseCreatePublicationStmt builds and returns a string representing a
* CreatePublicationStmt.
*/
char *
DeparseCreatePublicationStmt(Node *node)
{
/* regular deparsing function takes CREATE PUBLICATION from the parser */
bool whereClauseNeedsTransform = false;
/* for regular CREATE PUBLICATION we do not propagate local tables */
bool includeLocalTables = false;
return DeparseCreatePublicationStmtExtended(node, whereClauseNeedsTransform,
includeLocalTables);
}
/*
* DeparseCreatePublicationStmtExtended builds and returns a string representing a
* CreatePublicationStmt, which may have already-transformed expressions.
*/
char *
DeparseCreatePublicationStmtExtended(Node *node, bool whereClauseNeedsTransform,
bool includeLocalTables)
{
CreatePublicationStmt *stmt = castNode(CreatePublicationStmt, node);
StringInfoData str = { 0 };
initStringInfo(&str);
AppendCreatePublicationStmt(&str, stmt, whereClauseNeedsTransform,
includeLocalTables);
return str.data;
}
/*
* AppendCreatePublicationStmt appends a string representing a
* CreatePublicationStmt to a buffer.
*/
static void
AppendCreatePublicationStmt(StringInfo buf, CreatePublicationStmt *stmt,
bool whereClauseNeedsTransform,
bool includeLocalTables)
{
appendStringInfo(buf, "CREATE PUBLICATION %s",
quote_identifier(stmt->pubname));
if (stmt->for_all_tables)
{
appendStringInfoString(buf, " FOR ALL TABLES");
}
#if (PG_VERSION_NUM >= PG_VERSION_15)
else if (stmt->pubobjects != NIL)
{
bool hasObjects = false;
PublicationObjSpec *publicationObject = NULL;
/*
* Check whether there are objects to propagate, mainly to know whether
* we should include "FOR".
*/
foreach_ptr(publicationObject, stmt->pubobjects)
{
if (publicationObject->pubobjtype == PUBLICATIONOBJ_TABLE)
{
/* FOR TABLE ... */
PublicationTable *publicationTable = publicationObject->pubtable;
if (includeLocalTables ||
IsCitusTableRangeVar(publicationTable->relation, NoLock, false))
{
hasObjects = true;
break;
}
}
else
{
hasObjects = true;
break;
}
}
if (hasObjects)
{
appendStringInfoString(buf, " FOR");
AppendPublicationObjects(buf, stmt->pubobjects, whereClauseNeedsTransform,
includeLocalTables);
}
}
#else
else if (stmt->tables != NIL)
{
bool hasTables = false;
RangeVar *rangeVar = NULL;
/*
* Check whether there are tables to propagate, mainly to know whether
* we should include "FOR".
*/
foreach_ptr(rangeVar, stmt->tables)
{
if (includeLocalTables || IsCitusTableRangeVar(rangeVar, NoLock, false))
{
hasTables = true;
break;
}
}
if (hasTables)
{
appendStringInfoString(buf, " FOR");
AppendTables(buf, stmt->tables, includeLocalTables);
}
}
#endif
if (stmt->options != NIL)
{
appendStringInfoString(buf, " WITH (");
AppendPublicationOptions(buf, stmt->options);
appendStringInfoString(buf, ")");
}
}
#if (PG_VERSION_NUM >= PG_VERSION_15)
/*
* AppendPublicationObjects appends a string representing a list of publication
* objects to a buffer.
*
* For instance: TABLE users, departments, TABLES IN SCHEMA production
*/
static bool
AppendPublicationObjects(StringInfo buf, List *publicationObjects,
bool whereClauseNeedsTransform,
bool includeLocalTables)
{
PublicationObjSpec *publicationObject = NULL;
bool appendedObject = false;
foreach_ptr(publicationObject, publicationObjects)
{
if (publicationObject->pubobjtype == PUBLICATIONOBJ_TABLE)
{
/* FOR TABLE ... */
PublicationTable *publicationTable = publicationObject->pubtable;
RangeVar *rangeVar = publicationTable->relation;
char *schemaName = rangeVar->schemaname;
char *tableName = rangeVar->relname;
if (!includeLocalTables && !IsCitusTableRangeVar(rangeVar, NoLock, false))
{
/* do not propagate local tables */
continue;
}
if (schemaName != NULL)
{
/* qualified table name */
appendStringInfo(buf, "%s TABLE %s",
appendedObject ? "," : "",
quote_qualified_identifier(schemaName, tableName));
}
else
{
/* unqualified table name */
appendStringInfo(buf, "%s TABLE %s",
appendedObject ? "," : "",
quote_identifier(tableName));
}
if (publicationTable->columns != NIL)
{
appendStringInfoString(buf, " (");
AppendIdentifierList(buf, publicationTable->columns);
appendStringInfoString(buf, ")");
}
if (publicationTable->whereClause != NULL)
{
appendStringInfoString(buf, " WHERE (");
AppendWhereClauseExpression(buf, rangeVar,
publicationTable->whereClause,
whereClauseNeedsTransform);
appendStringInfoString(buf, ")");
}
}
else
{
/* FOR TABLES IN SCHEMA */
char *schemaName = publicationObject->name;
if (publicationObject->pubobjtype == PUBLICATIONOBJ_TABLES_IN_CUR_SCHEMA)
{
List *searchPath = fetch_search_path(false);
if (searchPath == NIL)
{
ereport(ERROR, errcode(ERRCODE_UNDEFINED_SCHEMA),
errmsg("no schema has been selected for "
"CURRENT_SCHEMA"));
}
schemaName = get_namespace_name(linitial_oid(searchPath));
}
appendStringInfo(buf, "%s TABLES IN SCHEMA %s",
appendedObject ? "," : "",
quote_identifier(schemaName));
}
appendedObject = true;
}
return appendedObject;
}
/*
* AppendWhereClauseExpression appends a deparsed expression that can
* contain a filter on the given table. If whereClauseNeedsTransform is set
* the expression is first transformed.
*/
static void
AppendWhereClauseExpression(StringInfo buf, RangeVar *tableName,
Node *whereClause, bool whereClauseNeedsTransform)
{
Relation relation = relation_openrv(tableName, AccessShareLock);
if (whereClauseNeedsTransform)
{
ParseState *pstate = make_parsestate(NULL);
pstate->p_sourcetext = "";
ParseNamespaceItem *nsitem = addRangeTableEntryForRelation(pstate,
relation,
AccessShareLock, NULL,
false, false);
addNSItemToQuery(pstate, nsitem, false, true, true);
whereClause = transformWhereClause(pstate,
copyObject(whereClause),
EXPR_KIND_WHERE,
"PUBLICATION WHERE");
assign_expr_collations(pstate, whereClause);
}
List *relationContext = deparse_context_for(tableName->relname, relation->rd_id);
PushOverrideEmptySearchPath(CurrentMemoryContext);
char *whereClauseString = deparse_expression(whereClause,
relationContext,
true, true);
PopOverrideSearchPath();
appendStringInfoString(buf, whereClauseString);
relation_close(relation, AccessShareLock);
}
#else
/*
* AppendTables appends a string representing a list of tables to a buffer.
*
* For instance: TABLE users, departments
*/
static bool
AppendTables(StringInfo buf, List *tables, bool includeLocalTables)
{
RangeVar *rangeVar = NULL;
bool appendedObject = false;
foreach_ptr(rangeVar, tables)
{
if (!includeLocalTables &&
!IsCitusTableRangeVar(rangeVar, NoLock, false))
{
/* do not propagate local tables */
continue;
}
char *schemaName = rangeVar->schemaname;
char *tableName = rangeVar->relname;
if (schemaName != NULL)
{
/* qualified table name */
appendStringInfo(buf, "%s %s",
appendedObject ? "," : " TABLE",
quote_qualified_identifier(schemaName, tableName));
}
else
{
/* unqualified table name */
appendStringInfo(buf, "%s %s",
appendedObject ? "," : " TABLE",
quote_identifier(tableName));
}
appendedObject = true;
}
return appendedObject;
}
#endif
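/*
 * Sketch of how the deparse entry points above fit together (illustrative,
 * not part of this patch; the flag values mirror the callers in
 * publication.c): statements read back from the catalog carry
 * already-transformed WHERE clauses, and restoring publications during
 * shard operations includes local tables.
 */
#ifdef CITUS_USAGE_SKETCH
static char *
ExampleDeparseFromCatalog(Node *createPublicationParseTree)
{
	bool whereClauseNeedsTransform = false; /* catalog expressions */
	bool includeLocalTables = true;         /* keep local tables in the list */

	return DeparseCreatePublicationStmtExtended(createPublicationParseTree,
												whereClauseNeedsTransform,
												includeLocalTables);
}
#endif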
/*
* DeparseAlterPublicationStmt builds and returns a string representing
* an AlterPublicationStmt.
*/
char *
DeparseAlterPublicationStmt(Node *node)
{
/* regular deparsing function takes ALTER PUBLICATION from the parser */
bool whereClauseNeedsTransform = true;
/* for regular ALTER PUBLICATION we do not propagate local tables */
bool includeLocalTables = false;
return DeparseAlterPublicationStmtExtended(node, whereClauseNeedsTransform,
includeLocalTables);
}
/*
* DeparseAlterPublicationStmtExtended builds and returns a string representing a
* AlterPublicationStmt, which may have already-transformed expressions.
*/
char *
DeparseAlterPublicationStmtExtended(Node *node, bool whereClauseNeedsTransform,
bool includeLocalTables)
{
AlterPublicationStmt *stmt = castNode(AlterPublicationStmt, node);
StringInfoData str = { 0 };
initStringInfo(&str);
if (!AppendAlterPublicationStmt(&str, stmt, whereClauseNeedsTransform,
includeLocalTables))
{
Assert(!includeLocalTables);
/*
* When there are no objects to propagate, then there is no
* valid ALTER PUBLICATION to construct.
*/
return NULL;
}
return str.data;
}
/*
* AppendAlterPublicationStmt appends a string representing an AlterPublicationStmt
* of the form ALTER PUBLICATION .. ADD/SET/DROP
*/
static bool
AppendAlterPublicationStmt(StringInfo buf, AlterPublicationStmt *stmt,
bool whereClauseNeedsTransform,
bool includeLocalTables)
{
appendStringInfo(buf, "ALTER PUBLICATION %s",
quote_identifier(stmt->pubname));
if (stmt->options)
{
appendStringInfoString(buf, " SET (");
AppendPublicationOptions(buf, stmt->options);
appendStringInfoString(buf, ")");
/* changing options cannot be combined with other actions */
return true;
}
#if (PG_VERSION_NUM >= PG_VERSION_15)
AppendAlterPublicationAction(buf, stmt->action);
return AppendPublicationObjects(buf, stmt->pubobjects, whereClauseNeedsTransform,
includeLocalTables);
#else
AppendDefElemAction(buf, stmt->tableAction);
return AppendTables(buf, stmt->tables, includeLocalTables);
#endif
}
#if (PG_VERSION_NUM >= PG_VERSION_15)
/*
* AppendAlterPublicationAction appends a string representing an AlterPublicationAction
* to a buffer.
*/
static void
AppendAlterPublicationAction(StringInfo buf, AlterPublicationAction action)
{
switch (action)
{
case AP_AddObjects:
{
appendStringInfoString(buf, " ADD");
break;
}
case AP_DropObjects:
{
appendStringInfoString(buf, " DROP");
break;
}
case AP_SetObjects:
{
appendStringInfoString(buf, " SET");
break;
}
default:
{
ereport(ERROR, (errmsg("unrecognized publication action: %d", action)));
}
}
}
#else
/*
* AppendDefElemAction appends a string representing a DefElemAction
* to a buffer.
*/
static void
AppendDefElemAction(StringInfo buf, DefElemAction action)
{
switch (action)
{
case DEFELEM_ADD:
{
appendStringInfoString(buf, " ADD");
break;
}
case DEFELEM_DROP:
{
appendStringInfoString(buf, " DROP");
break;
}
case DEFELEM_SET:
{
appendStringInfoString(buf, " SET");
break;
}
default:
{
ereport(ERROR, (errmsg("unrecognized publication action: %d", action)));
}
}
}
#endif
/*
* DeparseDropPublicationStmt builds and returns a string representing the DropStmt
*/
char *
DeparseDropPublicationStmt(Node *node)
{
DropStmt *stmt = castNode(DropStmt, node);
StringInfoData str = { 0 };
initStringInfo(&str);
Assert(stmt->removeType == OBJECT_PUBLICATION);
AppendDropPublicationStmt(&str, stmt);
return str.data;
}
/*
* AppendDropPublicationStmt appends a string representing the DropStmt to a buffer
*/
static void
AppendDropPublicationStmt(StringInfo buf, DropStmt *stmt)
{
appendStringInfoString(buf, "DROP PUBLICATION ");
if (stmt->missing_ok)
{
appendStringInfoString(buf, "IF EXISTS ");
}
AppendIdentifierList(buf, stmt->objects);
if (stmt->behavior == DROP_CASCADE)
{
appendStringInfoString(buf, " CASCADE");
}
}
/*
* DeparseRenamePublicationStmt builds and returns a string representing the RenameStmt
*/
char *
DeparseRenamePublicationStmt(Node *node)
{
RenameStmt *stmt = castNode(RenameStmt, node);
StringInfoData str = { 0 };
initStringInfo(&str);
Assert(stmt->renameType == OBJECT_PUBLICATION);
AppendRenamePublicationStmt(&str, stmt);
return str.data;
}
/*
* AppendRenamePublicationStmt appends a string representing the RenameStmt to a buffer
*/
static void
AppendRenamePublicationStmt(StringInfo buf, RenameStmt *stmt)
{
appendStringInfo(buf, "ALTER PUBLICATION %s RENAME TO %s;",
quote_identifier(strVal(stmt->object)),
quote_identifier(stmt->newname));
}
/*
* DeparseAlterPublicationOwnerStmt builds and returns a string representing the AlterOwnerStmt
*/
char *
DeparseAlterPublicationOwnerStmt(Node *node)
{
AlterOwnerStmt *stmt = castNode(AlterOwnerStmt, node);
StringInfoData str = { 0 };
initStringInfo(&str);
Assert(stmt->objectType == OBJECT_PUBLICATION);
AppendAlterPublicationOwnerStmt(&str, stmt);
return str.data;
}
/*
* AppendAlterPublicationOwnerStmt appends a string representing the AlterOwnerStmt to a buffer
*/
static void
AppendAlterPublicationOwnerStmt(StringInfo buf, AlterOwnerStmt *stmt)
{
Assert(stmt->objectType == OBJECT_PUBLICATION);
appendStringInfo(buf, "ALTER PUBLICATION %s OWNER TO %s;",
quote_identifier(strVal(stmt->object)),
RoleSpecString(stmt->newowner, true));
}
/*
* AppendPublicationOptions appends a string representing a list of publication options.
*/
static void
AppendPublicationOptions(StringInfo stringBuffer, List *optionList)
{
ListCell *optionCell = NULL;
bool firstOptionPrinted = false;
foreach(optionCell, optionList)
{
DefElem *option = (DefElem *) lfirst(optionCell);
char *optionName = option->defname;
char *optionValue = defGetString(option);
NodeTag valueType = nodeTag(option->arg);
if (firstOptionPrinted)
{
appendStringInfo(stringBuffer, ", ");
}
firstOptionPrinted = true;
appendStringInfo(stringBuffer, "%s = ",
quote_identifier(optionName));
#if (PG_VERSION_NUM >= PG_VERSION_15)
if (valueType == T_Integer || valueType == T_Float || valueType == T_Boolean)
#else
if (valueType == T_Integer || valueType == T_Float)
#endif
{
/* string escaping is unnecessary for numeric types and can cause issues */
appendStringInfo(stringBuffer, "%s", optionValue);
}
else
{
appendStringInfo(stringBuffer, "%s", quote_literal_cstr(optionValue));
}
}
}
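/*
 * Example of the emitted option text (illustrative, not part of this patch;
 * assumes PG 15, where boolean DefElem values are T_Boolean nodes): the
 * options built below deparse to
 *   publish = 'insert', publish_via_partition_root = true
 */
#ifdef CITUS_USAGE_SKETCH
static char *
ExampleFormatPublicationOptions(void)
{
	List *options = list_make2(
		makeDefElem("publish", (Node *) makeString("insert"), -1),
		makeDefElem("publish_via_partition_root",
					(Node *) makeBoolean(true), -1));

	StringInfo buf = makeStringInfo();
	AppendPublicationOptions(buf, options);
	return buf->data;
}
#endif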
/*
* AppendIdentifierList appends a string representing a list of
* identifiers (of String type).
*/
static void
AppendIdentifierList(StringInfo buf, List *objects)
{
ListCell *objectCell = NULL;
foreach(objectCell, objects)
{
char *name = strVal(lfirst(objectCell));
if (objectCell != list_head(objects))
{
appendStringInfo(buf, ", ");
}
appendStringInfoString(buf, quote_identifier(name));
}
}

View File

@ -0,0 +1,119 @@
/*-------------------------------------------------------------------------
*
* qualify_publication_stmt.c
* Functions specialized in fully qualifying all publication statements. These
* functions are dispatched from qualify.c
*
* Copyright (c), Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/namespace.h"
#include "distributed/deparser.h"
#include "distributed/listutils.h"
#include "nodes/nodes.h"
#include "utils/guc.h"
#include "utils/lsyscache.h"
#if (PG_VERSION_NUM >= PG_VERSION_15)
static void QualifyPublicationObjects(List *publicationObjects);
#else
static void QualifyTables(List *tables);
#endif
static void QualifyPublicationRangeVar(RangeVar *publication);
/*
* QualifyCreatePublicationStmt qualifies the table names of the
* CREATE PUBLICATION statement.
*/
void
QualifyCreatePublicationStmt(Node *node)
{
CreatePublicationStmt *stmt = castNode(CreatePublicationStmt, node);
#if (PG_VERSION_NUM >= PG_VERSION_15)
QualifyPublicationObjects(stmt->pubobjects);
#else
QualifyTables(stmt->tables);
#endif
}
#if (PG_VERSION_NUM >= PG_VERSION_15)
/*
* QualifyPublicationObjects ensures all table names in a list of
* publication objects are fully qualified.
*/
static void
QualifyPublicationObjects(List *publicationObjects)
{
PublicationObjSpec *publicationObject = NULL;
foreach_ptr(publicationObject, publicationObjects)
{
if (publicationObject->pubobjtype == PUBLICATIONOBJ_TABLE)
{
/* FOR TABLE ... */
PublicationTable *publicationTable = publicationObject->pubtable;
QualifyPublicationRangeVar(publicationTable->relation);
}
}
}
#else
/*
* QualifyTables ensures all table names in a list are fully qualified.
*/
static void
QualifyTables(List *tables)
{
RangeVar *rangeVar = NULL;
foreach_ptr(rangeVar, tables)
{
QualifyPublicationRangeVar(rangeVar);
}
}
#endif
/*
* QualifyAlterPublicationStmt ensures all table names in the given
* ALTER PUBLICATION statement are fully qualified.
*/
void
QualifyAlterPublicationStmt(Node *node)
{
AlterPublicationStmt *stmt = castNode(AlterPublicationStmt, node);
#if (PG_VERSION_NUM >= PG_VERSION_15)
QualifyPublicationObjects(stmt->pubobjects);
#else
QualifyTables(stmt->tables);
#endif
}
/*
* QualifyPublicationRangeVar qualifies the given publication table RangeVar if it is not already qualified.
*/
static void
QualifyPublicationRangeVar(RangeVar *publication)
{
if (publication->schemaname == NULL)
{
Oid publicationOid = RelnameGetRelid(publication->relname);
Oid schemaOid = get_rel_namespace(publicationOid);
publication->schemaname = get_namespace_name(schemaOid);
}
}
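/*
 * Usage sketch (illustrative, not part of this patch): qualification runs
 * before deparse so that an unqualified "FOR TABLE users" becomes
 * "FOR TABLE public.users" (or whatever schema the search path resolves to)
 * and stays unambiguous on the workers.
 */
#ifdef CITUS_USAGE_SKETCH
static char *
ExampleQualifyThenDeparse(Node *createPublicationParseTree)
{
	QualifyCreatePublicationStmt(createPublicationParseTree);
	return DeparseCreatePublicationStmt(createPublicationParseTree);
}
#endif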

View File

@ -53,6 +53,7 @@
#include "common/keywords.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/multi_router_planner.h"
#include "executor/spi.h"
#include "foreign/foreign.h"
#include "funcapi.h"
@ -3723,7 +3724,6 @@ static void
get_merge_query_def(Query *query, deparse_context *context)
{
StringInfo buf = context->buf;
RangeTblEntry *targetRte;
/* Insert the WITH clause if given */
get_with_clause(query, context);
@ -3731,7 +3731,7 @@ get_merge_query_def(Query *query, deparse_context *context)
/*
* Start the query with MERGE INTO <target>
*/
targetRte = rt_fetch(query->resultRelation, query->rtable);
RangeTblEntry *targetRte = ExtractResultRelationRTE(query);
if (PRETTY_INDENT(context))
{
@ -3853,6 +3853,15 @@ get_merge_query_def(Query *query, deparse_context *context)
}
}
/*
* RETURNING is not supported in MERGE, so the list must be NIL. If PG adds
* support later, we might miss it; raise an error so it gets investigated.
*/
if (unlikely(query->returningList))
{
elog(ERROR, "Unexpected RETURNING clause in MERGE");
}
ereport(DEBUG1, (errmsg("<Deparsed MERGE query: %s>", buf->data)));
}

File diff suppressed because it is too large

View File

@ -9,6 +9,7 @@
*-------------------------------------------------------------------------
*/
#include "distributed/distributed_execution_locks.h"
#include "distributed/executor_util.h"
#include "distributed/listutils.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_cache.h"
@ -19,6 +20,259 @@
#include "distributed/transaction_management.h"
/*
* AcquireExecutorShardLocksForExecution acquires advisory lock on shard IDs
* to prevent unsafe concurrent modifications of shards.
*
* We prevent concurrent modifications of shards in two cases:
* 1. Any non-commutative writes to a replicated table
* 2. Multi-shard writes that are executed in parallel
*
* The first case ensures we do not apply updates in different orders on
* different replicas (e.g. of a reference table), which could lead the
* replicas to diverge.
*
* The second case prevents deadlocks due to out-of-order execution.
*
* There are two GUCs that can override the default behaviors.
* 'citus.all_modifications_commutative' relaxes locking
* that's done for the purpose of keeping replicas consistent.
* 'citus.enable_deadlock_prevention' relaxes locking done for
* the purpose of avoiding deadlocks between concurrent
* multi-shard commands.
*
* We do not take executor shard locks for utility commands such as
* TRUNCATE because the table locks already prevent concurrent access.
*/
void
AcquireExecutorShardLocksForExecution(RowModifyLevel modLevel, List *taskList)
{
if (modLevel <= ROW_MODIFY_READONLY &&
!SelectForUpdateOnReferenceTable(taskList))
{
/*
* Executor locks only apply to DML commands and SELECT FOR UPDATE queries
* touching reference tables.
*/
return;
}
bool requiresParallelExecutionLocks =
!(list_length(taskList) == 1 || ShouldRunTasksSequentially(taskList));
bool modifiedTableReplicated = ModifiedTableReplicated(taskList);
if (!modifiedTableReplicated && !requiresParallelExecutionLocks)
{
/*
* When a distributed query on tables with replication
* factor == 1 and command hits only a single shard, we
* rely on Postgres to handle the serialization of the
* concurrent modifications on the workers.
*
* For reference tables, even if their placements are replicated
* ones (e.g., single node), we acquire the distributed execution
* locks to stay consistent when new node(s) are added. So, we do
* not return at this point for them.
*/
return;
}
/*
* We first assume that all the remaining modifications are going to
* be serialized. So, start with an ExclusiveLock and lower the lock level
* as much as possible.
*/
int lockMode = ExclusiveLock;
/*
* In addition to honouring commutativity rules, we currently only
* allow a single multi-shard command on a shard at a time. Otherwise,
* concurrent multi-shard commands may take row-level locks on the
* shard placements in a different order and create a distributed
* deadlock. This applies even when writes are commutative and/or
* there is no replication. This can be relaxed via
* EnableDeadlockPrevention.
*
* 1. If citus.all_modifications_commutative is set to true, then all locks
* are acquired as RowExclusiveLock.
*
* 2. If citus.all_modifications_commutative is false, then only the shards
* with more than one replica are locked with ExclusiveLock. Otherwise, the
* lock is acquired with ShareUpdateExclusiveLock.
*
* ShareUpdateExclusiveLock conflicts with itself such that only one
* multi-shard modification at a time is allowed on a shard. It also conflicts
* with ExclusiveLock, which ensures that updates/deletes/upserts are applied
* in the same order on all placements. It does not conflict with
* RowExclusiveLock, which is normally obtained by single-shard, commutative
* writes.
*/
if (!modifiedTableReplicated && requiresParallelExecutionLocks)
{
/*
* When there is no replication then we only need to prevent
* concurrent multi-shard commands on the same shards. This is
* because concurrent, parallel commands may modify the same
* set of shards, but in different orders. The order of the
* accesses might trigger distributed deadlocks that are not
* possible to happen on non-distributed systems such as
* regular Postgres.
*
* As an example, assume that we have two queries: query-1 and query-2.
* Both queries access shard-1 and shard-2. If query-1 first accesses to
* shard-1 then shard-2, and query-2 accesses shard-2 then shard-1, these
* two commands might block each other in case they modify the same rows
* (e.g., cause distributed deadlocks).
*
* In either case, ShareUpdateExclusive has the desired effect, since
* it conflicts with itself and ExclusiveLock (taken by non-commutative
* writes).
*
* However, some users find this too restrictive, so we allow them to
* reduce to a RowExclusiveLock when citus.enable_deadlock_prevention
* is disabled, which lets multi-shard modifications run in parallel.
*/
lockMode =
EnableDeadlockPrevention ? ShareUpdateExclusiveLock : RowExclusiveLock;
if (!IsCoordinator())
{
/*
* We also skip taking a heavy-weight lock when running a multi-shard
* commands from workers, since we currently do not prevent concurrency
* across workers anyway.
*/
lockMode = RowExclusiveLock;
}
}
else if (modifiedTableReplicated)
{
/*
* When we are executing distributed queries on replicated tables, our
* default behaviour is to prevent any concurrency. This is valid
* for when parallel execution is happening or not.
*
* The reason is that we cannot control the order of the placement accesses
* of two distributed queries to the same shards. The order of the accesses
* might cause the replicas of the same shard placements to diverge. This
* cannot happen on non-distributed systems such as regular Postgres.
*
* As an example, assume that we have two queries: query-1 and query-2.
* Both queries only access the placements of shard-1, say p-1 and p-2.
*
* And, assume that these queries are non-commutative, such as:
* query-1: UPDATE table SET b = 1 WHERE key = 1;
* query-2: UPDATE table SET b = 2 WHERE key = 1;
*
* If query-1 accesses p-1 then p-2, and query-2 accesses
* p-2 then p-1, these two commands would leave p-1 and p-2
* diverged (e.g., the values for the column "b" would differ).
*
* The only exception to this rule is the single shard commutative
* modifications, such as INSERTs. In that case, we can allow
* concurrency among such backends, hence lowering the lock level
* to RowExclusiveLock.
*/
if (!requiresParallelExecutionLocks && modLevel < ROW_MODIFY_NONCOMMUTATIVE)
{
lockMode = RowExclusiveLock;
}
}
if (AllModificationsCommutative)
{
/*
* The mapping is overridden when all_modifications_commutative is set to true.
* In that case, all modifications are treated as commutative, which can be used
* to communicate that the application is only generating commutative
* UPDATE/DELETE/UPSERT commands and exclusive locks are unnecessary. This
* is irrespective of single-shard/multi-shard or replicated tables.
*/
lockMode = RowExclusiveLock;
}
/* now, iterate on the tasks and acquire the executor locks on the shards */
List *anchorShardIntervalList = NIL;
List *relationRowLockList = NIL;
List *requiresConsistentSnapshotRelationShardList = NIL;
Task *task = NULL;
foreach_ptr(task, taskList)
{
ShardInterval *anchorShardInterval = LoadShardInterval(task->anchorShardId);
anchorShardIntervalList = lappend(anchorShardIntervalList, anchorShardInterval);
/* Acquire additional locks for SELECT .. FOR UPDATE on reference tables */
AcquireExecutorShardLocksForRelationRowLockList(task->relationRowLockList);
relationRowLockList =
list_concat(relationRowLockList,
task->relationRowLockList);
/*
* If the task has a subselect, then we may need to lock the shards from which
* the query selects as well to prevent the subselects from seeing different
* results on different replicas.
*/
if (RequiresConsistentSnapshot(task))
{
/*
* ExclusiveLock conflicts with all lock types used by modifications
* and therefore prevents other modifications from running
* concurrently.
*/
requiresConsistentSnapshotRelationShardList =
list_concat(requiresConsistentSnapshotRelationShardList,
task->relationShardList);
}
}
/*
* Acquire the locks in a sorted way to avoid deadlocks due to lock
* ordering across concurrent sessions.
*/
anchorShardIntervalList =
SortList(anchorShardIntervalList, CompareShardIntervalsById);
/*
* If we are dealing with a partition we are also taking locks on parent table
* to prevent deadlocks on concurrent operations on a partition and its parent.
*
* Note that this function currently does not acquire any remote locks as that
* is necessary to control the concurrency across multiple nodes for replicated
* tables. That is because Citus currently does not allow modifications to
* partitions from any node other than the coordinator.
*/
LockParentShardResourceIfPartition(anchorShardIntervalList, lockMode);
/* Acquire distribution execution locks on the affected shards */
SerializeNonCommutativeWrites(anchorShardIntervalList, lockMode);
if (relationRowLockList != NIL)
{
/* Acquire additional locks for SELECT .. FOR UPDATE on reference tables */
AcquireExecutorShardLocksForRelationRowLockList(relationRowLockList);
}
if (requiresConsistentSnapshotRelationShardList != NIL)
{
/*
* If the task has a subselect, then we may need to lock the shards from which
* the query selects as well to prevent the subselects from seeing different
* results on different replicas.
*
* ExclusiveLock conflicts with all lock types used by modifications
* and therefore prevents other modifications from running
* concurrently.
*/
LockRelationShardResources(requiresConsistentSnapshotRelationShardList,
ExclusiveLock);
}
}
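/*
 * Simplified summary sketch of the lock-mode ladder above (illustrative
 * only, not part of this patch; the real decision is inline above, also
 * consults IsCoordinator(), and further relaxes single-shard commutative
 * writes on replicated tables).
 */
#ifdef CITUS_USAGE_SKETCH
static LOCKMODE
ExampleExecutorLockMode(bool modifiedTableReplicated,
						bool requiresParallelExecutionLocks,
						bool allModificationsCommutative,
						bool deadlockPreventionEnabled)
{
	if (allModificationsCommutative)
	{
		/* citus.all_modifications_commutative overrides everything */
		return RowExclusiveLock;
	}
	if (!modifiedTableReplicated && requiresParallelExecutionLocks)
	{
		return deadlockPreventionEnabled ?
			   ShareUpdateExclusiveLock : RowExclusiveLock;
	}

	/* replicated tables default to full serialization */
	return ExclusiveLock;
}
#endif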
/*
* RequiresConsistentSnapshot returns true if the given task need to take
* the necessary locks to ensure that a subquery in the modify query
@ -188,3 +442,27 @@ LockPartitionRelations(Oid relationId, LOCKMODE lockMode)
LockRelationOid(partitionRelationId, lockMode);
}
}
/*
* LockPartitionsForDistributedPlan ensures commands take locks on all partitions
* of a distributed table that appears in the query. We do this primarily out of
* consistency with PostgreSQL locking.
*/
void
LockPartitionsForDistributedPlan(DistributedPlan *plan)
{
if (TaskListModifiesDatabase(plan->modLevel, plan->workerJob->taskList))
{
Oid targetRelationId = plan->targetRelationId;
LockPartitionsInRelationList(list_make1_oid(targetRelationId), RowExclusiveLock);
}
/*
* Lock partitions of tables that appear in a SELECT or subquery. In the
* DML case this also includes the target relation, but since we already
* have a stronger lock this doesn't do any harm.
*/
LockPartitionsInRelationList(plan->relationIdList, AccessShareLock);
}

View File

@ -0,0 +1,101 @@
/*-------------------------------------------------------------------------
*
* executor_util_params.c
*
* Utility functions for dealing with parameters in the executor.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "distributed/executor_util.h"
#include "utils/lsyscache.h"
/*
* ExtractParametersForRemoteExecution extracts parameter types and values from
* the given ParamListInfo structure, and fills parameter type and value arrays.
* It changes the OIDs of custom types to InvalidOid so that the workers,
* whose custom-type OIDs may differ from the coordinator's, can infer them.
*/
void
ExtractParametersForRemoteExecution(ParamListInfo paramListInfo, Oid **parameterTypes,
const char ***parameterValues)
{
ExtractParametersFromParamList(paramListInfo, parameterTypes,
parameterValues, false);
}
/*
* ExtractParametersFromParamList extracts parameter types and values from
* the given ParamListInfo structure, and fills parameter type and value arrays.
* If useOriginalCustomTypeOids is true, it uses the original oids for custom types.
*/
void
ExtractParametersFromParamList(ParamListInfo paramListInfo,
Oid **parameterTypes,
const char ***parameterValues, bool
useOriginalCustomTypeOids)
{
int parameterCount = paramListInfo->numParams;
*parameterTypes = (Oid *) palloc0(parameterCount * sizeof(Oid));
*parameterValues = (const char **) palloc0(parameterCount * sizeof(char *));
/* get parameter types and values */
for (int parameterIndex = 0; parameterIndex < parameterCount; parameterIndex++)
{
ParamExternData *parameterData = &paramListInfo->params[parameterIndex];
Oid typeOutputFunctionId = InvalidOid;
bool variableLengthType = false;
/*
* Use 0 for data types where the oid values can be different on
* the coordinator and worker nodes. Therefore, the worker nodes can
* infer the correct oid.
*/
if (parameterData->ptype >= FirstNormalObjectId && !useOriginalCustomTypeOids)
{
(*parameterTypes)[parameterIndex] = 0;
}
else
{
(*parameterTypes)[parameterIndex] = parameterData->ptype;
}
/*
* If the parameter is not referenced / used (ptype == 0) and
* would otherwise have errored out inside standard_planner(),
* don't pass a value to the remote side, and pass text oid to prevent
* undetermined data type errors on workers.
*/
if (parameterData->ptype == 0)
{
(*parameterValues)[parameterIndex] = NULL;
(*parameterTypes)[parameterIndex] = TEXTOID;
continue;
}
/*
* If the parameter is NULL then we preserve its type, but
* don't need to evaluate its value.
*/
if (parameterData->isnull)
{
(*parameterValues)[parameterIndex] = NULL;
continue;
}
getTypeOutputInfo(parameterData->ptype, &typeOutputFunctionId,
&variableLengthType);
(*parameterValues)[parameterIndex] = OidOutputFunctionCall(typeOutputFunctionId,
parameterData->value);
}
}
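/*
 * Behavior sketch (illustrative, not part of this patch): log what would be
 * sent for each parameter. An unused parameter (ptype == 0) is sent as NULL
 * with a text type, and a custom-type parameter is sent with type OID 0 so
 * the worker infers it.
 */
#ifdef CITUS_USAGE_SKETCH
static void
ExampleLogRemoteParameters(ParamListInfo params)
{
	Oid *types = NULL;
	const char **values = NULL;

	ExtractParametersForRemoteExecution(params, &types, &values);

	for (int i = 0; i < params->numParams; i++)
	{
		elog(DEBUG1, "$%d -> type %u, value %s", i + 1, types[i],
			 values[i] != NULL ? values[i] : "NULL");
	}
}
#endif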

View File

@ -0,0 +1,297 @@
/*-------------------------------------------------------------------------
*
* executor_util_tasks.c
*
* Utility functions for dealing with task lists in the executor.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "distributed/executor_util.h"
#include "distributed/listutils.h"
#include "distributed/shardinterval_utils.h"
/*
* TaskListModifiesDatabase is a helper function for DistributedExecutionModifiesDatabase and
* DistributedPlanModifiesDatabase.
*/
bool
TaskListModifiesDatabase(RowModifyLevel modLevel, List *taskList)
{
if (modLevel > ROW_MODIFY_READONLY)
{
return true;
}
/*
* If we cannot decide by only checking the row modify level,
* we should look more closely at the tasks.
*/
if (list_length(taskList) < 1)
{
/* is this ever possible? */
return false;
}
Task *firstTask = (Task *) linitial(taskList);
return !ReadOnlyTask(firstTask->taskType);
}
/*
* TaskListRequiresRollback returns true if the distributed
* execution should start a CoordinatedTransaction. In other words, if the
* function returns true, the execution sends BEGIN; to every connection
* involved in the distributed execution.
*/
bool
TaskListRequiresRollback(List *taskList)
{
int taskCount = list_length(taskList);
if (taskCount == 0)
{
return false;
}
Task *task = (Task *) linitial(taskList);
if (task->cannotBeExecutedInTransction)
{
/* vacuum, create index concurrently etc. */
return false;
}
bool selectForUpdate = task->relationRowLockList != NIL;
if (selectForUpdate)
{
/*
* Do not check SelectOpensTransactionBlock, always open transaction block
* if SELECT FOR UPDATE is executed inside a distributed transaction.
*/
return IsMultiStatementTransaction();
}
if (ReadOnlyTask(task->taskType))
{
return SelectOpensTransactionBlock &&
IsTransactionBlock();
}
if (IsMultiStatementTransaction())
{
return true;
}
if (list_length(taskList) > 1)
{
return true;
}
if (list_length(task->taskPlacementList) > 1)
{
/*
* Single DML/DDL tasks with replicated tables (including
* reference and non-reference tables) should require
* BEGIN/COMMIT/ROLLBACK.
*/
return true;
}
if (task->queryCount > 1)
{
/*
* When there are multiple sequential queries in a task
* we need to run those as a transaction.
*/
return true;
}
return false;
}
/*
* TaskListRequires2PC determines whether the given task list requires 2PC.
*/
bool
TaskListRequires2PC(List *taskList)
{
if (taskList == NIL)
{
return false;
}
Task *task = (Task *) linitial(taskList);
if (ReadOnlyTask(task->taskType))
{
/* we do not trigger 2PC for ReadOnly queries */
return false;
}
bool singleTask = list_length(taskList) == 1;
if (singleTask && list_length(task->taskPlacementList) == 1)
{
/* we do not trigger 2PC for modifications that are:
* - single task
* - single placement
*/
return false;
}
/*
* Otherwise, all modifications are done via 2PC. This includes:
* - Multi-shard commands irrespective of the replication factor
* - Single-shard commands that are targeting more than one replica
*/
return true;
}
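/*
 * Illustrative examples for the rule above (assumption: typical DML tasks):
 * a single-task, single-placement INSERT commits with a plain COMMIT, while
 * a multi-shard UPDATE, or a write to a shard with two placements, goes
 * through 2PC.
 */
#ifdef CITUS_USAGE_SKETCH
static void
ExampleCommitProtocolChoice(List *taskList)
{
	if (TaskListRequires2PC(taskList))
	{
		elog(DEBUG1, "using two-phase commit for %d task(s)",
			 list_length(taskList));
	}
}
#endif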
/*
* TaskListCannotBeExecutedInTransaction returns true if any of the
* tasks in the input cannot be executed in a transaction. These are
* tasks like VACUUM or CREATE INDEX CONCURRENTLY etc.
*/
bool
TaskListCannotBeExecutedInTransaction(List *taskList)
{
Task *task = NULL;
foreach_ptr(task, taskList)
{
if (task->cannotBeExecutedInTransction)
{
return true;
}
}
return false;
}
/*
* SelectForUpdateOnReferenceTable returns true if the input task
* contains a FOR UPDATE clause that locks any reference tables.
*/
bool
SelectForUpdateOnReferenceTable(List *taskList)
{
if (list_length(taskList) != 1)
{
/* we currently do not support SELECT FOR UPDATE on multi-task queries */
return false;
}
Task *task = (Task *) linitial(taskList);
RelationRowLock *relationRowLock = NULL;
foreach_ptr(relationRowLock, task->relationRowLockList)
{
Oid relationId = relationRowLock->relationId;
if (IsCitusTableType(relationId, REFERENCE_TABLE))
{
return true;
}
}
return false;
}
/*
* ReadOnlyTask returns true if the input task does a read-only operation
* on the database.
*/
bool
ReadOnlyTask(TaskType taskType)
{
switch (taskType)
{
case READ_TASK:
case MAP_OUTPUT_FETCH_TASK:
case MAP_TASK:
case MERGE_TASK:
{
return true;
}
default:
{
return false;
}
}
}
/*
* ModifiedTableReplicated iterates over the task list and returns true
* if any task's anchor shard belongs to a replicated table. We qualify
* as replicated any reference table or any distributed table with
* replication factor > 1.
*/
bool
ModifiedTableReplicated(List *taskList)
{
Task *task = NULL;
foreach_ptr(task, taskList)
{
int64 shardId = task->anchorShardId;
if (shardId == INVALID_SHARD_ID)
{
continue;
}
if (ReferenceTableShardId(shardId))
{
return true;
}
Oid relationId = RelationIdForShard(shardId);
if (!SingleReplicatedTable(relationId))
{
return true;
}
}
return false;
}
/*
* ShouldRunTasksSequentially returns true if each of the individual tasks
* should be executed one by one. Note that this is different than
* MultiShardConnectionType == SEQUENTIAL_CONNECTION case. In that case,
* running the tasks across the nodes in parallel is acceptable and implemented
* in that way.
*
* However, the executions that are qualified here would perform poorly if the
* tasks across the workers are executed in parallel. We currently qualify only
* one class of distributed queries here, multi-row INSERTs. If we do not enforce
* true sequential execution, concurrent multi-row upserts could easily form
* a distributed deadlock when the upserts touch the same rows.
*/
bool
ShouldRunTasksSequentially(List *taskList)
{
if (list_length(taskList) < 2)
{
/* single task plans are already qualified as sequential by definition */
return false;
}
/* all the tasks are of the same kind, so we only look at one */
Task *initialTask = (Task *) linitial(taskList);
if (initialTask->rowValuesLists != NIL)
{
/* found a multi-row INSERT */
return true;
}
return false;
}
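/*
 * Illustrative example (assumption: a multi-row INSERT such as
 * "INSERT INTO dist VALUES (1), (2)" is split into one task per shard, each
 * carrying rowValuesLists): the check above then forces shard-by-shard
 * execution to avoid distributed deadlocks between concurrent upserts.
 */
#ifdef CITUS_USAGE_SKETCH
static void
ExampleSequentialExecutionCheck(List *taskList)
{
	if (ShouldRunTasksSequentially(taskList))
	{
		elog(DEBUG1, "executing %d tasks one by one",
			 list_length(taskList));
	}
}
#endif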

View File

@ -0,0 +1,129 @@
/*-------------------------------------------------------------------------
*
* executor_util_tuples.c
*
* Utility functions for handling tuples during remote execution.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "distributed/executor_util.h"
#include "utils/lsyscache.h"
/*
* TupleDescGetAttBinaryInMetadata - Build an AttInMetadata structure based on
* the supplied TupleDesc. AttInMetadata can be used in conjunction with
* fmStringInfos containing binary encoded types to produce a properly formed
* tuple.
*
* NOTE: This function is a copy of the PG function TupleDescGetAttInMetadata,
* except that it uses getTypeBinaryInputInfo instead of getTypeInputInfo.
*/
AttInMetadata *
TupleDescGetAttBinaryInMetadata(TupleDesc tupdesc)
{
int natts = tupdesc->natts;
int i;
Oid atttypeid;
Oid attinfuncid;
AttInMetadata *attinmeta = (AttInMetadata *) palloc(sizeof(AttInMetadata));
/* "Bless" the tupledesc so that we can make rowtype datums with it */
attinmeta->tupdesc = BlessTupleDesc(tupdesc);
/*
* Gather info needed later to call the "in" function for each attribute
*/
FmgrInfo *attinfuncinfo = (FmgrInfo *) palloc0(natts * sizeof(FmgrInfo));
Oid *attioparams = (Oid *) palloc0(natts * sizeof(Oid));
int32 *atttypmods = (int32 *) palloc0(natts * sizeof(int32));
for (i = 0; i < natts; i++)
{
Form_pg_attribute att = TupleDescAttr(tupdesc, i);
/* Ignore dropped attributes */
if (!att->attisdropped)
{
atttypeid = att->atttypid;
getTypeBinaryInputInfo(atttypeid, &attinfuncid, &attioparams[i]);
fmgr_info(attinfuncid, &attinfuncinfo[i]);
atttypmods[i] = att->atttypmod;
}
}
attinmeta->attinfuncs = attinfuncinfo;
attinmeta->attioparams = attioparams;
attinmeta->atttypmods = atttypmods;
return attinmeta;
}
/*
* BuildTupleFromBytes - build a HeapTuple given user data in binary form.
* values is an array of StringInfos, one for each attribute of the return
* tuple. A NULL StringInfo pointer indicates we want to create a NULL field.
*
* NOTE: This function is a copy of the PG function BuildTupleFromCStrings,
* except that it uses ReceiveFunctionCall instead of InputFunctionCall.
*/
HeapTuple
BuildTupleFromBytes(AttInMetadata *attinmeta, fmStringInfo *values)
{
TupleDesc tupdesc = attinmeta->tupdesc;
int natts = tupdesc->natts;
int i;
Datum *dvalues = (Datum *) palloc(natts * sizeof(Datum));
bool *nulls = (bool *) palloc(natts * sizeof(bool));
/*
* Call the "in" function for each non-dropped attribute, even for nulls,
* to support domains.
*/
for (i = 0; i < natts; i++)
{
if (!TupleDescAttr(tupdesc, i)->attisdropped)
{
/* Non-dropped attributes */
dvalues[i] = ReceiveFunctionCall(&attinmeta->attinfuncs[i],
values[i],
attinmeta->attioparams[i],
attinmeta->atttypmods[i]);
if (values[i] != NULL)
{
nulls[i] = false;
}
else
{
nulls[i] = true;
}
}
else
{
/* Handle dropped attributes by setting to NULL */
dvalues[i] = (Datum) 0;
nulls[i] = true;
}
}
/*
* Form a tuple
*/
HeapTuple tuple = heap_form_tuple(tupdesc, dvalues, nulls);
/*
* Release locally palloc'd space. XXX would probably be good to pfree
* values of pass-by-reference datums, as well.
*/
pfree(dvalues);
pfree(nulls);
return tuple;
}
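/*
 * A minimal usage sketch of the two helpers above, assuming the caller
 * already holds a TupleDesc and one binary-encoded StringInfo per
 * attribute (a NULL pointer meaning SQL NULL).
 */
static HeapTuple
TupleFromBinaryValuesSketch(TupleDesc tupleDescriptor, fmStringInfo *binaryValues)
{
	/* resolve each attribute's binary receive function once */
	AttInMetadata *attInMetadata = TupleDescGetAttBinaryInMetadata(tupleDescriptor);

	/* run the receive functions and form the heap tuple */
	return BuildTupleFromBytes(attInMetadata, binaryValues);
}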

View File

@ -409,11 +409,13 @@ ExecutePlanIntoColocatedIntermediateResults(Oid targetRelationId,
columnNameList);
/* set up a DestReceiver that copies into the intermediate table */
const bool publishableData = true;
CitusCopyDestReceiver *copyDest = CreateCitusCopyDestReceiver(targetRelationId,
columnNameList,
partitionColumnIndex,
executorState,
intermediateResultIdPrefix);
intermediateResultIdPrefix,
publishableData);
ExecutePlanIntoDestReceiver(selectPlan, paramListInfo, (DestReceiver *) copyDest);
@ -443,10 +445,12 @@ ExecutePlanIntoRelation(Oid targetRelationId, List *insertTargetList,
columnNameList);
/* set up a DestReceiver that copies into the distributed table */
const bool publishableData = true;
CitusCopyDestReceiver *copyDest = CreateCitusCopyDestReceiver(targetRelationId,
columnNameList,
partitionColumnIndex,
executorState, NULL);
executorState, NULL,
publishableData);
ExecutePlanIntoDestReceiver(selectPlan, paramListInfo, (DestReceiver *) copyDest);

View File

@ -90,6 +90,7 @@
#include "distributed/local_executor.h"
#include "distributed/local_plan_cache.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/executor_util.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_server_executor.h"

View File

@ -802,6 +802,11 @@ GetObjectTypeString(ObjectType objType)
return "function";
}
case OBJECT_PUBLICATION:
{
return "publication";
}
case OBJECT_SCHEMA:
{
return "schema";

View File

@ -132,6 +132,7 @@ typedef struct ViewDependencyNode
static List * GetRelationSequenceDependencyList(Oid relationId);
static List * GetRelationFunctionDependencyList(Oid relationId);
static List * GetRelationTriggerFunctionDependencyList(Oid relationId);
static List * GetPublicationRelationsDependencyList(Oid relationId);
static List * GetRelationStatsSchemaDependencyList(Oid relationId);
static List * GetRelationIndicesDependencyList(Oid relationId);
static DependencyDefinition * CreateObjectAddressDependencyDef(Oid classId, Oid objectId);
@ -722,6 +723,11 @@ SupportedDependencyByCitus(const ObjectAddress *address)
return true;
}
case OCLASS_PUBLICATION:
{
return true;
}
case OCLASS_TSCONFIG:
{
return true;
@ -1656,6 +1662,36 @@ ExpandCitusSupportedTypes(ObjectAddressCollector *collector, ObjectAddress targe
List *ruleRefDepList = GetViewRuleReferenceDependencyList(relationId);
result = list_concat(result, ruleRefDepList);
}
break;
}
case PublicationRelationId:
{
Oid publicationId = target.objectId;
/*
* Publications do not depend directly on relations, because dropping
* the relation will only remove it from the publications. However,
* we add a dependency to ensure the relation is created first when
* adding a node.
*/
List *relationDependencyList =
GetPublicationRelationsDependencyList(publicationId);
result = list_concat(result, relationDependencyList);
/*
* As of PostgreSQL 15, the same applies to schemas.
*/
#if PG_VERSION_NUM >= PG_VERSION_15
List *schemaIdList =
GetPublicationSchemas(publicationId);
List *schemaDependencyList =
CreateObjectAddressDependencyDefList(NamespaceRelationId, schemaIdList);
result = list_concat(result, schemaDependencyList);
#endif
break;
}
default:
@ -1834,7 +1870,7 @@ static List *
GetRelationSequenceDependencyList(Oid relationId)
{
List *seqInfoList = NIL;
GetDependentSequencesWithRelation(relationId, &seqInfoList, 0);
GetDependentSequencesWithRelation(relationId, &seqInfoList, 0, DEPENDENCY_AUTO);
List *seqIdList = NIL;
SequenceInfo *seqInfo = NULL;
@ -1923,6 +1959,33 @@ GetRelationTriggerFunctionDependencyList(Oid relationId)
}
/*
* GetPublicationRelationsDependencyList creates a list of ObjectAddressDependencies for
* a publication on the Citus relations it contains. This helps make sure the Citus
* tables in a publication are created on a node before the publication itself.
*/
static List *
GetPublicationRelationsDependencyList(Oid publicationId)
{
List *allRelationIds = GetPublicationRelations(publicationId, PUBLICATION_PART_ROOT);
List *citusRelationIds = NIL;
Oid relationId = InvalidOid;
foreach_oid(relationId, allRelationIds)
{
if (!IsCitusTable(relationId))
{
continue;
}
citusRelationIds = lappend_oid(citusRelationIds, relationId);
}
return CreateObjectAddressDependencyDefList(RelationRelationId, citusRelationIds);
}
/*
* GetTypeConstraintDependencyDefinition creates a list of constraint dependency
* definitions for a given type

View File

@ -311,7 +311,7 @@ static void InvalidateDistTableCache(void);
static void InvalidateDistObjectCache(void);
static bool InitializeTableCacheEntry(int64 shardId, bool missingOk);
static bool IsCitusTableTypeInternal(char partitionMethod, char replicationModel,
CitusTableType tableType);
uint32 colocationId, CitusTableType tableType);
static bool RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry, bool
missingOk);
@ -450,7 +450,36 @@ bool
IsCitusTableTypeCacheEntry(CitusTableCacheEntry *tableEntry, CitusTableType tableType)
{
return IsCitusTableTypeInternal(tableEntry->partitionMethod,
tableEntry->replicationModel, tableType);
tableEntry->replicationModel,
tableEntry->colocationId, tableType);
}
/*
* HasDistributionKey returns true if the given Citus table has a
* distribution key.
*/
bool
HasDistributionKey(Oid relationId)
{
CitusTableCacheEntry *tableEntry = LookupCitusTableCacheEntry(relationId);
if (tableEntry == NULL)
{
ereport(ERROR, (errmsg("relation with oid %u is not a Citus table", relationId)));
}
return HasDistributionKeyCacheEntry(tableEntry);
}
/*
* HasDistributionKeyCacheEntry returns true if the given cache entry
* identifies a Citus table that has a distribution key.
*/
bool
HasDistributionKeyCacheEntry(CitusTableCacheEntry *tableEntry)
{
return tableEntry->partitionMethod != DISTRIBUTE_BY_NONE;
}
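/*
 * A minimal sketch of how callers might branch on the helper above;
 * the branch bodies are placeholders, not part of this change.
 */
static void
HandleTableByDistributionKeySketch(Oid relationId)
{
	if (HasDistributionKey(relationId))
	{
		/* hash- or range-distributed table: shard pruning applies */
	}
	else
	{
		/* reference table or Citus local table: a single shard group */
	}
}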
@ -460,7 +489,7 @@ IsCitusTableTypeCacheEntry(CitusTableCacheEntry *tableEntry, CitusTableType tabl
*/
static bool
IsCitusTableTypeInternal(char partitionMethod, char replicationModel,
CitusTableType tableType)
uint32 colocationId, CitusTableType tableType)
{
switch (tableType)
{
@ -501,12 +530,8 @@ IsCitusTableTypeInternal(char partitionMethod, char replicationModel,
case CITUS_LOCAL_TABLE:
{
return partitionMethod == DISTRIBUTE_BY_NONE &&
replicationModel != REPLICATION_MODEL_2PC;
}
case CITUS_TABLE_WITH_NO_DIST_KEY:
{
return partitionMethod == DISTRIBUTE_BY_NONE;
replicationModel != REPLICATION_MODEL_2PC &&
colocationId == INVALID_COLOCATION_ID;
}
case ANY_CITUS_TABLE_TYPE:
@ -529,33 +554,21 @@ IsCitusTableTypeInternal(char partitionMethod, char replicationModel,
char *
GetTableTypeName(Oid tableId)
{
bool regularTable = false;
char partitionMethod = ' ';
char replicationModel = ' ';
if (IsCitusTable(tableId))
{
CitusTableCacheEntry *referencingCacheEntry = GetCitusTableCacheEntry(tableId);
partitionMethod = referencingCacheEntry->partitionMethod;
replicationModel = referencingCacheEntry->replicationModel;
}
else
{
regularTable = true;
}
if (regularTable)
if (!IsCitusTable(tableId))
{
return "regular table";
}
else if (partitionMethod == 'h')
CitusTableCacheEntry *tableCacheEntry = GetCitusTableCacheEntry(tableId);
if (IsCitusTableTypeCacheEntry(tableCacheEntry, HASH_DISTRIBUTED))
{
return "distributed table";
}
else if (partitionMethod == 'n' && replicationModel == 't')
else if (IsCitusTableTypeCacheEntry(tableCacheEntry, REFERENCE_TABLE))
{
return "reference table";
}
else if (partitionMethod == 'n' && replicationModel != 't')
else if (IsCitusTableTypeCacheEntry(tableCacheEntry, CITUS_LOCAL_TABLE))
{
return "citus local table";
}
@ -577,6 +590,18 @@ IsCitusTable(Oid relationId)
}
/*
* IsCitusTableRangeVar returns whether the table named in the given
* rangeVar is a Citus table.
*/
bool
IsCitusTableRangeVar(RangeVar *rangeVar, LOCKMODE lockMode, bool missingOK)
{
Oid relationId = RangeVarGetRelid(rangeVar, lockMode, missingOK);
return IsCitusTable(relationId);
}
/*
* IsCitusTableViaCatalog returns whether the given relation is a
* distributed table or not.
@ -765,14 +790,28 @@ PgDistPartitionTupleViaCatalog(Oid relationId)
/*
* IsCitusLocalTableByDistParams returns true if given partitionMethod and
* replicationModel would identify a citus local table.
* IsReferenceTableByDistParams returns true if given partitionMethod and
* replicationModel would identify a reference table.
*/
bool
IsCitusLocalTableByDistParams(char partitionMethod, char replicationModel)
IsReferenceTableByDistParams(char partitionMethod, char replicationModel)
{
return partitionMethod == DISTRIBUTE_BY_NONE &&
replicationModel != REPLICATION_MODEL_2PC;
replicationModel == REPLICATION_MODEL_2PC;
}
/*
* IsCitusLocalTableByDistParams returns true if given partitionMethod,
* replicationModel and colocationId would identify a citus local table.
*/
bool
IsCitusLocalTableByDistParams(char partitionMethod, char replicationModel,
uint32 colocationId)
{
return partitionMethod == DISTRIBUTE_BY_NONE &&
replicationModel != REPLICATION_MODEL_2PC &&
colocationId == INVALID_COLOCATION_ID;
}
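/*
 * A simplified classification sketch combining the two predicates above.
 * The fall-through label is a simplification: with the new colocationId
 * check, DISTRIBUTE_BY_NONE plus a valid colocation id now falls outside
 * both predicates as well.
 */
static const char *
ClassifyDistParamsSketch(char partitionMethod, char replicationModel,
						 uint32 colocationId)
{
	if (IsReferenceTableByDistParams(partitionMethod, replicationModel))
	{
		return "reference table";
	}
	else if (IsCitusLocalTableByDistParams(partitionMethod, replicationModel,
										   colocationId))
	{
		return "citus local table";
	}
	return "distributed table";
}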
@ -4837,11 +4876,14 @@ CitusTableTypeIdList(CitusTableType citusTableType)
Datum partMethodDatum = datumArray[Anum_pg_dist_partition_partmethod - 1];
Datum replicationModelDatum = datumArray[Anum_pg_dist_partition_repmodel - 1];
Datum colocationIdDatum = datumArray[Anum_pg_dist_partition_colocationid - 1];
Oid partitionMethod = DatumGetChar(partMethodDatum);
Oid replicationModel = DatumGetChar(replicationModelDatum);
uint32 colocationId = DatumGetUInt32(colocationIdDatum);
if (IsCitusTableTypeInternal(partitionMethod, replicationModel, citusTableType))
if (IsCitusTableTypeInternal(partitionMethod, replicationModel, colocationId,
citusTableType))
{
Datum relationIdDatum = datumArray[Anum_pg_dist_partition_logicalrelid - 1];

File diff suppressed because it is too large

View File

@ -985,7 +985,7 @@ AppendShardSizeQuery(StringInfo selectQuery, ShardInterval *shardInterval)
appendStringInfo(selectQuery, "SELECT " UINT64_FORMAT " AS shard_id, ", shardId);
appendStringInfo(selectQuery, "%s AS shard_name, ", quotedShardName);
appendStringInfo(selectQuery, PG_RELATION_SIZE_FUNCTION, quotedShardName);
appendStringInfo(selectQuery, PG_TOTAL_RELATION_SIZE_FUNCTION, quotedShardName);
}
@ -1670,6 +1670,48 @@ TupleToGroupShardPlacement(TupleDesc tupleDescriptor, HeapTuple heapTuple)
}
/*
* LookupTaskPlacementHostAndPort sets the nodename and nodeport for the given task placement
* with a lookup.
*/
void
LookupTaskPlacementHostAndPort(ShardPlacement *taskPlacement, char **nodeName,
int *nodePort)
{
if (IsDummyPlacement(taskPlacement))
{
/*
* If we create a dummy placement for the local node, it is possible
* that the entry doesn't exist in pg_dist_node, hence a lookup will fail.
* In that case we want to use the dummy placement's values.
*/
*nodeName = taskPlacement->nodeName;
*nodePort = taskPlacement->nodePort;
}
else
{
/*
* We want to lookup the node information again since it is possible that
* there were changes in pg_dist_node and we will get those invalidations
* in LookupNodeForGroup.
*/
WorkerNode *workerNode = LookupNodeForGroup(taskPlacement->groupId);
*nodeName = workerNode->workerName;
*nodePort = workerNode->workerPort;
}
}
/*
* IsDummyPlacement returns true if the given placement is a dummy placement.
*/
bool
IsDummyPlacement(ShardPlacement *taskPlacement)
{
return taskPlacement->nodeId == LOCAL_NODE_ID;
}
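/*
 * A minimal usage sketch of the lookup helper above; taskPlacement is an
 * assumed input and OpenConnectionTo is a hypothetical helper.
 */
static void
ConnectToTaskPlacementSketch(ShardPlacement *taskPlacement)
{
	char *nodeName = NULL;
	int nodePort = 0;

	/* dummy placements keep their own values; others are re-resolved */
	LookupTaskPlacementHostAndPort(taskPlacement, &nodeName, &nodePort);

	OpenConnectionTo(nodeName, nodePort); /* hypothetical helper */
}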
/*
* InsertShardRow opens the shard system catalog, and inserts a new row with the
* given values into that system catalog. Note that we allow the user to pass in

File diff suppressed because it is too large

View File

@ -425,6 +425,7 @@ ErrorIfCurrentUserCanNotDistributeObject(char *textType, ObjectType type,
case OBJECT_COLLATION:
case OBJECT_VIEW:
case OBJECT_ROLE:
case OBJECT_PUBLICATION:
{
check_object_ownership(userId, type, *addr, node, *relation);
break;

View File

@ -215,6 +215,7 @@ CreateColocatedShards(Oid targetRelationId, Oid sourceRelationId, bool
{
bool colocatedShard = true;
List *insertedShardPlacements = NIL;
List *insertedShardIds = NIL;
/* make sure that tables are hash partitioned */
CheckHashPartitionedTable(targetRelationId);
@ -254,7 +255,9 @@ CreateColocatedShards(Oid targetRelationId, Oid sourceRelationId, bool
foreach_ptr(sourceShardInterval, sourceShardIntervalList)
{
uint64 sourceShardId = sourceShardInterval->shardId;
uint64 newShardId = GetNextShardId();
uint64 *newShardIdPtr = (uint64 *) palloc0(sizeof(uint64));
*newShardIdPtr = GetNextShardId();
insertedShardIds = lappend(insertedShardIds, newShardIdPtr);
int32 shardMinValue = DatumGetInt32(sourceShardInterval->minValue);
int32 shardMaxValue = DatumGetInt32(sourceShardInterval->maxValue);
@ -263,7 +266,7 @@ CreateColocatedShards(Oid targetRelationId, Oid sourceRelationId, bool
List *sourceShardPlacementList = ShardPlacementListSortedByWorker(
sourceShardId);
InsertShardRow(targetRelationId, newShardId, targetShardStorageType,
InsertShardRow(targetRelationId, *newShardIdPtr, targetShardStorageType,
shardMinValueText, shardMaxValueText);
ShardPlacement *sourcePlacement = NULL;
@ -272,21 +275,26 @@ CreateColocatedShards(Oid targetRelationId, Oid sourceRelationId, bool
int32 groupId = sourcePlacement->groupId;
const uint64 shardSize = 0;
/*
* Optimistically add the shard placement row to pg_dist_shard_placement;
* in case of any error it will be rolled back.
*/
uint64 shardPlacementId = InsertShardPlacementRow(newShardId,
InsertShardPlacementRow(*newShardIdPtr,
INVALID_PLACEMENT_ID,
shardSize,
groupId);
ShardPlacement *shardPlacement = LoadShardPlacement(newShardId,
shardPlacementId);
insertedShardPlacements = lappend(insertedShardPlacements, shardPlacement);
}
}
/*
* Load shard placements for the shards at once after all placement insertions
* have finished. That prevents the MetadataCache from rebuilding unnecessarily after
* each placement insertion.
*/
uint64 *shardIdPtr;
foreach_ptr(shardIdPtr, insertedShardIds)
{
List *placementsForShard = ShardPlacementList(*shardIdPtr);
insertedShardPlacements = list_concat(insertedShardPlacements,
placementsForShard);
}
CreateShardsOnWorkers(targetRelationId, insertedShardPlacements,
useExclusiveConnections, colocatedShard);
}

View File

@ -461,10 +461,7 @@ ResolveRelationId(text *relationName, bool missingOk)
* definition, optional column storage and statistics definitions, and index
* constraint and trigger definitions.
* When IncludeIdentities is NO_IDENTITY, the function does not include identity column
* specifications. When it's INCLUDE_IDENTITY_AS_SEQUENCE_DEFAULTS, the function
* uses sequences and set them as default values for identity columns by using exactly
* the same approach with worker_nextval('sequence') & nextval('sequence') logic
* described above. When it's INCLUDE_IDENTITY it creates GENERATED .. AS IDENTITY clauses.
* specifications. When it's INCLUDE_IDENTITY it creates GENERATED .. AS IDENTITY clauses.
*/
List *
GetFullTableCreationCommands(Oid relationId,
@ -500,6 +497,15 @@ GetFullTableCreationCommands(Oid relationId,
tableDDLEventList = lappend(tableDDLEventList,
truncateTriggerCommand);
}
/*
* For identity column sequences, we only need to modify
* their min/max values to produce unique values on the worker nodes.
*/
List *identitySequenceDependencyCommandList =
IdentitySequenceDependencyCommandList(relationId);
tableDDLEventList = list_concat(tableDDLEventList,
identitySequenceDependencyCommandList);
}
tableDDLEventList = list_concat(tableDDLEventList, postLoadCreationCommandList);

View File

@ -190,6 +190,19 @@ typedef struct WorkerShardStatistics
HTAB *statistics;
} WorkerShardStatistics;
/* ShardMoveDependencyInfo stores the taskId that any new shard move task within the corresponding colocation group must take a dependency on */
typedef struct ShardMoveDependencyInfo
{
int64 key;
int64 taskId;
} ShardMoveDependencyInfo;
typedef struct ShardMoveDependencies
{
HTAB *colocationDependencies;
HTAB *nodeDependencies;
} ShardMoveDependencies;
char *VariablesToBePassedToNewConnections = NULL;
/* static declarations for main logic */
@ -475,6 +488,7 @@ GetRebalanceSteps(RebalanceOptions *options)
/* sort the lists to make the function more deterministic */
List *activeWorkerList = SortedActiveWorkers();
List *activeShardPlacementListList = NIL;
List *unbalancedShards = NIL;
Oid relationId = InvalidOid;
foreach_oid(relationId, options->relationIdList)
@ -490,8 +504,29 @@ GetRebalanceSteps(RebalanceOptions *options)
shardPlacementList, options->workerNode);
}
activeShardPlacementListList =
lappend(activeShardPlacementListList, activeShardPlacementListForRelation);
if (list_length(activeShardPlacementListForRelation) >= list_length(
activeWorkerList))
{
activeShardPlacementListList = lappend(activeShardPlacementListList,
activeShardPlacementListForRelation);
}
else
{
/*
* If the number of shard groups are less than the number of worker nodes,
* at least one of the worker nodes will remain empty. For such cases,
* we consider those shard groups as a colocation group and try to
* distribute them across the cluster.
*/
unbalancedShards = list_concat(unbalancedShards,
activeShardPlacementListForRelation);
}
}
if (list_length(unbalancedShards) > 0)
{
activeShardPlacementListList = lappend(activeShardPlacementListList,
unbalancedShards);
}
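/*
 * Worked example (illustrative): with 4 active workers, a relation that
 * has only 2 shard placements lands in unbalancedShards instead of its
 * own list; all such small shard groups are then appended above as one
 * combined "colocation group", so the rebalancer can still spread them
 * instead of leaving some workers empty.
 */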
if (options->threshold < options->rebalanceStrategy->minimumThreshold)
@ -1796,10 +1831,10 @@ static void
RebalanceTableShards(RebalanceOptions *options, Oid shardReplicationModeOid)
{
char transferMode = LookupShardTransferMode(shardReplicationModeOid);
EnsureReferenceTablesExistOnAllNodesExtended(transferMode);
if (list_length(options->relationIdList) == 0)
{
EnsureReferenceTablesExistOnAllNodesExtended(transferMode);
return;
}
@ -1814,6 +1849,25 @@ RebalanceTableShards(RebalanceOptions *options, Oid shardReplicationModeOid)
List *placementUpdateList = GetRebalanceSteps(options);
if (transferMode == TRANSFER_MODE_AUTOMATIC)
{
/*
* If the shard transfer mode is set to auto, we should check beforehand
* if we are able to use logical replication to transfer shards or not.
* We throw an error if any of the tables do not have a replica identity, which
* is required for logical replication to replicate UPDATE and DELETE commands.
*/
PlacementUpdateEvent *placementUpdate = NULL;
foreach_ptr(placementUpdate, placementUpdateList)
{
Oid relationId = RelationIdForShard(placementUpdate->shardId);
List *colocatedTableList = ColocatedTableList(relationId);
VerifyTablesHaveReplicaIdentity(colocatedTableList);
}
}
EnsureReferenceTablesExistOnAllNodesExtended(transferMode);
if (list_length(placementUpdateList) == 0)
{
return;
@ -1857,6 +1911,137 @@ ErrorOnConcurrentRebalance(RebalanceOptions *options)
}
/*
* GetColocationId function returns the colocationId of the shard in a PlacementUpdateEvent.
*/
static int64
GetColocationId(PlacementUpdateEvent *move)
{
ShardInterval *shardInterval = LoadShardInterval(move->shardId);
CitusTableCacheEntry *citusTableCacheEntry = GetCitusTableCacheEntry(
shardInterval->relationId);
return citusTableCacheEntry->colocationId;
}
/*
* InitializeShardMoveDependencies function creates the hash maps that we use to track
* the latest moves so that subsequent moves with the same properties must take a dependency
* on them. There are two hash maps. One is for tracking the latest move scheduled in a
* given colocation group and the other one is for tracking the latest move which involves
* a given node either as its source node or its target node.
*/
static ShardMoveDependencies
InitializeShardMoveDependencies()
{
ShardMoveDependencies shardMoveDependencies;
shardMoveDependencies.colocationDependencies = CreateSimpleHashWithNameAndSize(int64,
ShardMoveDependencyInfo,
"colocationDependencyHashMap",
6);
shardMoveDependencies.nodeDependencies = CreateSimpleHashWithNameAndSize(int64,
ShardMoveDependencyInfo,
"nodeDependencyHashMap",
6);
return shardMoveDependencies;
}
/*
* GenerateTaskMoveDependencyList creates and returns an array of taskIds that
* the move must take a dependency on; the array size is returned in *nDepends.
*/
static int64 *
GenerateTaskMoveDependencyList(PlacementUpdateEvent *move, int64 colocationId,
ShardMoveDependencies shardMoveDependencies, int *nDepends)
{
HTAB *dependsList = CreateSimpleHashSetWithNameAndSize(int64,
"shardMoveDependencyList", 0);
bool found;
/* Check if there exists a move in the same colocation group scheduled earlier. */
ShardMoveDependencyInfo *shardMoveDependencyInfo = hash_search(
shardMoveDependencies.colocationDependencies, &colocationId, HASH_ENTER, &found);
if (found)
{
hash_search(dependsList, &shardMoveDependencyInfo->taskId, HASH_ENTER, NULL);
}
/* Check if there exists a move scheduled earlier whose source or target node
* overlaps with the current move's source node. */
shardMoveDependencyInfo = hash_search(
shardMoveDependencies.nodeDependencies, &move->sourceNode->nodeId, HASH_ENTER,
&found);
if (found)
{
hash_search(dependsList, &shardMoveDependencyInfo->taskId, HASH_ENTER, NULL);
}
/* Check if there exists a move scheduled earlier whose source or target node
* overlaps with the current move's target node. */
shardMoveDependencyInfo = hash_search(
shardMoveDependencies.nodeDependencies, &move->targetNode->nodeId, HASH_ENTER,
&found);
if (found)
{
hash_search(dependsList, &shardMoveDependencyInfo->taskId, HASH_ENTER, NULL);
}
*nDepends = hash_get_num_entries(dependsList);
int64 *dependsArray = NULL;
if (*nDepends > 0)
{
HASH_SEQ_STATUS seq;
dependsArray = palloc((*nDepends) * sizeof(int64));
hash_seq_init(&seq, dependsList);
int i = 0;
int64 *dependsTaskId;
while ((dependsTaskId = (int64 *) hash_seq_search(&seq)) != NULL)
{
dependsArray[i++] = *dependsTaskId;
}
}
return dependsArray;
}
/*
* UpdateShardMoveDependencies function updates the dependency maps with the latest move's taskId.
*/
static void
UpdateShardMoveDependencies(PlacementUpdateEvent *move, uint64 colocationId, int64 taskId,
ShardMoveDependencies shardMoveDependencies)
{
ShardMoveDependencyInfo *shardMoveDependencyInfo = hash_search(
shardMoveDependencies.colocationDependencies, &colocationId, HASH_ENTER, NULL);
shardMoveDependencyInfo->taskId = taskId;
shardMoveDependencyInfo = hash_search(shardMoveDependencies.nodeDependencies,
&move->sourceNode->nodeId, HASH_ENTER, NULL);
shardMoveDependencyInfo->taskId = taskId;
shardMoveDependencyInfo = hash_search(shardMoveDependencies.nodeDependencies,
&move->targetNode->nodeId, HASH_ENTER, NULL);
shardMoveDependencyInfo->taskId = taskId;
}
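/*
 * Worked example (illustrative) of how the helpers above cooperate:
 * suppose move A (colocation group 7, node 1 -> node 2) is scheduled as
 * task 101. UpdateShardMoveDependencies records 101 under colocation key 7
 * and under node keys 1 and 2. A later move in group 7, or any move that
 * touches node 1 or node 2, then finds task 101 via
 * GenerateTaskMoveDependencyList and is scheduled to wait for it.
 */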
/*
* RebalanceTableShardsBackground rebalances the shards for the relations
* inside the relationIdList across the different workers. It does so using our
@ -1894,12 +2079,6 @@ RebalanceTableShardsBackground(RebalanceOptions *options, Oid shardReplicationMo
EnsureTableOwner(colocatedTableId);
}
if (shardTransferMode == TRANSFER_MODE_AUTOMATIC)
{
/* make sure that all tables included in the rebalance have a replica identity*/
VerifyTablesHaveReplicaIdentity(colocatedTableList);
}
List *placementUpdateList = GetRebalanceSteps(options);
if (list_length(placementUpdateList) == 0)
@ -1908,6 +2087,23 @@ RebalanceTableShardsBackground(RebalanceOptions *options, Oid shardReplicationMo
return 0;
}
if (shardTransferMode == TRANSFER_MODE_AUTOMATIC)
{
/*
* If the shard transfer mode is set to auto, we should check beforehand
* if we are able to use logical replication to transfer shards or not.
* We throw an error if any of the tables do not have a replica identity, which
* is required for logical replication to replicate UPDATE and DELETE commands.
*/
PlacementUpdateEvent *placementUpdate = NULL;
foreach_ptr(placementUpdate, placementUpdateList)
{
relationId = RelationIdForShard(placementUpdate->shardId);
List *colocatedTables = ColocatedTableList(relationId);
VerifyTablesHaveReplicaIdentity(colocatedTables);
}
}
DropOrphanedResourcesInSeparateTransaction();
/* find the name of the shard transfer mode to interpolate in the scheduled command */
@ -1922,18 +2118,8 @@ RebalanceTableShardsBackground(RebalanceOptions *options, Oid shardReplicationMo
StringInfoData buf = { 0 };
initStringInfo(&buf);
/*
* Currently we only have two tasks that any move can depend on:
* - replicating reference tables
* - the previous move
*
* prevJobIdx tells what slot to write the id of the task into. We only use both slots
* if we are actually replicating reference tables.
*/
int64 prevJobId[2] = { 0 };
int prevJobIdx = 0;
List *referenceTableIdList = NIL;
int64 replicateRefTablesTaskId = 0;
if (HasNodesWithMissingReferenceTables(&referenceTableIdList))
{
@ -1949,15 +2135,15 @@ RebalanceTableShardsBackground(RebalanceOptions *options, Oid shardReplicationMo
appendStringInfo(&buf,
"SELECT pg_catalog.replicate_reference_tables(%s)",
quote_literal_cstr(shardTranferModeLabel));
BackgroundTask *task = ScheduleBackgroundTask(jobId, GetUserId(), buf.data,
prevJobIdx, prevJobId);
prevJobId[prevJobIdx] = task->taskid;
prevJobIdx++;
BackgroundTask *task = ScheduleBackgroundTask(jobId, GetUserId(), buf.data, 0,
NULL);
replicateRefTablesTaskId = task->taskid;
}
PlacementUpdateEvent *move = NULL;
bool first = true;
int prevMoveIndex = prevJobIdx;
ShardMoveDependencies shardMoveDependencies = InitializeShardMoveDependencies();
foreach_ptr(move, placementUpdateList)
{
resetStringInfo(&buf);
@ -1969,14 +2155,27 @@ RebalanceTableShardsBackground(RebalanceOptions *options, Oid shardReplicationMo
move->targetNode->nodeId,
quote_literal_cstr(shardTranferModeLabel));
BackgroundTask *task = ScheduleBackgroundTask(jobId, GetUserId(), buf.data,
prevJobIdx, prevJobId);
prevJobId[prevMoveIndex] = task->taskid;
if (first)
int64 colocationId = GetColocationId(move);
int nDepends = 0;
int64 *dependsArray = GenerateTaskMoveDependencyList(move, colocationId,
shardMoveDependencies,
&nDepends);
if (nDepends == 0 && replicateRefTablesTaskId > 0)
{
first = false;
prevJobIdx++;
nDepends = 1;
dependsArray = palloc(nDepends * sizeof(int64));
dependsArray[0] = replicateRefTablesTaskId;
}
BackgroundTask *task = ScheduleBackgroundTask(jobId, GetUserId(), buf.data,
nDepends,
dependsArray);
UpdateShardMoveDependencies(move, colocationId, task->taskid,
shardMoveDependencies);
}
ereport(NOTICE,

View File

@ -70,22 +70,43 @@ typedef struct ShardCommandList
List *ddlCommandList;
} ShardCommandList;
static const char *ShardTransferTypeNames[] = {
[SHARD_TRANSFER_INVALID_FIRST] = "unknown",
[SHARD_TRANSFER_MOVE] = "move",
[SHARD_TRANSFER_COPY] = "copy",
};
static const char *ShardTransferTypeNamesCapitalized[] = {
[SHARD_TRANSFER_INVALID_FIRST] = "unknown",
[SHARD_TRANSFER_MOVE] = "Move",
[SHARD_TRANSFER_COPY] = "Copy",
};
static const char *ShardTransferTypeNamesContinuous[] = {
[SHARD_TRANSFER_INVALID_FIRST] = "unknown",
[SHARD_TRANSFER_MOVE] = "Moving",
[SHARD_TRANSFER_COPY] = "Copying",
};
static const char *ShardTransferTypeFunctionNames[] = {
[SHARD_TRANSFER_INVALID_FIRST] = "unknown",
[SHARD_TRANSFER_MOVE] = "citus_move_shard_placement",
[SHARD_TRANSFER_COPY] = "citus_copy_shard_placement",
};
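/*
 * Illustrative lookups: the transfer type indexes the arrays above, e.g.
 * ShardTransferTypeNames[SHARD_TRANSFER_MOVE] is "move" and
 * ShardTransferTypeNamesContinuous[SHARD_TRANSFER_COPY] is "Copying".
 */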
/* local function forward declarations */
static bool CanUseLogicalReplication(Oid relationId, char shardReplicationMode);
static void ErrorIfTableCannotBeReplicated(Oid relationId);
static void ErrorIfTargetNodeIsNotSafeToCopyTo(const char *targetNodeName,
int targetNodePort);
static void ErrorIfTargetNodeIsNotSafeForTransfer(const char *targetNodeName,
int targetNodePort,
ShardTransferType transferType);
static void ErrorIfSameNode(char *sourceNodeName, int sourceNodePort,
char *targetNodeName, int targetNodePort,
const char *operationName);
static void ReplicateColocatedShardPlacement(int64 shardId, char *sourceNodeName,
int32 sourceNodePort, char *targetNodeName,
int32 targetNodePort,
char shardReplicationMode);
static void CopyShardTables(List *shardIntervalList, char *sourceNodeName,
int32 sourceNodePort, char *targetNodeName,
int32 targetNodePort, bool useLogicalReplication,
char *operationName);
const char *operationName);
static void CopyShardTablesViaLogicalReplication(List *shardIntervalList,
char *sourceNodeName,
int32 sourceNodePort,
@ -100,7 +121,7 @@ static void EnsureShardCanBeCopied(int64 shardId, const char *sourceNodeName,
int32 targetNodePort);
static List * RecreateTableDDLCommandList(Oid relationId);
static void EnsureTableListOwner(List *tableIdList);
static void EnsureTableListSuitableForReplication(List *tableIdList);
static void ErrorIfReplicatingDistributedTableWithFKeys(List *tableIdList);
static void DropShardPlacementsFromMetadata(List *shardList,
char *nodeName,
@ -112,12 +133,28 @@ static void UpdateColocatedShardPlacementMetadataOnWorkers(int64 shardId,
int32 targetNodePort);
static bool IsShardListOnNode(List *colocatedShardList, char *targetNodeName,
uint32 targetPort);
static void SetupRebalanceMonitorForShardTransfer(uint64 shardId, Oid distributedTableId,
char *sourceNodeName,
uint32 sourceNodePort,
char *targetNodeName,
uint32 targetNodePort,
ShardTransferType transferType);
static void CheckSpaceConstraints(MultiConnection *connection,
uint64 colocationSizeInBytes);
static void EnsureAllShardsCanBeCopied(List *colocatedShardList,
char *sourceNodeName, uint32 sourceNodePort,
char *targetNodeName, uint32 targetNodePort);
static void EnsureEnoughDiskSpaceForShardMove(List *colocatedShardList,
char *sourceNodeName, uint32 sourceNodePort,
char *targetNodeName, uint32
targetNodePort);
char *targetNodeName, uint32 targetNodePort,
ShardTransferType transferType);
static bool TransferAlreadyCompleted(List *colocatedShardList,
char *sourceNodeName, uint32 sourceNodePort,
char *targetNodeName, uint32 targetNodePort,
ShardTransferType transferType);
static void LockColocatedRelationsForMove(List *colocatedTableList);
static void ErrorIfForeignTableForShardTransfer(List *colocatedTableList,
ShardTransferType transferType);
static List * RecreateShardDDLCommandList(ShardInterval *shardInterval,
const char *sourceNodeName,
int32 sourceNodePort);
@ -163,9 +200,9 @@ citus_copy_shard_placement(PG_FUNCTION_ARGS)
char shardReplicationMode = LookupShardTransferMode(shardReplicationModeOid);
ReplicateColocatedShardPlacement(shardId, sourceNodeName, sourceNodePort,
TransferShards(shardId, sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort,
shardReplicationMode);
shardReplicationMode, SHARD_TRANSFER_COPY);
PG_RETURN_VOID();
}
@ -192,10 +229,9 @@ citus_copy_shard_placement_with_nodeid(PG_FUNCTION_ARGS)
char shardReplicationMode = LookupShardTransferMode(shardReplicationModeOid);
ReplicateColocatedShardPlacement(shardId,
sourceNode->workerName, sourceNode->workerPort,
TransferShards(shardId, sourceNode->workerName, sourceNode->workerPort,
targetNode->workerName, targetNode->workerPort,
shardReplicationMode);
shardReplicationMode, SHARD_TRANSFER_COPY);
PG_RETURN_VOID();
}
@ -228,9 +264,9 @@ master_copy_shard_placement(PG_FUNCTION_ARGS)
ereport(WARNING, (errmsg("do_repair argument is deprecated")));
}
ReplicateColocatedShardPlacement(shardId, sourceNodeName, sourceNodePort,
TransferShards(shardId, sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort,
shardReplicationMode);
shardReplicationMode, SHARD_TRANSFER_COPY);
PG_RETURN_VOID();
@ -264,9 +300,10 @@ citus_move_shard_placement(PG_FUNCTION_ARGS)
int32 targetNodePort = PG_GETARG_INT32(4);
Oid shardReplicationModeOid = PG_GETARG_OID(5);
citus_move_shard_placement_internal(shardId, sourceNodeName, sourceNodePort,
char shardReplicationMode = LookupShardTransferMode(shardReplicationModeOid);
TransferShards(shardId, sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort,
shardReplicationModeOid);
shardReplicationMode, SHARD_TRANSFER_MOVE);
PG_RETURN_VOID();
}
@ -291,126 +328,111 @@ citus_move_shard_placement_with_nodeid(PG_FUNCTION_ARGS)
WorkerNode *sourceNode = FindNodeWithNodeId(sourceNodeId, missingOk);
WorkerNode *targetNode = FindNodeWithNodeId(targetNodeId, missingOk);
citus_move_shard_placement_internal(shardId, sourceNode->workerName,
char shardReplicationMode = LookupShardTransferMode(shardReplicationModeOid);
TransferShards(shardId, sourceNode->workerName,
sourceNode->workerPort, targetNode->workerName,
targetNode->workerPort,
shardReplicationModeOid);
targetNode->workerPort, shardReplicationMode, SHARD_TRANSFER_MOVE);
PG_RETURN_VOID();
}
/*
* citus_move_shard_placement_internal is the internal function for shard moves.
* TransferShards is the common entry point for shard transfers, i.e. both shard moves and shard copies.
*/
void
citus_move_shard_placement_internal(int64 shardId, char *sourceNodeName,
TransferShards(int64 shardId, char *sourceNodeName,
int32 sourceNodePort, char *targetNodeName,
int32 targetNodePort, Oid shardReplicationModeOid)
int32 targetNodePort, char shardReplicationMode,
ShardTransferType transferType)
{
ListCell *colocatedTableCell = NULL;
ListCell *colocatedShardCell = NULL;
/* strings to be used in log messages */
const char *operationName = ShardTransferTypeNames[transferType];
const char *operationNameCapitalized =
ShardTransferTypeNamesCapitalized[transferType];
const char *operationFunctionName = ShardTransferTypeFunctionNames[transferType];
/* cannot transfer shard to the same node */
ErrorIfSameNode(sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort,
"move");
Oid relationId = RelationIdForShard(shardId);
ErrorIfMoveUnsupportedTableType(relationId);
ErrorIfTargetNodeIsNotSafeToMove(targetNodeName, targetNodePort);
AcquirePlacementColocationLock(relationId, ExclusiveLock, "move");
operationName);
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid distributedTableId = shardInterval->relationId;
/* error if unsupported shard transfer */
if (transferType == SHARD_TRANSFER_MOVE)
{
ErrorIfMoveUnsupportedTableType(distributedTableId);
}
else if (transferType == SHARD_TRANSFER_COPY)
{
ErrorIfTableCannotBeReplicated(distributedTableId);
EnsureNoModificationsHaveBeenDone();
}
ErrorIfTargetNodeIsNotSafeForTransfer(targetNodeName, targetNodePort, transferType);
AcquirePlacementColocationLock(distributedTableId, ExclusiveLock, operationName);
List *colocatedTableList = ColocatedTableList(distributedTableId);
List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
foreach(colocatedTableCell, colocatedTableList)
EnsureTableListOwner(colocatedTableList);
if (transferType == SHARD_TRANSFER_MOVE)
{
Oid colocatedTableId = lfirst_oid(colocatedTableCell);
/* check that user has owner rights in all co-located tables */
EnsureTableOwner(colocatedTableId);
/*
* Block concurrent DDL / TRUNCATE commands on the relation. Similarly,
* block concurrent citus_move_shard_placement() on any shard of
* the same relation. This is OK for now since we're executing shard
* moves sequentially anyway.
*/
LockRelationOid(colocatedTableId, ShareUpdateExclusiveLock);
LockColocatedRelationsForMove(colocatedTableList);
}
if (IsForeignTable(relationId))
ErrorIfForeignTableForShardTransfer(colocatedTableList, transferType);
if (transferType == SHARD_TRANSFER_COPY)
{
char *relationName = get_rel_name(colocatedTableId);
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot move shard"),
errdetail("Table %s is a foreign table. Moving "
"shards backed by foreign tables is "
"not supported.", relationName)));
ErrorIfReplicatingDistributedTableWithFKeys(colocatedTableList);
}
}
/* we sort colocatedShardList so that lock operations will not cause any deadlocks */
colocatedShardList = SortList(colocatedShardList, CompareShardIntervalsById);
/*
* If there are no active placements on the source and only active placements on
* the target node, we assume the copy to already be done.
* We sort shardIntervalList so that lock operations will not cause any
* deadlocks.
*/
if (IsShardListOnNode(colocatedShardList, targetNodeName, targetNodePort) &&
!IsShardListOnNode(colocatedShardList, sourceNodeName, sourceNodePort))
colocatedShardList = SortList(colocatedShardList, CompareShardIntervalsById);
if (TransferAlreadyCompleted(colocatedShardList,
sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort,
transferType))
{
/* if the transfer is already completed, we can return right away */
ereport(WARNING, (errmsg("shard is already present on node %s:%d",
targetNodeName, targetNodePort),
errdetail("Move may have already completed.")));
errdetail("%s may have already completed.",
operationNameCapitalized)));
return;
}
foreach(colocatedShardCell, colocatedShardList)
{
ShardInterval *colocatedShard = (ShardInterval *) lfirst(colocatedShardCell);
uint64 colocatedShardId = colocatedShard->shardId;
EnsureShardCanBeCopied(colocatedShardId, sourceNodeName, sourceNodePort,
EnsureAllShardsCanBeCopied(colocatedShardList, sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort);
}
char shardReplicationMode = LookupShardTransferMode(shardReplicationModeOid);
if (shardReplicationMode == TRANSFER_MODE_AUTOMATIC)
{
VerifyTablesHaveReplicaIdentity(colocatedTableList);
}
EnsureEnoughDiskSpaceForShardMove(colocatedShardList, sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort);
EnsureEnoughDiskSpaceForShardMove(colocatedShardList,
sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort, transferType);
/*
* We want to be able to track progress of shard moves using
* get_rebalancer_progress. If this move is initiated by the rebalancer,
* then the rebalancer call has already set up the shared memory that is
* used to do that. But if citus_move_shard_placement is called directly by
* the user (or through any other mechanism), then the shared memory is not
* set up yet. In that case we do it here.
*/
if (!IsRebalancerInternalBackend())
{
WorkerNode *sourceNode = FindWorkerNode(sourceNodeName, sourceNodePort);
WorkerNode *targetNode = FindWorkerNode(targetNodeName, targetNodePort);
PlacementUpdateEvent *placementUpdateEvent = palloc0(
sizeof(PlacementUpdateEvent));
placementUpdateEvent->updateType = PLACEMENT_UPDATE_MOVE;
placementUpdateEvent->shardId = shardId;
placementUpdateEvent->sourceNode = sourceNode;
placementUpdateEvent->targetNode = targetNode;
SetupRebalanceMonitor(list_make1(placementUpdateEvent), relationId,
REBALANCE_PROGRESS_MOVING,
PLACEMENT_UPDATE_STATUS_SETTING_UP);
}
SetupRebalanceMonitorForShardTransfer(shardId, distributedTableId,
sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort,
transferType);
UpdatePlacementUpdateStatusForShardIntervalList(
colocatedShardList,
@ -428,7 +450,7 @@ citus_move_shard_placement_internal(int64 shardId, char *sourceNodeName,
{
BlockWritesToShardList(colocatedShardList);
}
else
else if (transferType == SHARD_TRANSFER_MOVE)
{
/*
* We prevent multiple shard moves in a transaction that use logical
@ -452,6 +474,20 @@ citus_move_shard_placement_internal(int64 shardId, char *sourceNodeName,
PlacementMovedUsingLogicalReplicationInTX = true;
}
if (transferType == SHARD_TRANSFER_COPY &&
!IsCitusTableType(distributedTableId, REFERENCE_TABLE))
{
/*
* When copying a shard to a new node, we should first ensure that reference
* tables are present such that joins work immediately after copying the shard.
* When copying a reference table, we are probably trying to achieve just that.
*
* Since this is a long-running operation, we do this after the error checks, but
* before taking metadata locks.
*/
EnsureReferenceTablesExistOnAllNodesExtended(shardReplicationMode);
}
DropOrphanedResourcesInSeparateTransaction();
ShardInterval *colocatedShard = NULL;
@ -466,18 +502,21 @@ citus_move_shard_placement_internal(int64 shardId, char *sourceNodeName,
ErrorIfCleanupRecordForShardExists(qualifiedShardName);
}
/*
* CopyColocatedShardPlacement function copies given shard with its co-located
* shards.
*/
CopyShardTables(colocatedShardList, sourceNodeName, sourceNodePort, targetNodeName,
targetNodePort, useLogicalReplication, "citus_move_shard_placement");
targetNodePort, useLogicalReplication, operationFunctionName);
if (transferType == SHARD_TRANSFER_MOVE)
{
/* delete old shards metadata and mark the shards as to be deferred drop */
int32 sourceGroupId = GroupForNode(sourceNodeName, sourceNodePort);
InsertCleanupRecordsForShardPlacementsOnNode(colocatedShardList,
sourceGroupId);
}
/*
* Finally insert the placements to pg_dist_placement and sync it to the
* metadata workers.
*/
colocatedShard = NULL;
foreach_ptr(colocatedShard, colocatedShardList)
{
@ -488,17 +527,30 @@ citus_move_shard_placement_internal(int64 shardId, char *sourceNodeName,
InsertShardPlacementRow(colocatedShardId, placementId,
ShardLength(colocatedShardId),
groupId);
if (transferType == SHARD_TRANSFER_COPY &&
ShouldSyncTableMetadata(colocatedShard->relationId))
{
char *placementCommand = PlacementUpsertCommand(colocatedShardId, placementId,
0, groupId);
SendCommandToWorkersWithMetadata(placementCommand);
}
}
if (transferType == SHARD_TRANSFER_MOVE)
{
/*
* Since this is move operation, we remove the placements from the metadata
* for the source node after copy.
*/
DropShardPlacementsFromMetadata(colocatedShardList, sourceNodeName, sourceNodePort);
DropShardPlacementsFromMetadata(colocatedShardList,
sourceNodeName, sourceNodePort);
UpdateColocatedShardPlacementMetadataOnWorkers(shardId, sourceNodeName,
sourceNodePort, targetNodeName,
targetNodePort);
}
UpdatePlacementUpdateStatusForShardIntervalList(
colocatedShardList,
@ -611,6 +663,70 @@ IsShardListOnNode(List *colocatedShardList, char *targetNodeName, uint32 targetN
}
/*
* LockColocatedRelationsForMove takes a list of relations and locks all of
* them using ShareUpdateExclusiveLock.
*/
static void
LockColocatedRelationsForMove(List *colocatedTableList)
{
Oid colocatedTableId = InvalidOid;
foreach_oid(colocatedTableId, colocatedTableList)
{
LockRelationOid(colocatedTableId, ShareUpdateExclusiveLock);
}
}
/*
* ErrorIfForeignTableForShardTransfer takes a list of relations and errors
* out if there's a foreign table in the list.
*/
static void
ErrorIfForeignTableForShardTransfer(List *colocatedTableList,
ShardTransferType transferType)
{
Oid colocatedTableId = InvalidOid;
foreach_oid(colocatedTableId, colocatedTableList)
{
if (IsForeignTable(colocatedTableId))
{
char *relationName = get_rel_name(colocatedTableId);
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot %s shard",
ShardTransferTypeNames[transferType]),
errdetail("Table %s is a foreign table. "
"%s shards backed by foreign tables is "
"not supported.", relationName,
ShardTransferTypeNamesContinuous[transferType])));
}
}
}
/*
* EnsureAllShardsCanBeCopied is a wrapper around EnsureShardCanBeCopied.
*/
static void
EnsureAllShardsCanBeCopied(List *colocatedShardList,
char *sourceNodeName, uint32 sourceNodePort,
char *targetNodeName, uint32 targetNodePort)
{
ShardInterval *colocatedShard = NULL;
foreach_ptr(colocatedShard, colocatedShardList)
{
uint64 colocatedShardId = colocatedShard->shardId;
/*
* To transfer a shard, there should be a healthy placement on the source
* node and no placement on the target node.
*/
EnsureShardCanBeCopied(colocatedShardId, sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort);
}
}
/*
* EnsureEnoughDiskSpaceForShardMove checks that there is enough space for
* shard moves of the given colocated shard list from source node to target node.
@ -619,9 +735,10 @@ IsShardListOnNode(List *colocatedShardList, char *targetNodeName, uint32 targetN
static void
EnsureEnoughDiskSpaceForShardMove(List *colocatedShardList,
char *sourceNodeName, uint32 sourceNodePort,
char *targetNodeName, uint32 targetNodePort)
char *targetNodeName, uint32 targetNodePort,
ShardTransferType transferType)
{
if (!CheckAvailableSpaceBeforeMove)
if (!CheckAvailableSpaceBeforeMove || transferType != SHARD_TRANSFER_MOVE)
{
return;
}
@ -636,6 +753,34 @@ EnsureEnoughDiskSpaceForShardMove(List *colocatedShardList,
}
/*
* TransferAlreadyCompleted returns true if the given shard transfer is already done.
* Returns false otherwise.
*/
static bool
TransferAlreadyCompleted(List *colocatedShardList,
char *sourceNodeName, uint32 sourceNodePort,
char *targetNodeName, uint32 targetNodePort,
ShardTransferType transferType)
{
if (transferType == SHARD_TRANSFER_MOVE &&
IsShardListOnNode(colocatedShardList, targetNodeName, targetNodePort) &&
!IsShardListOnNode(colocatedShardList, sourceNodeName, sourceNodePort))
{
return true;
}
if (transferType == SHARD_TRANSFER_COPY &&
IsShardListOnNode(colocatedShardList, targetNodeName, targetNodePort) &&
IsShardListOnNode(colocatedShardList, sourceNodeName, sourceNodePort))
{
return true;
}
return false;
}
/*
* ShardListSizeInBytes returns the size in bytes of a set of shard tables.
*/
@ -682,6 +827,49 @@ ShardListSizeInBytes(List *shardList, char *workerNodeName, uint32
}
/*
* SetupRebalanceMonitorForShardTransfer prepares the parameters and
* calls SetupRebalanceMonitor, unless the current transfer is a move
* initiated by the rebalancer.
* See the comments on SetupRebalanceMonitor.
*/
static void
SetupRebalanceMonitorForShardTransfer(uint64 shardId, Oid distributedTableId,
char *sourceNodeName, uint32 sourceNodePort,
char *targetNodeName, uint32 targetNodePort,
ShardTransferType transferType)
{
if (transferType == SHARD_TRANSFER_MOVE && IsRebalancerInternalBackend())
{
/*
* We want to be able to track progress of shard moves using
* get_rebalancer_progress. If this move is initiated by the rebalancer,
* then the rebalancer call has already set up the shared memory that is
* used to do that, so we should return here.
* But if citus_move_shard_placement is called directly by the user
* (or through any other mechanism), then the shared memory is not
* set up yet. In that case we do it here.
*/
return;
}
WorkerNode *sourceNode = FindWorkerNode(sourceNodeName, sourceNodePort);
WorkerNode *targetNode = FindWorkerNode(targetNodeName, targetNodePort);
PlacementUpdateEvent *placementUpdateEvent = palloc0(
sizeof(PlacementUpdateEvent));
placementUpdateEvent->updateType =
transferType == SHARD_TRANSFER_COPY ? PLACEMENT_UPDATE_COPY :
PLACEMENT_UPDATE_MOVE;
placementUpdateEvent->shardId = shardId;
placementUpdateEvent->sourceNode = sourceNode;
placementUpdateEvent->targetNode = targetNode;
SetupRebalanceMonitor(list_make1(placementUpdateEvent), distributedTableId,
REBALANCE_PROGRESS_MOVING,
PLACEMENT_UPDATE_STATUS_SETTING_UP);
}
/*
* CheckSpaceConstraints checks there is enough space to place the colocation
* on the node that the connection is connected to.
@ -729,17 +917,19 @@ CheckSpaceConstraints(MultiConnection *connection, uint64 colocationSizeInBytes)
/*
* ErrorIfTargetNodeIsNotSafeToMove throws error if the target node is not
* eligible for moving shards.
* ErrorIfTargetNodeIsNotSafeForTransfer throws error if the target node is not
* eligible for shard transfers.
*/
void
ErrorIfTargetNodeIsNotSafeToMove(const char *targetNodeName, int targetNodePort)
static void
ErrorIfTargetNodeIsNotSafeForTransfer(const char *targetNodeName, int targetNodePort,
ShardTransferType transferType)
{
WorkerNode *workerNode = FindWorkerNode(targetNodeName, targetNodePort);
if (workerNode == NULL)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Moving shards to a non-existing node is not supported"),
errmsg("%s shards to a non-existing node is not supported",
ShardTransferTypeNamesContinuous[transferType]),
errhint(
"Add the target node via SELECT citus_add_node('%s', %d);",
targetNodeName, targetNodePort)));
@ -748,13 +938,14 @@ ErrorIfTargetNodeIsNotSafeToMove(const char *targetNodeName, int targetNodePort)
if (!workerNode->isActive)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Moving shards to a non-active node is not supported"),
errmsg("%s shards to a non-active node is not supported",
ShardTransferTypeNamesContinuous[transferType]),
errhint(
"Activate the target node via SELECT citus_activate_node('%s', %d);",
targetNodeName, targetNodePort)));
}
if (!workerNode->shouldHaveShards)
if (transferType == SHARD_TRANSFER_MOVE && !workerNode->shouldHaveShards)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Moving shards to a node that shouldn't have a shard is "
@ -767,8 +958,9 @@ ErrorIfTargetNodeIsNotSafeToMove(const char *targetNodeName, int targetNodePort)
if (!NodeIsPrimary(workerNode))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Moving shards to a secondary (e.g., replica) node is "
"not supported")));
errmsg("%s shards to a secondary (e.g., replica) node is "
"not supported",
ShardTransferTypeNamesContinuous[transferType])));
}
}
@ -1046,41 +1238,6 @@ ErrorIfTableCannotBeReplicated(Oid relationId)
}
/*
* ErrorIfTargetNodeIsNotSafeToCopyTo throws an error if the target node is not
* eligible for copying shards.
*/
static void
ErrorIfTargetNodeIsNotSafeToCopyTo(const char *targetNodeName, int targetNodePort)
{
WorkerNode *workerNode = FindWorkerNode(targetNodeName, targetNodePort);
if (workerNode == NULL)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Copying shards to a non-existing node is not supported"),
errhint(
"Add the target node via SELECT citus_add_node('%s', %d);",
targetNodeName, targetNodePort)));
}
if (!workerNode->isActive)
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Copying shards to a non-active node is not supported"),
errhint(
"Activate the target node via SELECT citus_activate_node('%s', %d);",
targetNodeName, targetNodePort)));
}
if (!NodeIsPrimary(workerNode))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Copying shards to a secondary (e.g., replica) node is "
"not supported")));
}
}
/*
* LookupShardTransferMode maps the oids of citus.shard_transfer_mode enum
* values to a char.
@ -1114,154 +1271,6 @@ LookupShardTransferMode(Oid shardReplicationModeOid)
}
/*
* ReplicateColocatedShardPlacement replicates the given shard and its
* colocated shards from a source node to target node.
*/
static void
ReplicateColocatedShardPlacement(int64 shardId, char *sourceNodeName,
int32 sourceNodePort, char *targetNodeName,
int32 targetNodePort, char shardReplicationMode)
{
ShardInterval *shardInterval = LoadShardInterval(shardId);
Oid distributedTableId = shardInterval->relationId;
ErrorIfSameNode(sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort,
"copy");
ErrorIfTableCannotBeReplicated(shardInterval->relationId);
ErrorIfTargetNodeIsNotSafeToCopyTo(targetNodeName, targetNodePort);
EnsureNoModificationsHaveBeenDone();
AcquirePlacementColocationLock(shardInterval->relationId, ExclusiveLock, "copy");
List *colocatedTableList = ColocatedTableList(distributedTableId);
List *colocatedShardList = ColocatedShardIntervalList(shardInterval);
EnsureTableListOwner(colocatedTableList);
EnsureTableListSuitableForReplication(colocatedTableList);
/*
* We sort shardIntervalList so that lock operations will not cause any
* deadlocks.
*/
colocatedShardList = SortList(colocatedShardList, CompareShardIntervalsById);
/*
* If there are active placements on both nodes, we assume the copy to already
* be done.
*/
if (IsShardListOnNode(colocatedShardList, targetNodeName, targetNodePort) &&
IsShardListOnNode(colocatedShardList, sourceNodeName, sourceNodePort))
{
ereport(WARNING, (errmsg("shard is already present on node %s:%d",
targetNodeName, targetNodePort),
errdetail("Copy may have already completed.")));
return;
}
WorkerNode *sourceNode = FindWorkerNode(sourceNodeName, sourceNodePort);
WorkerNode *targetNode = FindWorkerNode(targetNodeName, targetNodePort);
Oid relationId = RelationIdForShard(shardId);
PlacementUpdateEvent *placementUpdateEvent = palloc0(
sizeof(PlacementUpdateEvent));
placementUpdateEvent->updateType = PLACEMENT_UPDATE_COPY;
placementUpdateEvent->shardId = shardId;
placementUpdateEvent->sourceNode = sourceNode;
placementUpdateEvent->targetNode = targetNode;
SetupRebalanceMonitor(list_make1(placementUpdateEvent), relationId,
REBALANCE_PROGRESS_MOVING,
PLACEMENT_UPDATE_STATUS_SETTING_UP);
UpdatePlacementUpdateStatusForShardIntervalList(
colocatedShardList,
sourceNodeName,
sourceNodePort,
PLACEMENT_UPDATE_STATUS_SETTING_UP);
/*
* At this point of the shard replication, we don't need to block the writes to
* shards when logical replication is used.
*/
bool useLogicalReplication = CanUseLogicalReplication(distributedTableId,
shardReplicationMode);
if (!useLogicalReplication)
{
BlockWritesToShardList(colocatedShardList);
}
ShardInterval *colocatedShard = NULL;
foreach_ptr(colocatedShard, colocatedShardList)
{
uint64 colocatedShardId = colocatedShard->shardId;
/*
* For a shard copy, there should be a healthy placement on the source
* node and no placement on the target node.
*/
EnsureShardCanBeCopied(colocatedShardId, sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort);
}
if (shardReplicationMode == TRANSFER_MODE_AUTOMATIC)
{
VerifyTablesHaveReplicaIdentity(colocatedTableList);
}
if (!IsCitusTableType(distributedTableId, REFERENCE_TABLE))
{
/*
* When copying a shard to a new node, we should first ensure that reference
* tables are present such that joins work immediately after copying the shard.
* When copying a reference table, we are probably trying to achieve just that.
*
* Since this is a long-running operation, we do this after the error checks, but
* before taking metadata locks.
*/
EnsureReferenceTablesExistOnAllNodesExtended(shardReplicationMode);
}
DropOrphanedResourcesInSeparateTransaction();
CopyShardTables(colocatedShardList, sourceNodeName, sourceNodePort,
targetNodeName, targetNodePort, useLogicalReplication,
"citus_copy_shard_placement");
/*
* Finally insert the placements to pg_dist_placement and sync it to the
* metadata workers.
*/
foreach_ptr(colocatedShard, colocatedShardList)
{
uint64 colocatedShardId = colocatedShard->shardId;
uint32 groupId = GroupForNode(targetNodeName, targetNodePort);
uint64 placementId = GetNextPlacementId();
InsertShardPlacementRow(colocatedShardId, placementId,
ShardLength(colocatedShardId),
groupId);
if (ShouldSyncTableMetadata(colocatedShard->relationId))
{
char *placementCommand = PlacementUpsertCommand(colocatedShardId, placementId,
0, groupId);
SendCommandToWorkersWithMetadata(placementCommand);
}
}
UpdatePlacementUpdateStatusForShardIntervalList(
colocatedShardList,
sourceNodeName,
sourceNodePort,
PLACEMENT_UPDATE_STATUS_COMPLETED);
FinalizeCurrentProgressMonitor();
}
/*
* EnsureTableListOwner ensures current user owns given tables. Superusers
* are regarded as owners.
@ -1278,25 +1287,15 @@ EnsureTableListOwner(List *tableIdList)
/*
* EnsureTableListSuitableForReplication errors out if given tables are not
* ErrorIfReplicatingDistributedTableWithFKeys errors out if given tables are not
* suitable for replication.
*/
static void
EnsureTableListSuitableForReplication(List *tableIdList)
ErrorIfReplicatingDistributedTableWithFKeys(List *tableIdList)
{
Oid tableId = InvalidOid;
foreach_oid(tableId, tableIdList)
{
if (IsForeignTable(tableId))
{
char *relationName = get_rel_name(tableId);
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot replicate shard"),
errdetail("Table %s is a foreign table. Replicating "
"shards backed by foreign tables is "
"not supported.", relationName)));
}
List *foreignConstraintCommandList =
GetReferencingForeignConstaintCommands(tableId);
@ -1318,7 +1317,7 @@ EnsureTableListSuitableForReplication(List *tableIdList)
static void
CopyShardTables(List *shardIntervalList, char *sourceNodeName, int32 sourceNodePort,
char *targetNodeName, int32 targetNodePort, bool useLogicalReplication,
char *operationName)
const char *operationName)
{
if (list_length(shardIntervalList) < 1)
{

View File

@ -53,8 +53,14 @@ worker_copy_table_to_node(PG_FUNCTION_ARGS)
targetNodeId);
StringInfo selectShardQueryForCopy = makeStringInfo();
/*
* Even though we COPY(SELECT ...) all the columns, we can't just do SELECT *, because we must not COPY generated columns.
*/
const char *columnList = CopyableColumnNamesFromRelationName(relationSchemaName,
relationName);
appendStringInfo(selectShardQueryForCopy,
"SELECT * FROM %s;", relationQualifiedName);
"SELECT %s FROM %s;", columnList, relationQualifiedName);
ParamListInfo params = NULL;
ExecuteQueryStringIntoDestReceiver(selectShardQueryForCopy->data, params,

View File

@ -24,6 +24,7 @@
#include "distributed/relation_utils.h"
#include "distributed/version_compat.h"
#include "distributed/local_executor.h"
#include "distributed/replication_origin_session_utils.h"
/*
* LocalCopyBuffer is used in copy callback to return the copied rows.
@ -73,13 +74,14 @@ static void ShardCopyDestReceiverDestroy(DestReceiver *destReceiver);
static bool CanUseLocalCopy(uint32_t destinationNodeId);
static StringInfo ConstructShardCopyStatement(List *destinationShardFullyQualifiedName,
bool
useBinaryFormat);
useBinaryFormat, TupleDesc tupleDesc);
static void WriteLocalTuple(TupleTableSlot *slot, ShardCopyDestReceiver *copyDest);
static int ReadFromLocalBufferCallback(void *outBuf, int minRead, int maxRead);
static void LocalCopyToShard(ShardCopyDestReceiver *copyDest, CopyOutState
localCopyOutState);
static void ConnectToRemoteAndStartCopy(ShardCopyDestReceiver *copyDest);
static bool
CanUseLocalCopy(uint32_t destinationNodeId)
{
@ -103,9 +105,16 @@ ConnectToRemoteAndStartCopy(ShardCopyDestReceiver *copyDest)
NULL /* database (current) */);
ClaimConnectionExclusively(copyDest->connection);
RemoteTransactionBeginIfNecessary(copyDest->connection);
SetupReplicationOriginRemoteSession(copyDest->connection);
StringInfo copyStatement = ConstructShardCopyStatement(
copyDest->destinationShardFullyQualifiedName,
copyDest->copyOutState->binary);
copyDest->copyOutState->binary,
copyDest->tupleDescriptor);
if (!SendRemoteCommand(copyDest->connection, copyStatement->data))
{
@ -184,6 +193,8 @@ ShardCopyDestReceiverReceive(TupleTableSlot *slot, DestReceiver *dest)
CopyOutState copyOutState = copyDest->copyOutState;
if (copyDest->useLocalCopy)
{
/* Setup replication origin session for local copy */
WriteLocalTuple(slot, copyDest);
if (copyOutState->fe_msgbuf->len > LocalCopyFlushThresholdByte)
{
@ -259,6 +270,11 @@ ShardCopyDestReceiverStartup(DestReceiver *dest, int operation, TupleDesc
copyDest->columnOutputFunctions = ColumnOutputFunctions(inputTupleDescriptor,
copyOutState->binary);
copyDest->copyOutState = copyOutState;
if (copyDest->useLocalCopy)
{
/* Setup replication origin session for local copy */
SetupReplicationOriginLocalSession();
}
}
@ -317,6 +333,9 @@ ShardCopyDestReceiverShutdown(DestReceiver *dest)
PQclear(result);
ForgetResults(copyDest->connection);
ResetReplicationOriginRemoteSession(copyDest->connection);
CloseConnection(copyDest->connection);
}
}
@ -329,6 +348,10 @@ static void
ShardCopyDestReceiverDestroy(DestReceiver *dest)
{
ShardCopyDestReceiver *copyDest = (ShardCopyDestReceiver *) dest;
if (copyDest->useLocalCopy)
{
ResetReplicationOriginLocalSession();
}
if (copyDest->copyOutState)
{
@ -344,21 +367,80 @@ ShardCopyDestReceiverDestroy(DestReceiver *dest)
}
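As a reading aid, the replication-origin session calls added across the hunks above pair up as follows; this is a summary of the patch, not code from it:
/*
* Replication origin session lifecycle in ShardCopyDestReceiver:
*
*   local copy:  ShardCopyDestReceiverStartup()  -> SetupReplicationOriginLocalSession()
*                ShardCopyDestReceiverDestroy()  -> ResetReplicationOriginLocalSession()
*
*   remote copy: ConnectToRemoteAndStartCopy()   -> SetupReplicationOriginRemoteSession(connection)
*                ShardCopyDestReceiverShutdown() -> ResetReplicationOriginRemoteSession(connection),
*                                                   followed by CloseConnection(connection)
*/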
/*
* CopyableColumnNamesFromTupleDesc creates and returns a comma-separated string of column names to be used
* in the COPY and SELECT statements when copying a table. Those statements must filter out GENERATED columns,
* since COPY fails to handle them. While iterating over the table's attributes, we also skip dropped columns.
*/
const char *
CopyableColumnNamesFromTupleDesc(TupleDesc tupDesc)
{
StringInfo columnList = makeStringInfo();
bool firstInList = true;
for (int i = 0; i < tupDesc->natts; i++)
{
Form_pg_attribute att = TupleDescAttr(tupDesc, i);
if (att->attgenerated || att->attisdropped)
{
continue;
}
if (!firstInList)
{
appendStringInfo(columnList, ",");
}
firstInList = false;
appendStringInfo(columnList, "%s", quote_identifier(NameStr(att->attname)));
}
return columnList->data;
}
/*
* CopyableColumnNamesFromRelationName is a wrapper around CopyableColumnNamesFromTupleDesc that looks up the relation by schema and table name.
*/
const char *
CopyableColumnNamesFromRelationName(const char *schemaName, const char *relationName)
{
Oid namespaceOid = get_namespace_oid(schemaName, true);
Oid relationId = get_relname_relid(relationName, namespaceOid);
Relation relation = relation_open(relationId, AccessShareLock);
TupleDesc tupleDesc = RelationGetDescr(relation);
const char *columnList = CopyableColumnNamesFromTupleDesc(tupleDesc);
relation_close(relation, NoLock);
return columnList;
}
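The column-list construction boils down to a skip-and-join loop. Below is a minimal standalone sketch of that pattern in plain C, with mocked attributes instead of Postgres' TupleDesc and with quote_identifier() left out; it assumes nothing beyond the C standard library:
#include <stdio.h>
#include <string.h>

typedef struct { const char *name; int isGenerated; int isDropped; } MockAttr;

static void
BuildCopyableColumnList(char *out, size_t outSize, const MockAttr *attrs, int natts)
{
	int firstInList = 1;

	out[0] = '\0';
	for (int i = 0; i < natts; i++)
	{
		if (attrs[i].isGenerated || attrs[i].isDropped)
		{
			continue; /* COPY cannot accept values for these columns */
		}
		if (!firstInList)
		{
			strncat(out, ",", outSize - strlen(out) - 1);
		}
		firstInList = 0;
		strncat(out, attrs[i].name, outSize - strlen(out) - 1);
	}
}

int
main(void)
{
	MockAttr attrs[] = { { "a", 0, 0 }, { "b", 1, 0 }, { "c", 0, 0 } };
	char columnList[64];

	BuildCopyableColumnList(columnList, sizeof(columnList), attrs, 3);
	printf("SELECT %s FROM shard;\n", columnList); /* prints: SELECT a,c FROM shard; */
	return 0;
}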
/*
* ConstructShardCopyStatement constructs the text of a COPY statement
* for copying into a result table
*/
static StringInfo
ConstructShardCopyStatement(List *destinationShardFullyQualifiedName, bool
useBinaryFormat)
useBinaryFormat,
TupleDesc tupleDesc)
{
char *destinationShardSchemaName = linitial(destinationShardFullyQualifiedName);
char *destinationShardRelationName = lsecond(destinationShardFullyQualifiedName);
StringInfo command = makeStringInfo();
appendStringInfo(command, "COPY %s.%s FROM STDIN",
const char *columnList = CopyableColumnNamesFromTupleDesc(tupleDesc);
appendStringInfo(command, "COPY %s.%s (%s) FROM STDIN",
quote_identifier(destinationShardSchemaName), quote_identifier(
destinationShardRelationName));
destinationShardRelationName), columnList);
if (useBinaryFormat)
{

View File

@ -110,8 +110,13 @@ worker_split_copy(PG_FUNCTION_ARGS)
splitCopyInfoList))));
StringInfo selectShardQueryForCopy = makeStringInfo();
const char *columnList = CopyableColumnNamesFromRelationName(
sourceShardToCopySchemaName,
sourceShardToCopyName);
appendStringInfo(selectShardQueryForCopy,
"SELECT * FROM %s;", sourceShardToCopyQualifiedName);
"SELECT %s FROM %s;", columnList,
sourceShardToCopyQualifiedName);
ParamListInfo params = NULL;
ExecuteQueryStringIntoDestReceiver(selectShardQueryForCopy->data, params,

View File

@ -34,6 +34,7 @@
#include "distributed/intermediate_results.h"
#include "distributed/listutils.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/merge_planner.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/distributed_planner.h"
@ -68,6 +69,17 @@
#include "utils/syscache.h"
/* RouterPlanType is used to determine the router plan to invoke */
typedef enum RouterPlanType
{
INSERT_SELECT_INTO_CITUS_TABLE,
INSERT_SELECT_INTO_LOCAL_TABLE,
DML_QUERY,
SELECT_QUERY,
MERGE_QUERY,
REPLAN_WITH_BOUND_PARAMETERS
} RouterPlanType;
static List *plannerRestrictionContextList = NIL;
int MultiTaskQueryLogLevel = CITUS_LOG_LEVEL_OFF; /* multi-task query log level */
static uint64 NextPlanId = 1;
@ -75,12 +87,8 @@ static uint64 NextPlanId = 1;
/* keep track of planner call stack levels */
int PlannerLevel = 0;
static void ErrorIfQueryHasUnsupportedMergeCommand(Query *queryTree,
List *rangeTableList);
static bool ContainsMergeCommandWalker(Node *node);
static bool ListContainsDistributedTableRTE(List *rangeTableList,
bool *maybeHasForeignDistributedTable);
static bool IsUpdateOrDelete(Query *query);
static PlannedStmt * CreateDistributedPlannedStmt(
DistributedPlanningContext *planContext);
static PlannedStmt * InlineCtesAndCreateDistributedPlannedStmt(uint64 planId,
@ -132,7 +140,10 @@ static PlannedStmt * PlanDistributedStmt(DistributedPlanningContext *planContext
static RTEListProperties * GetRTEListProperties(List *rangeTableList);
static List * TranslatedVars(PlannerInfo *root, int relationIndex);
static void WarnIfListHasForeignDistributedTable(List *rangeTableList);
static void ErrorIfMergeHasUnsupportedTables(Query *parse, List *rangeTableList);
static RouterPlanType GetRouterPlanType(Query *query,
Query *originalQuery,
bool hasUnresolvedParams);
/* Distributed planner hook */
PlannedStmt *
@ -156,7 +167,7 @@ distributed_planner(Query *parse,
* We cannot have a MERGE command on this path either, because
* there cannot be a recursively planned MERGE command.
*/
Assert(!ContainsMergeCommandWalker((Node *) parse));
Assert(!IsMergeQuery(parse));
needsDistributedPlanning = true;
}
@ -200,12 +211,6 @@ distributed_planner(Query *parse,
if (!fastPathRouterQuery)
{
/*
* Fast path queries cannot have a MERGE command, and we
* prevent the remaining cases here.
*/
ErrorIfQueryHasUnsupportedMergeCommand(parse, rangeTableList);
/*
* When there are partitioned tables (not applicable to fast path),
* pretend that they are regular tables to avoid unnecessary work
@ -304,72 +309,6 @@ distributed_planner(Query *parse,
}
/*
* ErrorIfQueryHasUnsupportedMergeCommand walks over the query tree and returns
* early if there is no MERGE command (i.e., CMD_MERGE) in it. For MERGE, it
* checks all supported combinations and throws an exception if any violation
* is seen.
*/
static void
ErrorIfQueryHasUnsupportedMergeCommand(Query *queryTree, List *rangeTableList)
{
/*
* Postgres currently doesn't support Merge queries inside subqueries and
* CTEs, but let's be defensive and do the query tree walk anyway.
*
* We do not call this path for fast-path queries to avoid this additional
* overhead.
*/
if (!ContainsMergeCommandWalker((Node *) queryTree))
{
/* No MERGE found */
return;
}
/*
* In Citus we have limited support for MERGE; it's allowed
* only if all the tables (target, source, or any CTE) are
* local, i.e., a combination of Citus local and non-Citus
* tables (regular Postgres tables).
*/
ErrorIfMergeHasUnsupportedTables(queryTree, rangeTableList);
}
/*
* ContainsMergeCommandWalker walks over the node and determines whether it
* contains any MERGE command (i.e., CMD_MERGE).
*/
static bool
ContainsMergeCommandWalker(Node *node)
{
#if PG_VERSION_NUM < PG_VERSION_15
return false;
#endif
if (node == NULL)
{
return false;
}
if (IsA(node, Query))
{
Query *query = (Query *) node;
if (IsMergeQuery(query))
{
return true;
}
return query_tree_walker((Query *) node, ContainsMergeCommandWalker, NULL, 0);
}
return expression_tree_walker(node, ContainsMergeCommandWalker, NULL);
return false;
}
/*
* ExtractRangeTableEntryList is a wrapper around ExtractRangeTableEntryWalker.
* The function traverses the input query and returns all the range table
@ -669,17 +608,6 @@ IsMultiTaskPlan(DistributedPlan *distributedPlan)
}
/*
* IsUpdateOrDelete returns true if the query performs an update or delete.
*/
bool
IsUpdateOrDelete(Query *query)
{
return query->commandType == CMD_UPDATE ||
query->commandType == CMD_DELETE;
}
/*
* PlanFastPathDistributedStmt creates a distributed planned statement using
* the FastPathPlanner.
@ -850,7 +778,7 @@ CreateDistributedPlannedStmt(DistributedPlanningContext *planContext)
* if it is planned as a multi shard modify query.
*/
if ((distributedPlan->planningError ||
(IsUpdateOrDelete(planContext->originalQuery) && IsMultiTaskPlan(
(UpdateOrDeleteOrMergeQuery(planContext->originalQuery) && IsMultiTaskPlan(
distributedPlan))) &&
hasUnresolvedParams)
{
@ -955,6 +883,51 @@ TryCreateDistributedPlannedStmt(PlannedStmt *localPlan,
}
/*
* GetRouterPlanType checks the parse tree to return appropriate plan type.
*/
static RouterPlanType
GetRouterPlanType(Query *query, Query *originalQuery, bool hasUnresolvedParams)
{
if (!IsModifyCommand(originalQuery))
{
return SELECT_QUERY;
}
Oid targetRelationId = ModifyQueryResultRelationId(query);
EnsureModificationsCanRunOnRelation(targetRelationId);
EnsurePartitionTableNotReplicated(targetRelationId);
/* Check the type of modification being done */
if (InsertSelectIntoCitusTable(originalQuery))
{
if (hasUnresolvedParams)
{
return REPLAN_WITH_BOUND_PARAMETERS;
}
return INSERT_SELECT_INTO_CITUS_TABLE;
}
else if (InsertSelectIntoLocalTable(originalQuery))
{
if (hasUnresolvedParams)
{
return REPLAN_WITH_BOUND_PARAMETERS;
}
return INSERT_SELECT_INTO_LOCAL_TABLE;
}
else if (IsMergeQuery(originalQuery))
{
return MERGE_QUERY;
}
else
{
return DML_QUERY;
}
}
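As a rough reading aid, the classification maps query shapes to plan types as follows; the table names dist, dist2, and local are assumptions for the example:
/*
* Illustrative mapping, with dist/dist2 distributed tables and local a plain
* Postgres table:
*
*   SELECT ... FROM dist;                    -> SELECT_QUERY
*   UPDATE dist ...; DELETE FROM dist ...;   -> DML_QUERY
*   INSERT INTO dist SELECT ... FROM dist2;  -> INSERT_SELECT_INTO_CITUS_TABLE
*   INSERT INTO local SELECT ... FROM dist;  -> INSERT_SELECT_INTO_LOCAL_TABLE
*   MERGE INTO ... USING ... ON ...;         -> MERGE_QUERY
*
* with both INSERT ... SELECT cases degrading to REPLAN_WITH_BOUND_PARAMETERS
* when the statement still carries unresolved parameters.
*/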
/*
* CreateDistributedPlan generates a distributed plan for a query.
* It goes through 3 steps:
@ -972,51 +945,71 @@ CreateDistributedPlan(uint64 planId, bool allowRecursivePlanning, Query *origina
DistributedPlan *distributedPlan = NULL;
bool hasCtes = originalQuery->cteList != NIL;
if (IsModifyCommand(originalQuery))
/* Step 1: Try router planner */
RouterPlanType routerPlan = GetRouterPlanType(query, originalQuery,
hasUnresolvedParams);
switch (routerPlan)
{
Oid targetRelationId = ModifyQueryResultRelationId(query);
EnsureModificationsCanRunOnRelation(targetRelationId);
EnsurePartitionTableNotReplicated(targetRelationId);
if (InsertSelectIntoCitusTable(originalQuery))
case INSERT_SELECT_INTO_CITUS_TABLE:
{
if (hasUnresolvedParams)
{
/*
* Unresolved parameters can cause performance regressions in
* INSERT...SELECT when the partition column is a parameter
* because we don't perform any additional pruning in the executor.
*/
return NULL;
}
distributedPlan =
CreateInsertSelectPlan(planId, originalQuery, plannerRestrictionContext,
CreateInsertSelectPlan(planId,
originalQuery,
plannerRestrictionContext,
boundParams);
break;
}
else if (InsertSelectIntoLocalTable(originalQuery))
case INSERT_SELECT_INTO_LOCAL_TABLE:
{
if (hasUnresolvedParams)
{
/*
* Unresolved parameters can cause performance regressions in
* INSERT...SELECT when the partition column is a parameter
* because we don't perform any additional pruning in the executor.
*/
return NULL;
}
distributedPlan =
CreateInsertSelectIntoLocalTablePlan(planId, originalQuery, boundParams,
CreateInsertSelectIntoLocalTablePlan(planId,
originalQuery,
boundParams,
hasUnresolvedParams,
plannerRestrictionContext);
break;
}
else
case DML_QUERY:
{
/* modifications are always routed through the same planner/executor */
distributedPlan =
CreateModifyPlan(originalQuery, query, plannerRestrictionContext);
break;
}
case MERGE_QUERY:
{
distributedPlan =
CreateMergePlan(originalQuery, query, plannerRestrictionContext);
break;
}
case REPLAN_WITH_BOUND_PARAMETERS:
{
/*
* Unresolved parameters can cause performance regressions in
* INSERT...SELECT when the partition column is a parameter
* because we don't perform any additional pruning in the executor.
*/
return NULL;
}
case SELECT_QUERY:
{
/*
* For SELECT queries, if the router executor is enabled, we first try
* to plan the query as a router query. If that is not supported, we
* fall back to the full-blown plan/optimize/physical planning process
* needed to produce distributed query plans.
*/
distributedPlan =
CreateRouterPlan(originalQuery, query, plannerRestrictionContext);
break;
}
}
/* the functions above always return a plan, possibly with an error */
@ -1030,31 +1023,6 @@ CreateDistributedPlan(uint64 planId, bool allowRecursivePlanning, Query *origina
{
RaiseDeferredError(distributedPlan->planningError, DEBUG2);
}
}
else
{
/*
* For SELECT queries, if the router executor is enabled, we first try
* to plan the query as a router query. If that is not supported, we
* fall back to the full-blown plan/optimize/physical planning process
* needed to produce distributed query plans.
*/
distributedPlan = CreateRouterPlan(originalQuery, query,
plannerRestrictionContext);
if (distributedPlan->planningError == NULL)
{
return distributedPlan;
}
else
{
/*
* For debugging, it's useful to display why the query was not
* router plannable.
*/
RaiseDeferredError(distributedPlan->planningError, DEBUG2);
}
}
if (hasUnresolvedParams)
{
@ -1082,6 +1050,8 @@ CreateDistributedPlan(uint64 planId, bool allowRecursivePlanning, Query *origina
boundParams);
Assert(originalQuery != NULL);
/* Step 2: Generate subplans for CTEs and complex subqueries */
/*
* Plan subqueries and CTEs that cannot be pushed down by recursively
* calling the planner and return the resulting plans to subPlanList.
@ -1182,6 +1152,8 @@ CreateDistributedPlan(uint64 planId, bool allowRecursivePlanning, Query *origina
query->cteList = NIL;
Assert(originalQuery->cteList == NIL);
/* Step 3: Try Logical planner */
MultiTreeRoot *logicalPlan = MultiLogicalPlanCreate(originalQuery, query,
plannerRestrictionContext);
MultiLogicalPlanOptimize(logicalPlan);
@ -2611,148 +2583,3 @@ WarnIfListHasForeignDistributedTable(List *rangeTableList)
}
}
}
/*
* IsMergeAllowedOnRelation takes a relation entry and checks whether the MERGE
* command is permitted on special relations such as materialized views; it
* returns true only if the relation is a "source" relation.
*/
bool
IsMergeAllowedOnRelation(Query *parse, RangeTblEntry *rte)
{
if (!IsMergeQuery(parse))
{
return false;
}
RangeTblEntry *targetRte = rt_fetch(parse->resultRelation, parse->rtable);
/* Is it a target relation? */
if (targetRte->relid == rte->relid)
{
return false;
}
return true;
}
/*
* ErrorIfMergeHasUnsupportedTables checks that all the tables (target, source, or any CTE
* present) in the MERGE command are local, i.e., a combination of Citus local and non-Citus
* tables (regular Postgres tables); it raises an exception for all other combinations.
*/
static void
ErrorIfMergeHasUnsupportedTables(Query *parse, List *rangeTableList)
{
ListCell *tableCell = NULL;
foreach(tableCell, rangeTableList)
{
RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(tableCell);
Oid relationId = rangeTableEntry->relid;
switch (rangeTableEntry->rtekind)
{
case RTE_RELATION:
{
/* Check the relation type */
break;
}
case RTE_SUBQUERY:
case RTE_FUNCTION:
case RTE_TABLEFUNC:
case RTE_VALUES:
case RTE_JOIN:
case RTE_CTE:
{
/* Skip them as base table(s) will be checked */
continue;
}
/*
* RTE_NAMEDTUPLESTORE is typically used in ephemeral named relations,
* such as trigger data; until we find a genuine use case, raise an
* exception.
* RTE_RESULT is a node added by the planner and we shouldn't
* encounter it in the parse tree.
*/
case RTE_NAMEDTUPLESTORE:
case RTE_RESULT:
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("MERGE command is not supported with "
"Tuplestores and results")));
break;
}
default:
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("MERGE command: Unrecognized range table entry.")));
}
}
/* RTE Relation can be of various types, check them now */
/* skip the regular views as they are replaced with subqueries */
if (rangeTableEntry->relkind == RELKIND_VIEW)
{
continue;
}
if (rangeTableEntry->relkind == RELKIND_MATVIEW ||
rangeTableEntry->relkind == RELKIND_FOREIGN_TABLE)
{
/* Materialized view or Foreign table as target is not allowed */
if (IsMergeAllowedOnRelation(parse, rangeTableEntry))
{
/* Non target relation is ok */
continue;
}
else
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("MERGE command is not allowed "
"on materialized view")));
}
}
if (rangeTableEntry->relkind != RELKIND_RELATION &&
rangeTableEntry->relkind != RELKIND_PARTITIONED_TABLE)
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Unexpected relation type(relkind:%c) in MERGE command",
rangeTableEntry->relkind)));
}
Assert(rangeTableEntry->relid != 0);
/* Distributed tables and Reference tables are not supported yet */
if (IsCitusTableType(relationId, REFERENCE_TABLE) ||
IsCitusTableType(relationId, DISTRIBUTED_TABLE))
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("MERGE command is not supported on "
"distributed/reference tables yet")));
}
/* Regular Postgres tables and Citus local tables are allowed */
if (!IsCitusTable(relationId) ||
IsCitusTableType(relationId, CITUS_LOCAL_TABLE))
{
continue;
}
/* Any other Citus table type missing ? */
}
/* All the tables are local, supported */
}

View File

@ -54,10 +54,11 @@
bool EnableFastPathRouterPlanner = true;
static bool ColumnAppearsMultipleTimes(Node *quals, Var *distributionKey);
static bool ConjunctionContainsColumnFilter(Node *node, Var *column,
Node **distributionKeyValue);
static bool DistKeyInSimpleOpExpression(Expr *clause, Var *distColumn,
Node **distributionKeyValue);
static bool ConjunctionContainsColumnFilter(Node *node,
Var *column,
Node **distributionKeyValue);
/*

View File

@ -875,7 +875,7 @@ RouterModifyTaskForShardInterval(Query *originalQuery,
&prunedShardIntervalListList,
replacePrunedQueryWithDummy,
&multiShardModifyQuery, NULL,
false);
NULL);
Assert(!multiShardModifyQuery);
@ -938,6 +938,7 @@ RouterModifyTaskForShardInterval(Query *originalQuery,
modifyTask->taskPlacementList = insertShardPlacementList;
modifyTask->relationShardList = relationShardList;
modifyTask->replicationModel = targetTableCacheEntry->replicationModel;
modifyTask->isLocalTableModification = false;
return modifyTask;
}

View File

@ -0,0 +1,738 @@
/*-------------------------------------------------------------------------
*
* merge_planner.c
*
* This file contains functions to help plan MERGE queries.
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include <stddef.h>
#include "postgres.h"
#include "nodes/makefuncs.h"
#include "optimizer/optimizer.h"
#include "parser/parsetree.h"
#include "utils/lsyscache.h"
#include "distributed/citus_clauses.h"
#include "distributed/listutils.h"
#include "distributed/merge_planner.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_router_planner.h"
#include "distributed/pg_version_constants.h"
#include "distributed/query_pushdown_planning.h"
#if PG_VERSION_NUM >= PG_VERSION_15
static DeferredErrorMessage * CheckIfRTETypeIsUnsupported(Query *parse,
RangeTblEntry *rangeTableEntry);
static DeferredErrorMessage * ErrorIfDistTablesNotColocated(Query *parse,
List *
distTablesList,
PlannerRestrictionContext
*
plannerRestrictionContext);
static DeferredErrorMessage * ErrorIfMergeHasUnsupportedTables(Query *parse,
List *rangeTableList,
PlannerRestrictionContext *
restrictionContext);
static bool IsDistributionColumnInMergeSource(Expr *columnExpression, Query *query, bool
skipOuterVars);
static DeferredErrorMessage * InsertDistributionColumnMatchesSource(Query *query,
RangeTblEntry *
resultRte);
static DeferredErrorMessage * MergeQualAndTargetListFunctionsSupported(Oid
resultRelationId,
FromExpr *joinTree,
Node *quals,
List *targetList,
CmdType commandType);
#endif
/*
* CreateMergePlan attempts to create a plan for the given MERGE SQL
* statement. If planning fails, ->planningError is set to a description
* of the failure.
*/
DistributedPlan *
CreateMergePlan(Query *originalQuery, Query *query,
PlannerRestrictionContext *plannerRestrictionContext)
{
DistributedPlan *distributedPlan = CitusMakeNode(DistributedPlan);
bool multiShardQuery = false;
Assert(originalQuery->commandType == CMD_MERGE);
distributedPlan->modLevel = RowModifyLevelForQuery(query);
distributedPlan->planningError = MergeQuerySupported(originalQuery,
multiShardQuery,
plannerRestrictionContext);
if (distributedPlan->planningError != NULL)
{
return distributedPlan;
}
Job *job = RouterJob(originalQuery, plannerRestrictionContext,
&distributedPlan->planningError);
if (distributedPlan->planningError != NULL)
{
return distributedPlan;
}
ereport(DEBUG1, (errmsg("Creating MERGE router plan")));
distributedPlan->workerJob = job;
distributedPlan->combineQuery = NULL;
/* MERGE doesn't support RETURNING clause */
distributedPlan->expectResults = false;
distributedPlan->targetRelationId = ResultRelationOidForQuery(query);
distributedPlan->fastPathRouterPlan =
plannerRestrictionContext->fastPathRestrictionContext->fastPathRouterQuery;
return distributedPlan;
}
/*
* MergeQuerySupported checks for a MERGE command in the query; if it finds
* one, it verifies the criteria below:
* - Supported tables and combinations in ErrorIfMergeHasUnsupportedTables
* - Distributed table requirements in ErrorIfDistTablesNotColocated
* - Target lists and functions-in-quals in TargetlistAndFunctionsSupported
*/
DeferredErrorMessage *
MergeQuerySupported(Query *originalQuery, bool multiShardQuery,
PlannerRestrictionContext *plannerRestrictionContext)
{
/* this function is a no-op (returns NULL) for pre-15 versions of Postgres */
#if PG_VERSION_NUM < PG_VERSION_15
return NULL;
#else
/*
* TODO: For now, we are adding an exception where volatile or stable
* functions are not allowed in the MERGE query, but this will become too
* restrictive, as it prevents many useful and simple cases, such as
* INSERT VALUES(ts::timestamp), bigserial column inserts, etc. Without
* this restriction, however, there is a danger of some function(s)
* getting executed on the worker, which would result in incorrect behavior.
*/
if (contain_mutable_functions((Node *) originalQuery))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"non-IMMUTABLE functions are not yet supported "
"in MERGE sql with distributed tables ",
NULL, NULL);
}
List *rangeTableList = ExtractRangeTableEntryList(originalQuery);
RangeTblEntry *resultRte = ExtractResultRelationRTE(originalQuery);
/*
* Fast path queries cannot have a MERGE command; we prevent the remaining cases here.
* In Citus we have limited support for MERGE: it's allowed only if all
* the tables (target, source, or any CTE) are local, i.e., a
* combination of Citus local and non-Citus tables (regular Postgres tables),
* or distributed tables with some restrictions; see the header of routine
* ErrorIfDistTablesNotColocated for details.
*/
DeferredErrorMessage *deferredError =
ErrorIfMergeHasUnsupportedTables(originalQuery,
rangeTableList,
plannerRestrictionContext);
if (deferredError)
{
/* MERGE's unsupported combination, raise the exception */
RaiseDeferredError(deferredError, ERROR);
}
Oid resultRelationId = resultRte->relid;
deferredError = MergeQualAndTargetListFunctionsSupported(resultRelationId,
originalQuery->jointree,
originalQuery->jointree->
quals,
originalQuery->targetList,
originalQuery->commandType);
if (deferredError)
{
return deferredError;
}
/*
* MERGE is a special case where we have multiple modify statements
* within itself. Check each INSERT/UPDATE/DELETE individually.
*/
MergeAction *action = NULL;
foreach_ptr(action, originalQuery->mergeActionList)
{
Assert(originalQuery->returningList == NULL);
deferredError = MergeQualAndTargetListFunctionsSupported(resultRelationId,
originalQuery->jointree,
action->qual,
action->targetList,
action->commandType);
if (deferredError)
{
/* MERGE's unsupported scenario, raise the exception */
RaiseDeferredError(deferredError, ERROR);
}
}
deferredError =
InsertDistributionColumnMatchesSource(originalQuery, resultRte);
if (deferredError)
{
/* MERGE's unsupported scenario, raise the exception */
RaiseDeferredError(deferredError, ERROR);
}
if (multiShardQuery)
{
deferredError =
DeferErrorIfUnsupportedSubqueryPushdown(originalQuery,
plannerRestrictionContext);
if (deferredError)
{
return deferredError;
}
}
if (HasDangerousJoinUsing(originalQuery->rtable, (Node *) originalQuery->jointree))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"a join with USING causes an internal naming "
"conflict, use ON instead", NULL, NULL);
}
return NULL;
#endif
}
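A few hedged examples of what these checks accept and reject; the table names are assumptions, with t and s hash-distributed and colocated, and l a regular Postgres table:
/*
*   MERGE INTO t USING s ON (t.id = s.id) ...     -- allowed
*   MERGE INTO t USING l ON (t.id = l.id) ...     -- rejected: mixing
*       distributed and local tables is not supported yet
*   MERGE INTO t USING s ON (t.id = s.id)
*     WHEN MATCHED THEN UPDATE SET v = random();  -- rejected:
*       non-IMMUTABLE functions are not yet supported
*/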
/*
* IsMergeAllowedOnRelation takes a relation entry and checks whether the MERGE
* command is permitted on special relations such as materialized views; it
* returns true only if the relation is a "source" relation.
*/
bool
IsMergeAllowedOnRelation(Query *parse, RangeTblEntry *rte)
{
if (!IsMergeQuery(parse))
{
return false;
}
/* Fetch the MERGE target relation */
RangeTblEntry *targetRte = rt_fetch(parse->resultRelation, parse->rtable);
/* Is it a target relation? */
if (targetRte->relid == rte->relid)
{
return false;
}
return true;
}
#if PG_VERSION_NUM >= PG_VERSION_15
/*
* ErrorIfDistTablesNotColocated checks that:
*
* - There are a minimum of two distributed tables (a source and a target).
* - All the distributed tables are indeed colocated.
*
* If any of the conditions is not met, it returns a deferred error.
*/
static DeferredErrorMessage *
ErrorIfDistTablesNotColocated(Query *parse, List *distTablesList,
PlannerRestrictionContext *
plannerRestrictionContext)
{
/* All MERGE tables must be distributed */
if (list_length(distTablesList) < 2)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"For MERGE command, both the source and target "
"must be distributed", NULL, NULL);
}
/* All distributed tables must be colocated */
if (!AllDistributedRelationsInRTEListColocated(distTablesList))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"For MERGE command, all the distributed tables "
"must be colocated", NULL, NULL);
}
return NULL;
}
/*
* CheckIfRTETypeIsUnsupported checks for table types that are not supported, such
* as reference tables, append-distributed tables, and materialized views as the target relation.
* The routine returns NULL for supported types and an error message for everything else.
*/
static DeferredErrorMessage *
CheckIfRTETypeIsUnsupported(Query *parse, RangeTblEntry *rangeTableEntry)
{
if (rangeTableEntry->relkind == RELKIND_MATVIEW ||
rangeTableEntry->relkind == RELKIND_FOREIGN_TABLE)
{
/* Materialized view or Foreign table as target is not allowed */
if (IsMergeAllowedOnRelation(parse, rangeTableEntry))
{
/* Non target relation is ok */
return NULL;
}
else
{
/* Usually we don't reach this exception as the Postgres parser catches it */
StringInfo errorMessage = makeStringInfo();
appendStringInfo(errorMessage, "MERGE command is not allowed on "
"relation type(relkind:%c)",
rangeTableEntry->relkind);
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
errorMessage->data, NULL, NULL);
}
}
if (rangeTableEntry->relkind != RELKIND_RELATION &&
rangeTableEntry->relkind != RELKIND_PARTITIONED_TABLE)
{
StringInfo errorMessage = makeStringInfo();
appendStringInfo(errorMessage, "Unexpected table type(relkind:%c) "
"in MERGE command", rangeTableEntry->relkind);
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
errorMessage->data, NULL, NULL);
}
Assert(rangeTableEntry->relid != 0);
/* Reference tables are not supported yet */
if (IsCitusTableType(rangeTableEntry->relid, REFERENCE_TABLE))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"MERGE command is not supported on reference "
"tables yet", NULL, NULL);
}
/* Append/Range tables are not supported */
if (IsCitusTableType(rangeTableEntry->relid, APPEND_DISTRIBUTED) ||
IsCitusTableType(rangeTableEntry->relid, RANGE_DISTRIBUTED))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"For MERGE command, all the distributed tables "
"must be colocated, for append/range distribution, "
"colocation is not supported", NULL,
"Consider using hash distribution instead");
}
return NULL;
}
/*
* ErrorIfMergeHasUnsupportedTables checks that all the tables (target, source, or any CTE
* present) in the MERGE command are local, i.e., a combination of Citus local and non-Citus
* tables (regular Postgres tables), or distributed tables with some restrictions (see the
* header of routine ErrorIfDistTablesNotColocated for details); it returns an error
* for all other combinations.
*/
static DeferredErrorMessage *
ErrorIfMergeHasUnsupportedTables(Query *parse, List *rangeTableList,
PlannerRestrictionContext *restrictionContext)
{
List *distTablesList = NIL;
bool foundLocalTables = false;
RangeTblEntry *rangeTableEntry = NULL;
foreach_ptr(rangeTableEntry, rangeTableList)
{
Oid relationId = rangeTableEntry->relid;
switch (rangeTableEntry->rtekind)
{
case RTE_RELATION:
{
/* Check the relation type */
break;
}
case RTE_SUBQUERY:
case RTE_FUNCTION:
case RTE_TABLEFUNC:
case RTE_VALUES:
case RTE_JOIN:
case RTE_CTE:
{
/* Skip them as base table(s) will be checked */
continue;
}
/*
* RTE_NAMEDTUPLESTORE is typically used in ephemeral named relations,
* such as trigger data; until we find a genuine use case, raise an
* exception.
* RTE_RESULT is a node added by the planner and we shouldn't
* encounter it in the parse tree.
*/
case RTE_NAMEDTUPLESTORE:
case RTE_RESULT:
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"MERGE command is not supported with "
"Tuplestores and results",
NULL, NULL);
}
default:
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"MERGE command: Unrecognized range table entry.",
NULL, NULL);
}
}
/* RTE Relation can be of various types, check them now */
/* skip the regular views as they are replaced with subqueries */
if (rangeTableEntry->relkind == RELKIND_VIEW)
{
continue;
}
DeferredErrorMessage *errorMessage =
CheckIfRTETypeIsUnsupported(parse, rangeTableEntry);
if (errorMessage)
{
return errorMessage;
}
/*
* For now, save all distributed tables, later (below) we will
* check for supported combination(s).
*/
if (IsCitusTableType(relationId, DISTRIBUTED_TABLE))
{
distTablesList = lappend(distTablesList, rangeTableEntry);
continue;
}
/* Regular Postgres tables and Citus local tables are allowed */
if (!IsCitusTable(relationId) ||
IsCitusTableType(relationId, CITUS_LOCAL_TABLE))
{
foundLocalTables = true;
continue;
}
/* Any other Citus table type missing ? */
}
/* Ensure all tables are indeed local */
if (foundLocalTables && list_length(distTablesList) == 0)
{
/* All the tables are local, supported */
return NULL;
}
else if (foundLocalTables && list_length(distTablesList) > 0)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"MERGE command is not supported with "
"combination of distributed/local tables yet",
NULL, NULL);
}
/* Ensure all distributed tables are indeed co-located */
return ErrorIfDistTablesNotColocated(parse,
distTablesList,
restrictionContext);
}
/*
* IsDistributionColumnInMergeSource returns true if the given column is a distribution column.
* The function uses FindReferencedTableColumn to find the original relation
* id and column that the column expression refers to. It then checks whether
* that column is a distribution column of the relation.
*
* Also, the function always returns false for reference tables, given that
* reference tables do not have a distribution column.
*
* If skipOuterVars is true, it doesn't process the outer vars.
*/
bool
IsDistributionColumnInMergeSource(Expr *columnExpression, Query *query, bool
skipOuterVars)
{
bool isDistributionColumn = false;
Var *column = NULL;
RangeTblEntry *relationRTE = NULL;
/* ParentQueryList is the same as the original query for MERGE */
FindReferencedTableColumn(columnExpression, list_make1(query), query, &column,
&relationRTE,
skipOuterVars);
Oid relationId = relationRTE ? relationRTE->relid : InvalidOid;
if (relationId != InvalidOid && column != NULL)
{
Var *distributionColumn = DistPartitionKey(relationId);
/* not all distributed tables have partition column */
if (distributionColumn != NULL && column->varattno ==
distributionColumn->varattno)
{
isDistributionColumn = true;
}
}
return isDistributionColumn;
}
/*
* InsertDistributionColumnMatchesSource checks whether MERGE inserts into the
* target's distribution column a value that does not come from the source
* table; if so, it returns a deferred error.
* Note: inserting arbitrary values other than the joined column values would
* result in rows ending up in incorrect shards; to prevent such mishaps, we
* disallow such inserts here.
*/
static DeferredErrorMessage *
InsertDistributionColumnMatchesSource(Query *query, RangeTblEntry *resultRte)
{
Assert(IsMergeQuery(query));
if (!IsCitusTableType(resultRte->relid, DISTRIBUTED_TABLE))
{
return NULL;
}
bool foundDistributionColumn = false;
MergeAction *action = NULL;
foreach_ptr(action, query->mergeActionList)
{
/* Skip the MATCHED clause as INSERTs are not allowed in it */
if (action->matched)
{
continue;
}
/* NOT MATCHED can have either INSERT or DO NOTHING */
if (action->commandType == CMD_NOTHING)
{
return NULL;
}
if (action->targetList == NIL)
{
/* INSERT DEFAULT VALUES is not allowed */
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"cannot perform MERGE INSERT with DEFAULTS",
NULL, NULL);
}
Assert(action->commandType == CMD_INSERT);
Var *targetKey = PartitionColumn(resultRte->relid, 1);
TargetEntry *targetEntry = NULL;
foreach_ptr(targetEntry, action->targetList)
{
AttrNumber originalAttrNo = targetEntry->resno;
/* skip processing of target table non-partition columns */
if (originalAttrNo != targetKey->varattno)
{
continue;
}
foundDistributionColumn = true;
if (IsA(targetEntry->expr, Var))
{
if (IsDistributionColumnInMergeSource(targetEntry->expr, query, true))
{
return NULL;
}
else
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"MERGE INSERT must use the source table "
"distribution column value",
NULL, NULL);
}
}
else
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"MERGE INSERT must refer a source column "
"for distribution column ",
NULL, NULL);
}
}
if (!foundDistributionColumn)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"MERGE INSERT must have distribution column as value",
NULL, NULL);
}
}
return NULL;
}
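Hedged examples of the WHEN NOT MATCHED cases checked above; the names are assumptions, with target t distributed by id and source s colocated with it and also distributed by id:
/*
*   ... WHEN NOT MATCHED THEN INSERT (id, v) VALUES (s.id, s.v)
*       -- allowed: id comes from the source's distribution column
*   ... WHEN NOT MATCHED THEN INSERT (id, v) VALUES (s.v, s.v)
*       -- rejected: must use the source table distribution column value
*   ... WHEN NOT MATCHED THEN INSERT (id, v) VALUES (42, s.v)
*       -- rejected: must refer a source column for the distribution column
*   ... WHEN NOT MATCHED THEN INSERT (v) VALUES (s.v)
*       -- rejected: must have the distribution column as a value
*/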
/*
* MergeQualAndTargetListFunctionsSupported checks the WHEN/ON clause actions to see which
* functions are allowed, whether we are updating the distribution column, etc.
*/
static DeferredErrorMessage *
MergeQualAndTargetListFunctionsSupported(Oid resultRelationId, FromExpr *joinTree,
Node *quals,
List *targetList, CmdType commandType)
{
uint32 rangeTableId = 1;
Var *distributionColumn = NULL;
if (IsCitusTable(resultRelationId) && HasDistributionKey(resultRelationId))
{
distributionColumn = PartitionColumn(resultRelationId, rangeTableId);
}
ListCell *targetEntryCell = NULL;
bool hasVarArgument = false; /* A STABLE function is passed a Var argument */
bool hasBadCoalesce = false; /* CASE/COALESCE passed a mutable function */
foreach(targetEntryCell, targetList)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell);
bool targetEntryDistributionColumn = false;
AttrNumber targetColumnAttrNumber = InvalidAttrNumber;
if (distributionColumn)
{
if (commandType == CMD_UPDATE)
{
/*
* Note that it is not possible to give an alias to
* UPDATE table SET ...
*/
if (targetEntry->resname)
{
targetColumnAttrNumber = get_attnum(resultRelationId,
targetEntry->resname);
if (targetColumnAttrNumber == distributionColumn->varattno)
{
targetEntryDistributionColumn = true;
}
}
}
}
if (targetEntryDistributionColumn &&
TargetEntryChangesValue(targetEntry, distributionColumn, joinTree))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"updating the distribution column is not "
"allowed in MERGE actions",
NULL, NULL);
}
if (FindNodeMatchingCheckFunction((Node *) targetEntry->expr,
CitusIsVolatileFunction))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"functions used in MERGE actions on distributed "
"tables must not be VOLATILE",
NULL, NULL);
}
if (MasterIrreducibleExpression((Node *) targetEntry->expr,
&hasVarArgument, &hasBadCoalesce))
{
Assert(hasVarArgument || hasBadCoalesce);
}
if (FindNodeMatchingCheckFunction((Node *) targetEntry->expr,
NodeIsFieldStore))
{
/* DELETE cannot do field indirection already */
Assert(commandType == CMD_UPDATE || commandType == CMD_INSERT);
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"inserting or modifying composite type fields is not "
"supported", NULL,
"Use the column name to insert or update the composite "
"type as a single value");
}
}
/*
* Check the condition; convert a list of expressions into an expression tree for further processing
*/
if (quals)
{
if (IsA(quals, List))
{
quals = (Node *) make_ands_explicit((List *) quals);
}
if (FindNodeMatchingCheckFunction((Node *) quals, CitusIsVolatileFunction))
{
StringInfo errorMessage = makeStringInfo();
appendStringInfo(errorMessage, "functions used in the %s clause of MERGE "
"queries on distributed tables must not be VOLATILE",
(commandType == CMD_MERGE) ? "ON" : "WHEN");
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
errorMessage->data, NULL, NULL);
}
else if (MasterIrreducibleExpression(quals, &hasVarArgument, &hasBadCoalesce))
{
Assert(hasVarArgument || hasBadCoalesce);
}
}
if (hasVarArgument)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"STABLE functions used in MERGE queries "
"cannot be called with column references",
NULL, NULL);
}
if (hasBadCoalesce)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"non-IMMUTABLE functions are not allowed in CASE or "
"COALESCE statements",
NULL, NULL);
}
if (quals != NULL && nodeTag(quals) == T_CurrentOfExpr)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"cannot run MERGE actions with cursors",
NULL, NULL);
}
return NULL;
}
#endif

View File

@ -29,6 +29,7 @@
#include "distributed/citus_nodefuncs.h"
#include "distributed/connection_management.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/executor_util.h"
#include "distributed/insert_select_planner.h"
#include "distributed/insert_select_executor.h"
#include "distributed/listutils.h"
@ -199,20 +200,6 @@ CitusExplainScan(CustomScanState *node, List *ancestors, struct ExplainState *es
return;
}
/*
* ALTER TABLE statements are not explained by Postgres. However, ALTER TABLE statements
* may trigger SELECT statements that cause the explain hook to run, which crashes a worker.
* Therefore we detect whether we are explaining a triggered query while processing
* an ALTER TABLE statement, and stop the explain in that situation.
*/
if (AlterTableInProgress())
{
ExplainPropertyText("Citus Explain Scan",
"Explain for triggered constraint validation queries during ALTER TABLE commands are not supported by Citus",
es);
return;
}
ExplainOpenGroup("Distributed Query", "Distributed Query", true, es);
/*

View File

@ -1383,7 +1383,7 @@ DistPartitionKey(Oid relationId)
CitusTableCacheEntry *partitionEntry = GetCitusTableCacheEntry(relationId);
/* non-distributed tables do not have partition column */
if (IsCitusTableTypeCacheEntry(partitionEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKeyCacheEntry(partitionEntry))
{
return NULL;
}

View File

@ -3385,6 +3385,13 @@ GetAggregateType(Aggref *aggregateExpression)
{
Oid aggFunctionId = aggregateExpression->aggfnoid;
/* custom aggregates with combine func take precedence over name-based logic */
if (aggFunctionId >= FirstNormalObjectId &&
AggregateEnabledCustom(aggregateExpression))
{
return AGGREGATE_CUSTOM_COMBINE;
}
/* look up the function name */
char *aggregateProcName = get_func_name(aggFunctionId);
if (aggregateProcName == NULL)
@ -3395,8 +3402,6 @@ GetAggregateType(Aggref *aggregateExpression)
uint32 aggregateCount = lengthof(AggregateNames);
Assert(AGGREGATE_INVALID_FIRST == 0);
for (uint32 aggregateIndex = 1; aggregateIndex < aggregateCount; aggregateIndex++)
{
const char *aggregateName = AggregateNames[aggregateIndex];
@ -3465,7 +3470,7 @@ GetAggregateType(Aggref *aggregateExpression)
}
}
/* handle any remaining built-in aggregates with a suitable combinefn */
if (AggregateEnabledCustom(aggregateExpression))
{
return AGGREGATE_CUSTOM_COMBINE;

View File

@ -228,7 +228,7 @@ TargetListOnPartitionColumn(Query *query, List *targetEntryList)
* If the expression belongs to a non-distributed table continue searching for
* other partition keys.
*/
if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY))
if (IsCitusTable(relationId) && !HasDistributionKey(relationId))
{
continue;
}

View File

@ -2199,7 +2199,7 @@ QueryPushdownSqlTaskList(Query *query, uint64 jobId,
Oid relationId = relationRestriction->relationId;
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId);
if (IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKeyCacheEntry(cacheEntry))
{
continue;
}
@ -2377,7 +2377,7 @@ ErrorIfUnsupportedShardDistribution(Query *query)
nonReferenceRelations = lappend_oid(nonReferenceRelations,
relationId);
}
else if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY))
else if (IsCitusTable(relationId) && !HasDistributionKey(relationId))
{
/* do not need to handle non-distributed tables */
continue;
@ -2482,7 +2482,7 @@ QueryPushdownTaskCreate(Query *originalQuery, int shardIndex,
ShardInterval *shardInterval = NULL;
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId);
if (IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKeyCacheEntry(cacheEntry))
{
/* non-distributed tables have only one shard */
shardInterval = cacheEntry->sortedShardIntervalArray[0];
@ -3697,7 +3697,7 @@ PartitionedOnColumn(Var *column, List *rangeTableList, List *dependentJobList)
Var *partitionColumn = PartitionColumn(relationId, rangeTableId);
/* non-distributed tables do not have partition columns */
if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY))
if (IsCitusTable(relationId) && !HasDistributionKey(relationId))
{
return false;
}
@ -4573,7 +4573,8 @@ RowModifyLevelForQuery(Query *query)
}
if (commandType == CMD_UPDATE ||
commandType == CMD_DELETE)
commandType == CMD_DELETE ||
commandType == CMD_MERGE)
{
return ROW_MODIFY_NONCOMMUTATIVE;
}
@ -5343,8 +5344,7 @@ ActiveShardPlacementLists(List *taskList)
/*
* CompareShardPlacements compares two shard placements by their tuple oid; this
* oid reflects the tuple's insertion order into pg_dist_placement.
* CompareShardPlacements compares two shard placements by placement id.
*/
int
CompareShardPlacements(const void *leftElement, const void *rightElement)
@ -5370,6 +5370,35 @@ CompareShardPlacements(const void *leftElement, const void *rightElement)
}
/*
* CompareGroupShardPlacements compares two group shard placements by placement id.
*/
int
CompareGroupShardPlacements(const void *leftElement, const void *rightElement)
{
const GroupShardPlacement *leftPlacement =
*((const GroupShardPlacement **) leftElement);
const GroupShardPlacement *rightPlacement =
*((const GroupShardPlacement **) rightElement);
uint64 leftPlacementId = leftPlacement->placementId;
uint64 rightPlacementId = rightPlacement->placementId;
if (leftPlacementId < rightPlacementId)
{
return -1;
}
else if (leftPlacementId > rightPlacementId)
{
return 1;
}
else
{
return 0;
}
}
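A minimal usage sketch for the new comparator, with hypothetical variable names:
/*
* Given an array of GroupShardPlacement pointers, sort it deterministically
* by ascending placementId:
*
*   qsort(placementArray, placementCount, sizeof(GroupShardPlacement *),
*         CompareGroupShardPlacements);
*/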
/*
* LeftRotateList returns a copy of the given list that has been cyclically
* shifted to the left by the given rotation count. For this, the function

View File

@ -28,11 +28,13 @@
#include "distributed/deparse_shard_query.h"
#include "distributed/distribution_column.h"
#include "distributed/errormessage.h"
#include "distributed/executor_util.h"
#include "distributed/log_utils.h"
#include "distributed/insert_select_planner.h"
#include "distributed/intermediate_result_pruning.h"
#include "distributed/metadata_utility.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/merge_planner.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_join_order.h"
@ -113,6 +115,7 @@ typedef struct WalkerState
} WalkerState;
bool EnableRouterExecution = true;
bool EnableNonColocatedRouterQueryPushdown = false;
/* planner functions forward declarations */
@ -121,34 +124,24 @@ static void CreateSingleTaskRouterSelectPlan(DistributedPlan *distributedPlan,
Query *query,
PlannerRestrictionContext *
plannerRestrictionContext);
static Oid ResultRelationOidForQuery(Query *query);
static bool IsTidColumn(Node *node);
static DeferredErrorMessage * ModifyPartialQuerySupported(Query *queryTree, bool
multiShardQuery,
Oid *distributedTableId);
static bool NodeIsFieldStore(Node *node);
static DeferredErrorMessage * MultiShardUpdateDeleteMergeSupported(Query *originalQuery,
static DeferredErrorMessage * MultiShardUpdateDeleteSupported(Query *originalQuery,
PlannerRestrictionContext
*
plannerRestrictionContext);
static DeferredErrorMessage * SingleShardUpdateDeleteSupported(Query *originalQuery,
PlannerRestrictionContext *
plannerRestrictionContext);
static bool HasDangerousJoinUsing(List *rtableList, Node *jtnode);
static bool MasterIrreducibleExpression(Node *expression, bool *varArgument,
bool *badCoalesce);
static bool MasterIrreducibleExpressionWalker(Node *expression, WalkerState *state);
static bool MasterIrreducibleExpressionFunctionChecker(Oid func_id, void *context);
static bool TargetEntryChangesValue(TargetEntry *targetEntry, Var *column,
FromExpr *joinTree);
static Job * RouterInsertJob(Query *originalQuery);
static void ErrorIfNoShardsExist(CitusTableCacheEntry *cacheEntry);
static DeferredErrorMessage * DeferErrorIfModifyView(Query *queryTree);
static Job * CreateJob(Query *query);
static Task * CreateTask(TaskType taskType);
static Job * RouterJob(Query *originalQuery,
PlannerRestrictionContext *plannerRestrictionContext,
DeferredErrorMessage **planningError);
static bool RelationPrunesToMultipleShards(List *relationShardList);
static void NormalizeMultiRowInsertTargetList(Query *query);
static void AppendNextDummyColReference(Alias *expendedReferenceNames);
@ -445,7 +438,7 @@ ModifyQueryResultRelationId(Query *query)
* ResultRelationOidForQuery returns the OID of the relation that is modified
* by the given query.
*/
static Oid
Oid
ResultRelationOidForQuery(Query *query)
{
RangeTblEntry *resultRTE = rt_fetch(query->resultRelation, query->rtable);
@ -512,6 +505,161 @@ IsTidColumn(Node *node)
}
/*
* TargetlistAndFunctionsSupported implements a subset of what ModifyPartialQuerySupported
* checks: which functions are allowed, whether we are updating the
* distribution column, etc.
* Note: this subset of checks is repeated for each MERGE modify action.
*/
DeferredErrorMessage *
TargetlistAndFunctionsSupported(Oid resultRelationId, FromExpr *joinTree, Node *quals,
List *targetList,
CmdType commandType, List *returningList)
{
uint32 rangeTableId = 1;
Var *partitionColumn = NULL;
if (IsCitusTable(resultRelationId))
{
partitionColumn = PartitionColumn(resultRelationId, rangeTableId);
}
bool hasVarArgument = false; /* A STABLE function is passed a Var argument */
bool hasBadCoalesce = false; /* CASE/COALESCE passed a mutable function */
ListCell *targetEntryCell = NULL;
foreach(targetEntryCell, targetList)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell);
/* skip resjunk entries: UPDATE adds some for ctid, etc. */
if (targetEntry->resjunk)
{
continue;
}
bool targetEntryPartitionColumn = false;
AttrNumber targetColumnAttrNumber = InvalidAttrNumber;
/* reference tables do not have partition column */
if (partitionColumn == NULL)
{
targetEntryPartitionColumn = false;
}
else
{
if (commandType == CMD_UPDATE)
{
/*
* Note that it is not possible to give an alias to
* UPDATE table SET ...
*/
if (targetEntry->resname)
{
targetColumnAttrNumber = get_attnum(resultRelationId,
targetEntry->resname);
if (targetColumnAttrNumber == partitionColumn->varattno)
{
targetEntryPartitionColumn = true;
}
}
}
}
if (commandType == CMD_UPDATE &&
FindNodeMatchingCheckFunction((Node *) targetEntry->expr,
CitusIsVolatileFunction))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"functions used in UPDATE queries on distributed "
"tables must not be VOLATILE",
NULL, NULL);
}
if (commandType == CMD_UPDATE && targetEntryPartitionColumn &&
TargetEntryChangesValue(targetEntry, partitionColumn,
joinTree))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"modifying the partition value of rows is not "
"allowed",
NULL, NULL);
}
if (commandType == CMD_UPDATE &&
MasterIrreducibleExpression((Node *) targetEntry->expr,
&hasVarArgument, &hasBadCoalesce))
{
Assert(hasVarArgument || hasBadCoalesce);
}
if (FindNodeMatchingCheckFunction((Node *) targetEntry->expr,
NodeIsFieldStore))
{
/* DELETE cannot do field indirection already */
Assert(commandType == CMD_UPDATE || commandType == CMD_INSERT);
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"inserting or modifying composite type fields is not "
"supported", NULL,
"Use the column name to insert or update the composite "
"type as a single value");
}
}
if (joinTree != NULL)
{
if (FindNodeMatchingCheckFunction((Node *) quals,
CitusIsVolatileFunction))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"functions used in the WHERE/ON/WHEN clause of modification "
"queries on distributed tables must not be VOLATILE",
NULL, NULL);
}
else if (MasterIrreducibleExpression(quals, &hasVarArgument,
&hasBadCoalesce))
{
Assert(hasVarArgument || hasBadCoalesce);
}
}
if (hasVarArgument)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"STABLE functions used in UPDATE queries "
"cannot be called with column references",
NULL, NULL);
}
if (hasBadCoalesce)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"non-IMMUTABLE functions are not allowed in CASE or "
"COALESCE statements",
NULL, NULL);
}
if (contain_mutable_functions((Node *) returningList))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"non-IMMUTABLE functions are not allowed in the "
"RETURNING clause",
NULL, NULL);
}
if (quals != NULL &&
nodeTag(quals) == T_CurrentOfExpr)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"cannot run DML queries with cursors", NULL,
NULL);
}
return NULL;
}
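Hedged examples of statements the checks above defer on, for a hypothetical table dist distributed by dist_col:
/*
*   UPDATE dist SET dist_col = 5;                -- modifying the partition
*                                                   value of rows
*   UPDATE dist SET v = random();                -- VOLATILE function in an
*                                                   UPDATE target list
*   UPDATE dist SET v = 0 RETURNING now();       -- non-IMMUTABLE function in
*                                                   the RETURNING clause
*   UPDATE dist SET v = 0 WHERE CURRENT OF cur;  -- DML with cursors
*/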
/*
* ModifyPartialQuerySupported implements a subset of what ModifyQuerySupported checks,
* that subset being what is necessary for checking modifying CTEs.
@ -620,148 +768,21 @@ ModifyPartialQuerySupported(Query *queryTree, bool multiShardQuery,
Oid resultRelationId = ModifyQueryResultRelationId(queryTree);
*distributedTableIdOutput = resultRelationId;
uint32 rangeTableId = 1;
Var *partitionColumn = NULL;
if (IsCitusTable(resultRelationId))
{
partitionColumn = PartitionColumn(resultRelationId, rangeTableId);
}
commandType = queryTree->commandType;
if (commandType == CMD_INSERT || commandType == CMD_UPDATE ||
commandType == CMD_DELETE)
{
bool hasVarArgument = false; /* A STABLE function is passed a Var argument */
bool hasBadCoalesce = false; /* CASE/COALESCE passed a mutable function */
FromExpr *joinTree = queryTree->jointree;
ListCell *targetEntryCell = NULL;
foreach(targetEntryCell, queryTree->targetList)
deferredError =
TargetlistAndFunctionsSupported(resultRelationId,
queryTree->jointree,
queryTree->jointree->quals,
queryTree->targetList,
commandType,
queryTree->returningList);
if (deferredError)
{
TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell);
/* skip resjunk entries: UPDATE adds some for ctid, etc. */
if (targetEntry->resjunk)
{
continue;
}
bool targetEntryPartitionColumn = false;
AttrNumber targetColumnAttrNumber = InvalidAttrNumber;
/* reference tables do not have partition column */
if (partitionColumn == NULL)
{
targetEntryPartitionColumn = false;
}
else
{
if (commandType == CMD_UPDATE)
{
/*
* Note that it is not possible to give an alias to
* UPDATE table SET ...
*/
if (targetEntry->resname)
{
targetColumnAttrNumber = get_attnum(resultRelationId,
targetEntry->resname);
if (targetColumnAttrNumber == partitionColumn->varattno)
{
targetEntryPartitionColumn = true;
}
}
}
}
if (commandType == CMD_UPDATE &&
FindNodeMatchingCheckFunction((Node *) targetEntry->expr,
CitusIsVolatileFunction))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"functions used in UPDATE queries on distributed "
"tables must not be VOLATILE",
NULL, NULL);
}
if (commandType == CMD_UPDATE && targetEntryPartitionColumn &&
TargetEntryChangesValue(targetEntry, partitionColumn,
queryTree->jointree))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"modifying the partition value of rows is not "
"allowed",
NULL, NULL);
}
if (commandType == CMD_UPDATE &&
MasterIrreducibleExpression((Node *) targetEntry->expr,
&hasVarArgument, &hasBadCoalesce))
{
Assert(hasVarArgument || hasBadCoalesce);
}
if (FindNodeMatchingCheckFunction((Node *) targetEntry->expr,
NodeIsFieldStore))
{
/* DELETE cannot do field indirection already */
Assert(commandType == CMD_UPDATE || commandType == CMD_INSERT);
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"inserting or modifying composite type fields is not "
"supported", NULL,
"Use the column name to insert or update the composite "
"type as a single value");
}
}
if (joinTree != NULL)
{
if (FindNodeMatchingCheckFunction((Node *) joinTree->quals,
CitusIsVolatileFunction))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"functions used in the WHERE clause of modification "
"queries on distributed tables must not be VOLATILE",
NULL, NULL);
}
else if (MasterIrreducibleExpression(joinTree->quals, &hasVarArgument,
&hasBadCoalesce))
{
Assert(hasVarArgument || hasBadCoalesce);
}
}
if (hasVarArgument)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"STABLE functions used in UPDATE queries "
"cannot be called with column references",
NULL, NULL);
}
if (hasBadCoalesce)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"non-IMMUTABLE functions are not allowed in CASE or "
"COALESCE statements",
NULL, NULL);
}
if (contain_mutable_functions((Node *) queryTree->returningList))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"non-IMMUTABLE functions are not allowed in the "
"RETURNING clause",
NULL, NULL);
}
if (queryTree->jointree->quals != NULL &&
nodeTag(queryTree->jointree->quals) == T_CurrentOfExpr)
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"cannot run DML queries with cursors", NULL,
NULL);
return deferredError;
}
}
@ -866,7 +887,7 @@ IsLocallyAccessibleCitusLocalTable(Oid relationId)
/*
* NodeIsFieldStore returns true if given Node is a FieldStore object.
*/
static bool
bool
NodeIsFieldStore(Node *node)
{
return node && IsA(node, FieldStore);
@ -888,7 +909,9 @@ ModifyQuerySupported(Query *queryTree, Query *originalQuery, bool multiShardQuer
PlannerRestrictionContext *plannerRestrictionContext)
{
Oid distributedTableId = InvalidOid;
DeferredErrorMessage *error = ModifyPartialQuerySupported(queryTree, multiShardQuery,
DeferredErrorMessage *error =
ModifyPartialQuerySupported(queryTree, multiShardQuery,
&distributedTableId);
if (error)
{
@ -953,19 +976,12 @@ ModifyQuerySupported(Query *queryTree, Query *originalQuery, bool multiShardQuer
*/
}
else if (rangeTableEntry->relkind == RELKIND_MATVIEW)
{
if (IsMergeAllowedOnRelation(originalQuery, rangeTableEntry))
{
continue;
}
else
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"materialized views in "
"modify queries are not supported",
NULL, NULL);
}
}
/* for other kinds of relations, check if it's distributed */
else
{
@ -1065,7 +1081,7 @@ ModifyQuerySupported(Query *queryTree, Query *originalQuery, bool multiShardQuer
if (multiShardQuery)
{
errorMessage = MultiShardUpdateDeleteMergeSupported(
errorMessage = MultiShardUpdateDeleteSupported(
originalQuery,
plannerRestrictionContext);
}
@ -1246,11 +1262,11 @@ ErrorIfOnConflictNotSupported(Query *queryTree)
/*
* MultiShardUpdateDeleteMergeSupported returns the error message if the update/delete is
* MultiShardUpdateDeleteSupported returns the error message if the update/delete is
* not pushdownable, otherwise it returns NULL.
*/
static DeferredErrorMessage *
MultiShardUpdateDeleteMergeSupported(Query *originalQuery,
MultiShardUpdateDeleteSupported(Query *originalQuery,
PlannerRestrictionContext *plannerRestrictionContext)
{
DeferredErrorMessage *errorMessage = NULL;
@ -1282,7 +1298,8 @@ MultiShardUpdateDeleteMergeSupported(Query *originalQuery,
}
else
{
errorMessage = DeferErrorIfUnsupportedSubqueryPushdown(originalQuery,
errorMessage = DeferErrorIfUnsupportedSubqueryPushdown(
originalQuery,
plannerRestrictionContext);
}
@ -1323,7 +1340,7 @@ SingleShardUpdateDeleteSupported(Query *originalQuery,
* HasDangerousJoinUsing search jointree for unnamed JOIN USING. Check the
* implementation of has_dangerous_join_using in ruleutils.
*/
static bool
bool
HasDangerousJoinUsing(List *rtableList, Node *joinTreeNode)
{
if (IsA(joinTreeNode, RangeTblRef))
@ -1427,7 +1444,7 @@ IsMergeQuery(Query *query)
* which do, but for now we just error out. That makes both the code and user-education
* easier.
*/
static bool
bool
MasterIrreducibleExpression(Node *expression, bool *varArgument, bool *badCoalesce)
{
WalkerState data;
@ -1575,7 +1592,7 @@ MasterIrreducibleExpressionFunctionChecker(Oid func_id, void *context)
* expression is a value that is implied by the qualifiers of the join
* tree, or the target entry sets a different column.
*/
static bool
bool
TargetEntryChangesValue(TargetEntry *targetEntry, Var *column, FromExpr *joinTree)
{
bool isColumnValueChanged = true;
@ -1796,7 +1813,7 @@ ExtractFirstCitusTableId(Query *query)
* RouterJob builds a Job to represent a single shard select/update/delete and
* multiple shard update/delete queries.
*/
static Job *
Job *
RouterJob(Query *originalQuery, PlannerRestrictionContext *plannerRestrictionContext,
DeferredErrorMessage **planningError)
{
@ -1846,8 +1863,8 @@ RouterJob(Query *originalQuery, PlannerRestrictionContext *plannerRestrictionCon
if (*planningError)
{
/*
* For MERGE, we do _not_ plan anything other than Router job, let's
* not continue further down the lane in distributed planning, simply
* For MERGE, we do _not_ plan any other router job than the MERGE job itself,
* let's not continue further down the lane in distributed planning, simply
* bail out.
*/
if (IsMergeQuery(originalQuery))
@ -2320,9 +2337,20 @@ PlanRouterQuery(Query *originalQuery,
}
Assert(UpdateOrDeleteOrMergeQuery(originalQuery));
if (IsMergeQuery(originalQuery))
{
planningError = MergeQuerySupported(originalQuery,
isMultiShardQuery,
plannerRestrictionContext);
}
else
{
planningError = ModifyQuerySupported(originalQuery, originalQuery,
isMultiShardQuery,
plannerRestrictionContext);
}
if (planningError != NULL)
{
return planningError;
@ -2643,7 +2671,7 @@ TargetShardIntervalForFastPathQuery(Query *query, bool *isMultiShardQuery,
{
Oid relationId = ExtractFirstCitusTableId(query);
if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKey(relationId))
{
/* we don't need to do shard pruning for non-distributed tables */
return list_make1(LoadShardIntervalList(relationId));
@ -2936,7 +2964,7 @@ BuildRoutesForInsert(Query *query, DeferredErrorMessage **planningError)
Assert(query->commandType == CMD_INSERT);
/* reference tables and citus local tables can only have one shard */
if (IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKeyCacheEntry(cacheEntry))
{
List *shardIntervalList = LoadShardIntervalList(distributedTableId);
@ -3477,7 +3505,7 @@ ExtractInsertPartitionKeyValue(Query *query)
uint32 rangeTableId = 1;
Const *singlePartitionValueConst = NULL;
if (IsCitusTableType(distributedTableId, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKey(distributedTableId))
{
return NULL;
}
@ -3589,6 +3617,8 @@ DeferErrorIfUnsupportedRouterPlannableSelectQuery(Query *query)
bool hasDistributedTable = false;
bool hasReferenceTable = false;
List *distributedRelationList = NIL;
ExtractRangeTableRelationWalker((Node *) query, &rangeTableRelationList);
foreach(rangeTableRelationCell, rangeTableRelationList)
{
@ -3626,6 +3656,8 @@ DeferErrorIfUnsupportedRouterPlannableSelectQuery(Query *query)
if (IsCitusTableType(distributedTableId, DISTRIBUTED_TABLE))
{
hasDistributedTable = true;
distributedRelationList = lappend_oid(distributedRelationList,
distributedTableId);
}
/*
@ -3680,6 +3712,15 @@ DeferErrorIfUnsupportedRouterPlannableSelectQuery(Query *query)
NULL, NULL);
}
if (!EnableNonColocatedRouterQueryPushdown &&
!AllDistributedRelationsInListColocated(distributedRelationList))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"router planner does not support queries that "
"reference non-colocated distributed tables",
NULL, NULL);
}
#if PG_VERSION_NUM >= PG_VERSION_14
DeferredErrorMessage *CTEWithSearchClauseError =
ErrorIfQueryHasCTEWithSearchClause(query);
@ -3797,8 +3838,7 @@ ErrorIfQueryHasUnroutableModifyingCTE(Query *queryTree)
CitusTableCacheEntry *modificationTableCacheEntry =
GetCitusTableCacheEntry(distributedTableId);
if (IsCitusTableTypeCacheEntry(modificationTableCacheEntry,
CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKeyCacheEntry(modificationTableCacheEntry))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"cannot router plan modification of a non-distributed table",
@ -168,7 +168,7 @@ AnchorRte(Query *subquery)
{
Oid relationId = currentRte->relid;
if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY))
if (IsCitusTable(relationId) && !HasDistributionKey(relationId))
{
/*
* Non-distributed tables should not be the anchor rte since they
@ -591,10 +591,16 @@ DeferErrorIfUnsupportedSubqueryPushdown(Query *originalQuery,
}
else if (!RestrictionEquivalenceForPartitionKeys(plannerRestrictionContext))
{
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
"complex joins are only supported when all distributed tables are "
StringInfo errorMessage = makeStringInfo();
bool isMergeCmd = IsMergeQuery(originalQuery);
appendStringInfo(errorMessage,
"%s"
"only supported when all distributed tables are "
"co-located and joined on their distribution columns",
NULL, NULL);
isMergeCmd ? "MERGE command is " : "complex joins are ");
return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
errorMessage->data, NULL, NULL);
}
/* we shouldn't allow reference tables in the FROM clause when the query has sublinks */
@ -151,7 +151,8 @@ static void ListConcatUniqueAttributeClassMemberLists(AttributeEquivalenceClass
secondClass);
static Var * PartitionKeyForRTEIdentityInQuery(Query *query, int targetRTEIndex,
Index *partitionKeyIndex);
static bool AllRelationsInRestrictionContextColocated(RelationRestrictionContext *
static bool AllDistributedRelationsInRestrictionContextColocated(
RelationRestrictionContext *
restrictionContext);
static bool IsNotSafeRestrictionToRecursivelyPlan(Node *node);
static JoinRestrictionContext * FilterJoinRestrictionContext(
@ -383,7 +384,7 @@ SafeToPushdownUnionSubquery(Query *originalQuery,
return false;
}
if (!AllRelationsInRestrictionContextColocated(restrictionContext))
if (!AllDistributedRelationsInRestrictionContextColocated(restrictionContext))
{
/* distribution columns are equal, but tables are not co-located */
return false;
@ -703,8 +704,8 @@ EquivalenceListContainsRelationsEquality(List *attributeEquivalenceList,
int rteIdentity = GetRTEIdentity(relationRestriction->rte);
/* we shouldn't check for the equality of non-distributed tables */
if (IsCitusTableType(relationRestriction->relationId,
CITUS_TABLE_WITH_NO_DIST_KEY))
if (IsCitusTable(relationRestriction->relationId) &&
!HasDistributionKey(relationRestriction->relationId))
{
continue;
}
@ -1919,22 +1920,66 @@ FindQueryContainingRTEIdentityInternal(Node *node,
/*
* AllRelationsInRestrictionContextColocated determines whether all of the relations in the
* given relation restrictions list are co-located.
* AllDistributedRelationsInRestrictionContextColocated determines whether all of the
* distributed relations in the given relation restrictions list are co-located.
*/
static bool
AllRelationsInRestrictionContextColocated(RelationRestrictionContext *restrictionContext)
AllDistributedRelationsInRestrictionContextColocated(
RelationRestrictionContext *restrictionContext)
{
RelationRestriction *relationRestriction = NULL;
int initialColocationId = INVALID_COLOCATION_ID;
List *relationIdList = NIL;
/* check whether all relations exists in the main restriction list */
foreach_ptr(relationRestriction, restrictionContext->relationRestrictionList)
{
Oid relationId = relationRestriction->relationId;
relationIdList = lappend_oid(relationIdList, relationRestriction->relationId);
}
if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY))
return AllDistributedRelationsInListColocated(relationIdList);
}
/*
* AllDistributedRelationsInRTEListColocated determines whether all of the
* distributed relations in the given RangeTableEntry list are co-located.
*/
bool
AllDistributedRelationsInRTEListColocated(List *rangeTableEntryList)
{
RangeTblEntry *rangeTableEntry = NULL;
List *relationIdList = NIL;
foreach_ptr(rangeTableEntry, rangeTableEntryList)
{
relationIdList = lappend_oid(relationIdList, rangeTableEntry->relid);
}
return AllDistributedRelationsInListColocated(relationIdList);
}
/*
* AllDistributedRelationsInListColocated determines whether all of the
* distributed relations in the given list are co-located.
*/
bool
AllDistributedRelationsInListColocated(List *relationList)
{
int initialColocationId = INVALID_COLOCATION_ID;
Oid relationId = InvalidOid;
foreach_oid(relationId, relationList)
{
if (!IsCitusTable(relationId))
{
/* not interested in Postgres tables */
continue;
}
if (!IsCitusTableType(relationId, DISTRIBUTED_TABLE))
{
/* not interested in non-distributed tables */
continue;
}
@ -333,7 +333,7 @@ PruneShards(Oid relationId, Index rangeTableId, List *whereClauseList,
}
/* short circuit for non-distributed tables such as reference table */
if (IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKeyCacheEntry(cacheEntry))
{
prunedList = ShardArrayToList(cacheEntry->sortedShardIntervalArray,
cacheEntry->shardIntervalArrayLength);
@ -88,6 +88,8 @@ static const char *replicationSlotPrefix[] = {
* IMPORTANT: All the subscription names should start with "citus_". Otherwise
* our utility hook does not defend against non-superusers altering or dropping
* them, which is important for security purposes.
*
* We should also keep these in sync with IsCitusShardTransferBackend().
*/
static const char *subscriptionPrefix[] = {
[SHARD_MOVE] = "citus_shard_move_subscription_",
@ -1338,7 +1340,9 @@ CreatePublications(MultiConnection *connection,
worker->groupId,
CLEANUP_ALWAYS);
ExecuteCriticalRemoteCommand(connection, DISABLE_DDL_PROPAGATION);
ExecuteCriticalRemoteCommand(connection, createPublicationCommand->data);
ExecuteCriticalRemoteCommand(connection, ENABLE_DDL_PROPAGATION);
pfree(createPublicationCommand->data);
pfree(createPublicationCommand);
}
@ -10,18 +10,27 @@
#include "postgres.h"
#include "distributed/shardinterval_utils.h"
#include "distributed/shardsplit_shared_memory.h"
#include "distributed/worker_shard_visibility.h"
#include "distributed/worker_protocol.h"
#include "distributed/listutils.h"
#include "distributed/metadata/distobject.h"
#include "replication/logical.h"
#include "utils/typcache.h"
#include "utils/lsyscache.h"
#include "catalog/pg_namespace.h"
extern void _PG_output_plugin_init(OutputPluginCallbacks *cb);
static LogicalDecodeChangeCB pgoutputChangeCB;
static LogicalDecodeChangeCB pgOutputPluginChangeCB;
#define InvalidRepOriginId 0
static HTAB *SourceToDestinationShardMap = NULL;
static bool replication_origin_filter_cb(LogicalDecodingContext *ctx, RepOriginId
origin_id);
/* Plugin callback */
static void split_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
static void shard_split_change_cb(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change);
/* Helper methods */
@ -47,7 +56,8 @@ void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
LogicalOutputPluginInit plugin_init =
(LogicalOutputPluginInit) (void *) load_external_function("pgoutput",
(LogicalOutputPluginInit) (void *)
load_external_function("pgoutput",
"_PG_output_plugin_init",
false, NULL);
@ -60,25 +70,56 @@ _PG_output_plugin_init(OutputPluginCallbacks *cb)
plugin_init(cb);
/* actual pgoutput callback will be called with the appropriate destination shard */
pgoutputChangeCB = cb->change_cb;
cb->change_cb = split_change_cb;
pgOutputPluginChangeCB = cb->change_cb;
cb->change_cb = shard_split_change_cb;
cb->filter_by_origin_cb = replication_origin_filter_cb;
}
/*
* split_change function emits the incoming tuple change
* replication_origin_filter_cb is a callback function that filters out publication of
* changes that originated from any node other than the current node. This is
* identified by the "origin_id" of the changes. The origin_id is set to
* a non-zero value in the origin node as part of WAL replication for internal
* operations like shard split/moves/create_distributed_table etc.
*/
static bool
replication_origin_filter_cb(LogicalDecodingContext *ctx, RepOriginId origin_id)
{
return (origin_id != InvalidRepOriginId);
}
/*
* shard_split_change_cb function emits the incoming tuple change
* to the appropriate destination shard.
*/
static void
split_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
shard_split_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
Relation relation, ReorderBufferChange *change)
{
/*
* If Citus has not been loaded yet, pass the changes
* through to the underlying decoder plugin.
*/
if (!CitusHasBeenLoaded())
{
pgOutputPluginChangeCB(ctx, txn, relation, change);
return;
}
/* check if the relation is publishable. */
if (!is_publishable_relation(relation))
{
return;
}
char *replicationSlotName = ctx->slot->data.name.data;
if (replicationSlotName == NULL)
{
elog(ERROR, "Replication slot name is NULL!");
return;
}
/*
* Initialize SourceToDestinationShardMap if not already initialized.
@ -198,7 +239,7 @@ split_change_cb(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
}
}
pgoutputChangeCB(ctx, txn, targetRelation, change);
pgOutputPluginChangeCB(ctx, txn, targetRelation, change);
RelationClose(targetRelation);
}
@ -74,6 +74,7 @@
#include "distributed/recursive_planning.h"
#include "distributed/reference_table_utils.h"
#include "distributed/relation_access_tracking.h"
#include "distributed/replication_origin_session_utils.h"
#include "distributed/run_from_same_connection.h"
#include "distributed/shard_cleaner.h"
#include "distributed/shard_transfer.h"
@ -135,6 +136,8 @@ ReadColumnarOptions_type extern_ReadColumnarOptions = NULL;
CppConcat(extern_, funcname) = \
(typename) (void *) lookup_external_function(handle, # funcname)
#define CDC_DECODER_DYNAMIC_LIB_PATH "$libdir/citus_decoders:$libdir"
DEFINE_COLUMNAR_PASSTHROUGH_FUNC(columnar_handler)
DEFINE_COLUMNAR_PASSTHROUGH_FUNC(alter_columnar_table_set)
DEFINE_COLUMNAR_PASSTHROUGH_FUNC(alter_columnar_table_reset)
@ -206,7 +209,7 @@ static bool StatisticsCollectionGucCheckHook(bool *newval, void **extra, GucSour
source);
static void CitusAuthHook(Port *port, int status);
static bool IsSuperuser(char *userName);
static void AdjustDynamicLibraryPathForCdcDecoders(void);
static ClientAuthentication_hook_type original_client_auth_hook = NULL;
@ -359,6 +362,11 @@ static const struct config_enum_entry cpu_priority_options[] = {
{ NULL, 0, false}
};
static const struct config_enum_entry metadata_sync_mode_options[] = {
{ "transactional", METADATA_SYNC_TRANSACTIONAL, false },
{ "nontransactional", METADATA_SYNC_NON_TRANSACTIONAL, false },
{ NULL, 0, false }
};
/* *INDENT-ON* */
@ -469,6 +477,17 @@ _PG_init(void)
InitializeLocallyReservedSharedConnections();
InitializeClusterClockMem();
/*
* Adjust the dynamic library path to prepend citus_decoders to the dynamic
* library path. This is needed to make sure that the citus decoders are
* loaded before the default decoders for CDC.
*/
if (EnableChangeDataCapture)
{
AdjustDynamicLibraryPathForCdcDecoders();
}
/* initialize shard split shared memory handle management */
InitializeShardSplitSMHandleManagement();
@ -536,6 +555,22 @@ _PG_init(void)
}
/*
* AdjustDynamicLibraryPathForCdcDecoders prepends the $libdir/citus_decoders
* to the dynamic library path. This is needed to make sure that the citus
* decoders are loaded before the default decoders for CDC.
*/
static void
AdjustDynamicLibraryPathForCdcDecoders(void)
{
if (strcmp(Dynamic_library_path, "$libdir") == 0)
{
SetConfigOption("dynamic_library_path", CDC_DECODER_DYNAMIC_LIB_PATH,
PGC_POSTMASTER, PGC_S_OVERRIDE);
}
}
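As a quick sanity check, the effect is observable from any session. A minimal sketch, assuming the node started with citus.enable_change_data_capture enabled and dynamic_library_path at its default of $libdir:

SHOW dynamic_library_path;
-- expected: $libdir/citus_decoders:$libdir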
#if PG_VERSION_NUM >= PG_VERSION_15
/*
@ -1132,6 +1167,16 @@ RegisterCitusConfigVariables(void)
GUC_STANDARD,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citus.enable_change_data_capture",
gettext_noop("Enables using replication origin tracking for change data capture"),
NULL,
&EnableChangeDataCapture,
false,
PGC_USERSET,
GUC_STANDARD,
NULL, NULL, NULL);
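A minimal usage sketch for the new GUC (session-level here; it is PGC_USERSET, so it can also be set in postgresql.conf):

SET citus.enable_change_data_capture TO on;
SHOW citus.enable_change_data_capture;
-- on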
DefineCustomBoolVariable(
"citus.enable_cluster_clock",
gettext_noop("When users explicitly call UDF citus_get_transaction_clock() "
@ -1268,6 +1313,26 @@ RegisterCitusConfigVariables(void)
GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citus.enable_non_colocated_router_query_pushdown",
gettext_noop("Enables router planner for the queries that reference "
"non-colocated distributed tables."),
gettext_noop("Normally, router planner planner is only enabled for "
"the queries that reference colocated distributed tables "
"because it is not guaranteed to have the target shards "
"always on the same node, e.g., after rebalancing the "
"shards. For this reason, while enabling this flag allows "
"some degree of optimization for the queries that reference "
"non-colocated distributed tables, it is not guaranteed "
"that the same query will work after rebalancing the shards "
"or altering the shard count of one of those distributed "
"tables."),
&EnableNonColocatedRouterQueryPushdown,
true,
PGC_USERSET,
GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
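A sketch of how this flag interacts with the router-planner check added earlier in this diff; the table names are hypothetical:

-- t1 and t2 are distributed tables that are NOT co-located (hypothetical)
SET citus.enable_non_colocated_router_query_pushdown TO off;
SELECT * FROM t1 JOIN t2 USING (key) WHERE key = 1;
-- the router planner now defers with "router planner does not support queries
-- that reference non-colocated distributed tables" and Citus falls back to
-- its other planners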
DefineCustomBoolVariable(
"citus.enable_repartition_joins",
gettext_noop("Allows Citus to repartition data between nodes."),
@ -1849,6 +1914,21 @@ RegisterCitusConfigVariables(void)
GUC_UNIT_MS | GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
DefineCustomEnumVariable(
"citus.metadata_sync_mode",
gettext_noop("Sets transaction mode for metadata syncs."),
gettext_noop("metadata sync can be run inside a single coordinated "
"transaction or with multiple small transactions in "
"idempotent way. By default we sync metadata in single "
"coordinated transaction. When we hit memory problems "
"at workers, we have alternative nontransactional mode "
"where we send each command with separate transaction."),
&MetadataSyncTransMode,
METADATA_SYNC_TRANSACTIONAL, metadata_sync_mode_options,
PGC_SUSET,
GUC_SUPERUSER_ONLY | GUC_NO_SHOW_ALL,
NULL, NULL, NULL);
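A minimal sketch of switching modes (superuser only, per GUC_SUPERUSER_ONLY above):

SET citus.metadata_sync_mode TO 'nontransactional';
-- and back to the default
SET citus.metadata_sync_mode TO 'transactional';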
DefineCustomIntVariable(
"citus.metadata_sync_retry_interval",
gettext_noop("Sets the interval to retry failed metadata syncs."),
@ -2406,7 +2486,6 @@ RegisterCitusConfigVariables(void)
GUC_STANDARD,
NULL, NULL, NULL);
/* warn about config items in the citus namespace that are not registered above */
EmitWarningsOnPlaceholders("citus");
@ -1,4 +1,12 @@
-- citus--11.2-1--11.3-1
#include "udfs/repl_origin_helper/11.3-1.sql"
#include "udfs/worker_adjust_identity_column_seq_ranges/11.3-1.sql"
ALTER TABLE pg_catalog.pg_dist_authinfo REPLICA IDENTITY USING INDEX pg_dist_authinfo_identification_index;
ALTER TABLE pg_catalog.pg_dist_partition REPLICA IDENTITY USING INDEX pg_dist_partition_logical_relid_index;
ALTER TABLE pg_catalog.pg_dist_placement REPLICA IDENTITY USING INDEX pg_dist_placement_placementid_index;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy REPLICA IDENTITY USING INDEX pg_dist_rebalance_strategy_name_key;
ALTER TABLE pg_catalog.pg_dist_shard REPLICA IDENTITY USING INDEX pg_dist_shard_shardid_index;
ALTER TABLE pg_catalog.pg_dist_transaction REPLICA IDENTITY USING INDEX pg_dist_transaction_unique_constraint;
-- bump version to 11.3-1
#include "udfs/worker_drop_all_shell_tables/11.3-1.sql"
#include "udfs/citus_internal_mark_node_not_synced/11.3-1.sql"
@ -1,2 +1,22 @@
-- citus--11.3-1--11.2-1
-- this is an empty downgrade path since citus--11.2-1--11.3-1.sql is empty for now
DROP FUNCTION pg_catalog.citus_internal_start_replication_origin_tracking();
DROP FUNCTION pg_catalog.citus_internal_stop_replication_origin_tracking();
DROP FUNCTION pg_catalog.citus_internal_is_replication_origin_tracking_active();
DROP FUNCTION IF EXISTS pg_catalog.worker_adjust_identity_column_seq_ranges(regclass);
ALTER TABLE pg_catalog.pg_dist_authinfo REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_partition REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_placement REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_shard REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_transaction REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_authinfo REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_partition REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_placement REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_rebalance_strategy REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_shard REPLICA IDENTITY NOTHING;
ALTER TABLE pg_catalog.pg_dist_transaction REPLICA IDENTITY NOTHING;
DROP PROCEDURE pg_catalog.worker_drop_all_shell_tables(bool);
DROP FUNCTION pg_catalog.citus_internal_mark_node_not_synced(int, int);
@ -0,0 +1,6 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_internal_mark_node_not_synced(parent_pid int, nodeid int)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_internal_mark_node_not_synced$$;
COMMENT ON FUNCTION citus_internal_mark_node_not_synced(int, int)
IS 'marks given node not synced by unsetting metadatasynced column at the start of the nontransactional sync.';
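For illustration only, since the function is internal to the nontransactional sync path; the node id below is hypothetical:

SELECT pg_catalog.citus_internal_mark_node_not_synced(pg_backend_pid(), 2);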
@ -0,0 +1,6 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_internal_mark_node_not_synced(parent_pid int, nodeid int)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_internal_mark_node_not_synced$$;
COMMENT ON FUNCTION citus_internal_mark_node_not_synced(int, int)
IS 'marks given node not synced by unsetting metadatasynced column at the start of the nontransactional sync.';

CREATE OR REPLACE FUNCTION pg_catalog.citus_internal_start_replication_origin_tracking()
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_internal_start_replication_origin_tracking$$;
COMMENT ON FUNCTION pg_catalog.citus_internal_start_replication_origin_tracking()
IS 'To start replication origin tracking for skipping publishing of duplicated events during internal data movements for CDC';
CREATE OR REPLACE FUNCTION pg_catalog.citus_internal_stop_replication_origin_tracking()
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_internal_stop_replication_origin_tracking$$;
COMMENT ON FUNCTION pg_catalog.citus_internal_stop_replication_origin_tracking()
IS 'To stop replication origin tracking for skipping publishing of duplicated events during internal data movements for CDC';
CREATE OR REPLACE FUNCTION pg_catalog.citus_internal_is_replication_origin_tracking_active()
RETURNS boolean
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_internal_is_replication_origin_tracking_active$$;
COMMENT ON FUNCTION pg_catalog.citus_internal_is_replication_origin_tracking_active()
IS 'To check if replication origin tracking is active for skipping publishing of duplicated events during internal data movements for CDC';
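A sketch of the intended call sequence around an internal data movement; note the start call is a no-op unless citus.enable_change_data_capture is enabled:

SELECT pg_catalog.citus_internal_start_replication_origin_tracking();
SELECT pg_catalog.citus_internal_is_replication_origin_tracking_active(); -- t
-- ... run the internal data copy whose changes must not reach CDC clients ...
SELECT pg_catalog.citus_internal_stop_replication_origin_tracking();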
@ -0,0 +1,20 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_internal_start_replication_origin_tracking()
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_internal_start_replication_origin_tracking$$;
COMMENT ON FUNCTION pg_catalog.citus_internal_start_replication_origin_tracking()
IS 'To start replication origin tracking for skipping publishing of duplicated events during internal data movements for CDC';
CREATE OR REPLACE FUNCTION pg_catalog.citus_internal_stop_replication_origin_tracking()
RETURNS void
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_internal_stop_replication_origin_tracking$$;
COMMENT ON FUNCTION pg_catalog.citus_internal_stop_replication_origin_tracking()
IS 'To stop replication origin tracking for skipping publishing of duplicated events during internal data movements for CDC';
CREATE OR REPLACE FUNCTION pg_catalog.citus_internal_is_replication_origin_tracking_active()
RETURNS boolean
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_internal_is_replication_origin_tracking_active$$;
COMMENT ON FUNCTION pg_catalog.citus_internal_is_replication_origin_tracking_active()
IS 'To check if replication origin tracking is active for skipping publishing of duplicated events during internal data movements for CDC';
@ -0,0 +1,7 @@
CREATE OR REPLACE FUNCTION pg_catalog.worker_adjust_identity_column_seq_ranges(regclass)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_adjust_identity_column_seq_ranges$$;
COMMENT ON FUNCTION pg_catalog.worker_adjust_identity_column_seq_ranges(regclass)
IS 'modify identity column seq ranges to produce globally unique values';
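A minimal sketch of invoking it on a worker; the shard-local table name is hypothetical:

SELECT pg_catalog.worker_adjust_identity_column_seq_ranges('my_table_102008'::regclass);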
@ -0,0 +1,7 @@
CREATE OR REPLACE FUNCTION pg_catalog.worker_adjust_identity_column_seq_ranges(regclass)
RETURNS VOID
LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$worker_adjust_identity_column_seq_ranges$$;
COMMENT ON FUNCTION pg_catalog.worker_adjust_identity_column_seq_ranges(regclass)
IS 'modify identity column seq ranges to produce globally unique values';
@ -0,0 +1,23 @@
-- During metadata sync, when we send many DDLs over a single transaction, the worker node can error
-- due to reaching the max allocation block size for invalidation messages. To work around the problem,
-- we added a nontransactional metadata sync mode that uses many transactions while dropping shell tables
-- via https://github.com/citusdata/citus/pull/6728.
CREATE OR REPLACE PROCEDURE pg_catalog.worker_drop_all_shell_tables(singleTransaction bool DEFAULT true)
LANGUAGE plpgsql
AS $$
DECLARE
table_name text;
BEGIN
-- drop shell tables within single or multiple transactions according to the flag singleTransaction
FOR table_name IN SELECT logicalrelid::regclass::text FROM pg_dist_partition
LOOP
PERFORM pg_catalog.worker_drop_shell_table(table_name);
IF not singleTransaction THEN
COMMIT;
END IF;
END LOOP;
END;
$$;
COMMENT ON PROCEDURE worker_drop_all_shell_tables(singleTransaction bool)
IS 'drop all distributed tables only without the metadata within single transaction or '
'multiple transactions specified by the flag singleTransaction';
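A minimal usage sketch; passing false commits after each shell table, which is the behavior the nontransactional sync mode relies on:

CALL pg_catalog.worker_drop_all_shell_tables(false);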
@ -0,0 +1,23 @@
-- During metadata sync, when we send many DDLs over a single transaction, the worker node can error
-- due to reaching the max allocation block size for invalidation messages. To work around the problem,
-- we added a nontransactional metadata sync mode that uses many transactions while dropping shell tables
-- via https://github.com/citusdata/citus/pull/6728.
CREATE OR REPLACE PROCEDURE pg_catalog.worker_drop_all_shell_tables(singleTransaction bool DEFAULT true)
LANGUAGE plpgsql
AS $$
DECLARE
table_name text;
BEGIN
-- drop shell tables within single or multiple transactions according to the flag singleTransaction
FOR table_name IN SELECT logicalrelid::regclass::text FROM pg_dist_partition
LOOP
PERFORM pg_catalog.worker_drop_shell_table(table_name);
IF not singleTransaction THEN
COMMIT;
END IF;
END LOOP;
END;
$$;
COMMENT ON PROCEDURE worker_drop_all_shell_tables(singleTransaction bool)
IS 'drop all distributed tables only without the metadata within single transaction or '
'multiple transactions specified by the flag singleTransaction';
@ -49,26 +49,23 @@ activate_node_snapshot(PG_FUNCTION_ARGS)
*/
WorkerNode *dummyWorkerNode = GetFirstPrimaryWorkerNode();
List *updateLocalGroupCommand =
list_make1(LocalGroupIdUpdateCommand(dummyWorkerNode->groupId));
List *syncDistObjCommands = SyncDistributedObjectsCommandList(dummyWorkerNode);
List *dropSnapshotCommands = NodeMetadataDropCommands();
List *createSnapshotCommands = NodeMetadataCreateCommands();
List *pgDistTableMetadataSyncCommands = PgDistTableMetadataSyncCommandList();
/*
* Create MetadataSyncContext which is used throughout nodes' activation.
* As we set collectCommands to true, it would not create connections to workers.
* Instead it would collect and return sync commands to be sent to workers.
*/
bool collectCommands = true;
bool nodesAddedInSameTransaction = false;
MetadataSyncContext *context = CreateMetadataSyncContext(list_make1(dummyWorkerNode),
collectCommands,
nodesAddedInSameTransaction);
List *activateNodeCommandList = NIL;
ActivateNodeList(context);
List *activateNodeCommandList = context->collectedCommands;
int activateNodeCommandIndex = 0;
Oid ddlCommandTypeId = TEXTOID;
activateNodeCommandList = list_concat(activateNodeCommandList,
updateLocalGroupCommand);
activateNodeCommandList = list_concat(activateNodeCommandList, syncDistObjCommands);
activateNodeCommandList = list_concat(activateNodeCommandList, dropSnapshotCommands);
activateNodeCommandList = list_concat(activateNodeCommandList,
createSnapshotCommands);
activateNodeCommandList = list_concat(activateNodeCommandList,
pgDistTableMetadataSyncCommands);
int activateNodeCommandCount = list_length(activateNodeCommandList);
Datum *activateNodeCommandDatumArray = palloc0(activateNodeCommandCount *
sizeof(Datum));
@ -147,6 +147,26 @@ shard_placement_rebalance_array(PG_FUNCTION_ARGS)
shardPlacementList = SortList(shardPlacementList, CompareShardPlacements);
shardPlacementListList = lappend(shardPlacementListList, shardPlacementList);
List *unbalancedShards = NIL;
ListCell *shardPlacementListCell = NULL;
foreach(shardPlacementListCell, shardPlacementListList)
{
List *placementList = (List *) lfirst(shardPlacementListCell);
if (list_length(placementList) < list_length(workerNodeList))
{
unbalancedShards = list_concat(unbalancedShards,
placementList);
shardPlacementListList = foreach_delete_current(shardPlacementListList,
shardPlacementListCell);
}
}
if (list_length(unbalancedShards) > 0)
{
shardPlacementListList = lappend(shardPlacementListList, unbalancedShards);
}
rebalancePlanFunctions.context = &context;
/* sort the lists to make the function more deterministic */
@ -1270,23 +1270,6 @@ MyBackendGotCancelledDueToDeadlock(bool clearState)
}
/*
* MyBackendIsInDisributedTransaction returns true if MyBackendData
* is in a distributed transaction.
*/
bool
MyBackendIsInDisributedTransaction(void)
{
/* backend might not have used citus yet and thus not initialized backend data */
if (!MyBackendData)
{
return false;
}
return IsInDistributedTransaction(MyBackendData);
}
/*
* ActiveDistributedTransactionNumbers returns a list of pointers to
* transaction numbers of distributed transactions that are in progress
@ -1452,6 +1435,21 @@ IsExternalClientBackend(void)
}
/*
* IsCitusShardTransferBackend returns true if we are in a backend that citus
* initiated to transfer a shard.
*/
bool
IsCitusShardTransferBackend(void)
{
int prefixLength = strlen(CITUS_SHARD_TRANSFER_APPLICATION_NAME_PREFIX);
return strncmp(application_name,
CITUS_SHARD_TRANSFER_APPLICATION_NAME_PREFIX,
prefixLength) == 0;
}
/*
* DetermineCitusBackendType determines the type of backend based on the application_name.
*/
@ -195,7 +195,7 @@ RecordRelationAccessIfNonDistTable(Oid relationId, ShardPlacementAccessType acce
* recursively calling RecordRelationAccessBase(), so be careful about
* removing this check.
*/
if (!IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY))
if (IsCitusTable(relationId) && HasDistributionKey(relationId))
{
return;
}
@ -732,8 +732,8 @@ CheckConflictingRelationAccesses(Oid relationId, ShardPlacementAccessType access
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId);
if (!(IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY) &&
cacheEntry->referencingRelationsViaForeignKey != NIL))
if (HasDistributionKeyCacheEntry(cacheEntry) ||
cacheEntry->referencingRelationsViaForeignKey == NIL)
{
return;
}
@ -931,7 +931,7 @@ HoldsConflictingLockWithReferencedRelations(Oid relationId, ShardPlacementAccess
* We're only interested in foreign keys to reference tables and citus
* local tables.
*/
if (!IsCitusTableType(referencedRelation, CITUS_TABLE_WITH_NO_DIST_KEY))
if (IsCitusTable(referencedRelation) && HasDistributionKey(referencedRelation))
{
continue;
}
@ -993,7 +993,7 @@ HoldsConflictingLockWithReferencingRelations(Oid relationId, ShardPlacementAcces
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId);
bool holdsConflictingLocks = false;
Assert(IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY));
Assert(!HasDistributionKeyCacheEntry(cacheEntry));
Oid referencingRelation = InvalidOid;
foreach_oid(referencingRelation, cacheEntry->referencingRelationsViaForeignKey)
@ -34,6 +34,7 @@
#include "distributed/multi_logical_replication.h"
#include "distributed/multi_explain.h"
#include "distributed/repartition_join_execution.h"
#include "distributed/replication_origin_session_utils.h"
#include "distributed/transaction_management.h"
#include "distributed/placement_connection.h"
#include "distributed/relation_access_tracking.h"
@ -391,6 +392,9 @@ CoordinatedTransactionCallback(XactEvent event, void *arg)
ResetGlobalVariables();
ResetRelationAccessHash();
/* Reset any local replication origin session since transaction has been aborted.*/
ResetReplicationOriginLocalSession();
/* empty the CitusXactCallbackContext to ensure we're not leaking memory */
MemoryContextReset(CitusXactCallbackContext);
@ -715,6 +719,8 @@ CoordinatedSubTransactionCallback(SubXactEvent event, SubTransactionId subId,
SetCreateCitusTransactionLevel(0);
}
/* Reset any local replication origin session since subtransaction has been aborted.*/
ResetReplicationOriginLocalSession();
MemoryContextSwitchTo(previousContext);
break;
@ -374,6 +374,54 @@ SendCommandListToWorkerOutsideTransactionWithConnection(MultiConnection *workerC
}
/*
* SendCommandListToWorkerListWithBareConnections sends the command list
* over the specified bare connections. This function is mainly useful to
* avoid opening an closing connections excessively by allowing reusing
* connections to send multiple separate bare commands. The function
* raises an error if any of the queries fail.
*/
void
SendCommandListToWorkerListWithBareConnections(List *workerConnectionList,
List *commandList)
{
Assert(!InCoordinatedTransaction());
Assert(!GetCoordinatedTransactionShouldUse2PC());
if (list_length(commandList) == 0 || list_length(workerConnectionList) == 0)
{
/* nothing to do */
return;
}
/*
* In order to avoid round-trips per query in queryStringList,
* we join the string and send as a single command. Also,
* if there is only a single command, avoid additional call to
* StringJoin given that some strings can be quite large.
*/
char *stringToSend = (list_length(commandList) == 1) ?
linitial(commandList) : StringJoin(commandList, ';');
/* send commands in parallel */
MultiConnection *connection = NULL;
foreach_ptr(connection, workerConnectionList)
{
int querySent = SendRemoteCommand(connection, stringToSend);
if (querySent == 0)
{
ReportConnectionError(connection, ERROR);
}
}
bool failOnError = true;
foreach_ptr(connection, workerConnectionList)
{
ClearResults(connection, failOnError);
}
}
/*
* SendCommandListToWorkerInCoordinatedTransaction opens connection to the node
* with the given nodeName and nodePort. The commands are sent as part of the
@ -390,6 +438,8 @@ SendMetadataCommandListToWorkerListInCoordinatedTransaction(List *workerNodeList
return;
}
ErrorIfAnyMetadataNodeOutOfSync(workerNodeList);
UseCoordinatedTransaction();
List *connectionList = NIL;
@ -442,8 +442,7 @@ ShardsIntervalsEqual(ShardInterval *leftShardInterval, ShardInterval *rightShard
{
return HashPartitionedShardIntervalsEqual(leftShardInterval, rightShardInterval);
}
else if (IsCitusTableType(leftShardInterval->relationId,
CITUS_TABLE_WITH_NO_DIST_KEY))
else if (!HasDistributionKey(leftShardInterval->relationId))
{
/*
* Reference tables has only a single shard and all reference tables
@ -503,12 +503,11 @@ GetReferenceTableColocationId()
/*
* DeleteAllReplicatedTablePlacementsFromNodeGroup function iterates over
* list of reference and replicated hash distributed tables and deletes
* all placements from pg_dist_placement table for given group.
* GetAllReplicatedTableList returns all tables that have replicated placements.
* i.e. (all reference tables) + (distributed tables with more than 1 placements)
*/
void
DeleteAllReplicatedTablePlacementsFromNodeGroup(int32 groupId, bool localOnly)
List *
GetAllReplicatedTableList(void)
{
List *referenceTableList = CitusTableTypeIdList(REFERENCE_TABLE);
List *replicatedMetadataSyncedDistributedTableList =
@ -517,13 +516,25 @@ DeleteAllReplicatedTablePlacementsFromNodeGroup(int32 groupId, bool localOnly)
List *replicatedTableList =
list_concat(referenceTableList, replicatedMetadataSyncedDistributedTableList);
/* if there are no reference tables, we do not need to do anything */
if (list_length(replicatedTableList) == 0)
{
return;
return replicatedTableList;
}
StringInfo deletePlacementCommand = makeStringInfo();
/*
* ReplicatedPlacementsForNodeGroup filters all replicated placements for given
* node group id.
*/
List *
ReplicatedPlacementsForNodeGroup(int32 groupId)
{
List *replicatedTableList = GetAllReplicatedTableList();
if (list_length(replicatedTableList) == 0)
{
return NIL;
}
List *replicatedPlacementsForNodeGroup = NIL;
Oid replicatedTableId = InvalidOid;
foreach_oid(replicatedTableId, replicatedTableList)
{
@ -538,25 +549,104 @@ DeleteAllReplicatedTablePlacementsFromNodeGroup(int32 groupId, bool localOnly)
continue;
}
replicatedPlacementsForNodeGroup = list_concat(replicatedPlacementsForNodeGroup,
placements);
}
return replicatedPlacementsForNodeGroup;
}
/*
* DeleteShardPlacementCommand returns a command for deleting given placement from
* metadata.
*/
char *
DeleteShardPlacementCommand(uint64 placementId)
{
StringInfo deletePlacementCommand = makeStringInfo();
appendStringInfo(deletePlacementCommand,
"DELETE FROM pg_catalog.pg_dist_placement "
"WHERE placementid = " UINT64_FORMAT, placementId);
return deletePlacementCommand->data;
}
/*
* DeleteAllReplicatedTablePlacementsFromNodeGroup function iterates over
* list of reference and replicated hash distributed tables and deletes
* all placements from pg_dist_placement table for given group.
*/
void
DeleteAllReplicatedTablePlacementsFromNodeGroup(int32 groupId, bool localOnly)
{
List *replicatedPlacementListForGroup = ReplicatedPlacementsForNodeGroup(groupId);
/* if there are no replicated tables for the group, we do not need to do anything */
if (list_length(replicatedPlacementListForGroup) == 0)
{
return;
}
GroupShardPlacement *placement = NULL;
foreach_ptr(placement, placements)
foreach_ptr(placement, replicatedPlacementListForGroup)
{
LockShardDistributionMetadata(placement->shardId, ExclusiveLock);
if (!localOnly)
{
char *deletePlacementCommand =
DeleteShardPlacementCommand(placement->placementId);
SendCommandToWorkersWithMetadata(deletePlacementCommand);
}
DeleteShardPlacementRow(placement->placementId);
}
}
/*
* DeleteAllReplicatedTablePlacementsFromNodeGroupViaMetadataContext does the same as
* DeleteAllReplicatedTablePlacementsFromNodeGroup except it uses metadataSyncContext for
* connections.
*/
void
DeleteAllReplicatedTablePlacementsFromNodeGroupViaMetadataContext(
MetadataSyncContext *context, int32 groupId, bool localOnly)
{
List *replicatedPlacementListForGroup = ReplicatedPlacementsForNodeGroup(groupId);
/* if there are no replicated tables for the group, we do not need to do anything */
if (list_length(replicatedPlacementListForGroup) == 0)
{
return;
}
MemoryContext oldContext = MemoryContextSwitchTo(context->context);
GroupShardPlacement *placement = NULL;
foreach_ptr(placement, replicatedPlacementListForGroup)
{
LockShardDistributionMetadata(placement->shardId, ExclusiveLock);
if (!localOnly)
{
resetStringInfo(deletePlacementCommand);
appendStringInfo(deletePlacementCommand,
"DELETE FROM pg_catalog.pg_dist_placement "
"WHERE placementid = " UINT64_FORMAT,
placement->placementId);
char *deletePlacementCommand =
DeleteShardPlacementCommand(placement->placementId);
SendCommandToWorkersWithMetadata(deletePlacementCommand->data);
SendOrCollectCommandListToMetadataNodes(context,
list_make1(deletePlacementCommand));
}
/* do not execute local transaction if we collect commands */
if (!MetadataSyncCollectsCommands(context))
{
DeleteShardPlacementRow(placement->placementId);
}
ResetMetadataSyncMemoryContext(context);
}
MemoryContextSwitchTo(oldContext);
}
@ -0,0 +1,239 @@
/*-------------------------------------------------------------------------
*
* replication_origin_session_utils.c
* Functions for managing replication origin session.
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "distributed/replication_origin_session_utils.h"
#include "distributed/remote_commands.h"
#include "distributed/metadata_cache.h"
#include "utils/builtins.h"
#include "miscadmin.h"
static bool IsRemoteReplicationOriginSessionSetup(MultiConnection *connection);
static void SetupMemoryContextResetReplicationOriginHandler(void);
static void SetupReplicationOriginSessionHelper(bool isContexResetSetupNeeded);
static inline bool IsLocalReplicationOriginSessionActive(void);
PG_FUNCTION_INFO_V1(citus_internal_start_replication_origin_tracking);
PG_FUNCTION_INFO_V1(citus_internal_stop_replication_origin_tracking);
PG_FUNCTION_INFO_V1(citus_internal_is_replication_origin_tracking_active);
/*
* This variable is used to remember the replication origin id of the current session
* before resetting it to DoNotReplicateId in SetupReplicationOriginLocalSession.
*/
static RepOriginId OriginalOriginId = InvalidRepOriginId;
/*
* Setting that controls whether replication origin tracking is enabled
*/
bool EnableChangeDataCapture = false;
/* citus_internal_start_replication_origin_tracking starts a new replication origin session
* in the local node. This function is used to avoid publishing the WAL records to the
* replication slot by setting replication origin to DoNotReplicateId in WAL records.
* It remembers the previous replication origin for the current session which will be
* used to reset the replication origin to the previous value when the session ends.
*/
Datum
citus_internal_start_replication_origin_tracking(PG_FUNCTION_ARGS)
{
if (!EnableChangeDataCapture)
{
PG_RETURN_VOID();
}
SetupReplicationOriginSessionHelper(false);
PG_RETURN_VOID();
}
/* citus_internal_stop_replication_origin_tracking ends the current replication origin session
* in the local node. This function is used to reset the replication origin to the
* earlier value of replication origin.
*/
Datum
citus_internal_stop_replication_origin_tracking(PG_FUNCTION_ARGS)
{
ResetReplicationOriginLocalSession();
PG_RETURN_VOID();
}
/* citus_internal_is_replication_origin_tracking_active checks if the current replication origin
* session is active in the local node.
*/
Datum
citus_internal_is_replication_origin_tracking_active(PG_FUNCTION_ARGS)
{
bool result = IsLocalReplicationOriginSessionActive();
PG_RETURN_BOOL(result);
}
/* IsLocalReplicationOriginSessionActive checks if the current replication origin
* session is active in the local node.
*/
inline bool
IsLocalReplicationOriginSessionActive(void)
{
return (replorigin_session_origin == DoNotReplicateId);
}
/*
* SetupMemoryContextResetReplicationOriginHandler registers a callback function
* that resets the replication origin session in case of any error for the current
* memory context.
*/
static void
SetupMemoryContextResetReplicationOriginHandler()
{
MemoryContextCallback *replicationOriginResetCallback = palloc0(
sizeof(MemoryContextCallback));
replicationOriginResetCallback->func =
ResetReplicationOriginLocalSessionCallbackHandler;
replicationOriginResetCallback->arg = NULL;
MemoryContextRegisterResetCallback(CurrentMemoryContext,
replicationOriginResetCallback);
}
/*
* SetupReplicationOriginSessionHelper sets up a new replication origin session in a
* local session. It takes an argument isContexResetSetupNeeded to decide whether
* to register a callback function that resets the replication origin session in case
* of any error for the current memory context.
*/
static void
SetupReplicationOriginSessionHelper(bool isContexResetSetupNeeded)
{
if (!EnableChangeDataCapture)
{
return;
}
OriginalOriginId = replorigin_session_origin;
replorigin_session_origin = DoNotReplicateId;
if (isContexResetSetupNeeded)
{
SetupMemoryContextResetReplicationOriginHandler();
}
}
/*
* SetupReplicationOriginLocalSession sets up a new replication origin session in a
* local session.
*/
void
SetupReplicationOriginLocalSession()
{
SetupReplicationOriginSessionHelper(true);
}
/*
* ResetReplicationOriginLocalSession resets the replication origin session in a
* local node.
*/
void
ResetReplicationOriginLocalSession(void)
{
if (replorigin_session_origin != DoNotReplicateId)
{
return;
}
replorigin_session_origin = OriginalOriginId;
}
/*
* ResetReplicationOriginLocalSessionCallbackHandler is a callback function that
* resets the replication origin session in a local node. This is used to register
* with MemoryContextRegisterResetCallback to reset the replication origin session
* in case of any error for the given memory context.
*/
void
ResetReplicationOriginLocalSessionCallbackHandler(void *arg)
{
ResetReplicationOriginLocalSession();
}
/*
* SetupReplicationOriginRemoteSession sets up a new replication origin session in a
* remote session. The identifier is used to create a unique replication origin name
* for the session in the remote node.
*/
void
SetupReplicationOriginRemoteSession(MultiConnection *connection)
{
if (!EnableChangeDataCapture)
{
return;
}
if (connection != NULL && !IsRemoteReplicationOriginSessionSetup(connection))
{
StringInfo replicationOriginSessionSetupQuery = makeStringInfo();
appendStringInfo(replicationOriginSessionSetupQuery,
"select pg_catalog.citus_internal_start_replication_origin_tracking();");
ExecuteCriticalRemoteCommand(connection,
replicationOriginSessionSetupQuery->data);
connection->isReplicationOriginSessionSetup = true;
}
}
/*
* ResetReplicationOriginRemoteSession resets the replication origin session in a
* remote node.
*/
void
ResetReplicationOriginRemoteSession(MultiConnection *connection)
{
if (connection != NULL && connection->isReplicationOriginSessionSetup)
{
StringInfo replicationOriginSessionResetQuery = makeStringInfo();
appendStringInfo(replicationOriginSessionResetQuery,
"select pg_catalog.citus_internal_stop_replication_origin_tracking();");
ExecuteCriticalRemoteCommand(connection,
replicationOriginSessionResetQuery->data);
connection->isReplicationOriginSessionSetup = false;
}
}
/*
* IsRemoteReplicationOriginSessionSetup checks if the replication origin is setup
* already in the remote session by calling the UDF
* citus_internal_is_replication_origin_tracking_active(). This is also remembered
* in the connection object to avoid calling the UDF again next time.
*/
static bool
IsRemoteReplicationOriginSessionSetup(MultiConnection *connection)
{
if (connection->isReplicationOriginSessionSetup)
{
return true;
}
StringInfo isReplicationOriginSessionSetupQuery = makeStringInfo();
appendStringInfo(isReplicationOriginSessionSetupQuery,
"SELECT pg_catalog.citus_internal_is_replication_origin_tracking_active()");
bool result =
ExecuteRemoteCommandAndCheckResult(connection,
isReplicationOriginSessionSetupQuery->data,
"t");
connection->isReplicationOriginSessionSetup = result;
return result;
}
@ -503,45 +503,6 @@ SetLocktagForShardDistributionMetadata(int64 shardId, LOCKTAG *tag)
}
/*
* LockPlacementCleanup takes an exclusive lock to ensure that only one process
* can cleanup placements at the same time.
*/
void
LockPlacementCleanup(void)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = false;
/* Moves acquire lock with a constant operation id CITUS_SHARD_MOVE.
* This will change as we add support for parallel moves.
*/
SET_LOCKTAG_CITUS_OPERATION(tag, CITUS_SHARD_MOVE);
(void) LockAcquire(&tag, ExclusiveLock, sessionLock, dontWait);
}
/*
* TryLockPlacementCleanup takes an exclusive lock to ensure that only one
* process can cleanup placements at the same time.
*/
bool
TryLockPlacementCleanup(void)
{
LOCKTAG tag;
const bool sessionLock = false;
const bool dontWait = true;
/* Moves acquire lock with a constant operation id CITUS_SHARD_MOVE.
* This will change as we add support for parallel moves.
*/
SET_LOCKTAG_CITUS_OPERATION(tag, CITUS_SHARD_MOVE);
bool lockAcquired = LockAcquire(&tag, ExclusiveLock, sessionLock, dontWait);
return lockAcquired;
}
/*
* LockReferencedReferenceShardDistributionMetadata acquires shard distribution
* metadata locks with the given lock mode on the reference tables which has a
@ -223,8 +223,7 @@ ShardIndex(ShardInterval *shardInterval)
* currently it is not required.
*/
if (!IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED) &&
!IsCitusTableTypeCacheEntry(
cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
HasDistributionKeyCacheEntry(cacheEntry))
{
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("finding index of a given shard is only supported for "
@ -233,7 +232,7 @@ ShardIndex(ShardInterval *shardInterval)
}
/* short-circuit for reference tables */
if (IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
if (!HasDistributionKeyCacheEntry(cacheEntry))
{
/*
* Reference tables and citus local tables have only a single shard,
@ -333,7 +332,7 @@ FindShardIntervalIndex(Datum searchedValue, CitusTableCacheEntry *cacheEntry)
shardIndex = CalculateUniformHashRangeIndex(hashedValue, shardCount);
}
}
else if (IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY))
else if (!HasDistributionKeyCacheEntry(cacheEntry))
{
/* non-distributed tables have a single shard, all values mapped to that shard */
Assert(shardCount == 1);

#include "distributed/worker_create_or_replace.h"
#include "distributed/worker_protocol.h"
/*
* OnCollisionAction describes what to do when the created object
* and existing object do not match.
*/
typedef enum OnCollisionAction
{
ON_COLLISION_RENAME,
ON_COLLISION_DROP
} OnCollisionAction;
static List * CreateStmtListByObjectAddress(const ObjectAddress *address);
static bool CompareStringList(List *list1, List *list2);
static OnCollisionAction GetOnCollisionAction(const ObjectAddress *address);
PG_FUNCTION_INFO_V1(worker_create_or_replace_object);
PG_FUNCTION_INFO_V1(worker_create_or_replace_object_array);
@ -192,7 +206,8 @@ WorkerCreateOrReplaceObject(List *sqlStatements)
/*
* Object with name from statement is already found locally, check if states are
* identical. If objects differ we will rename the old object (non- destructively)
* as to make room to create the new object according to the spec sent.
* or drop it (if safe) as to make room to create the new object according to the
* spec sent.
*/
/*
@ -213,11 +228,22 @@ WorkerCreateOrReplaceObject(List *sqlStatements)
return false;
}
char *newName = GenerateBackupNameForCollision(address);
Node *utilityStmt = NULL;
RenameStmt *renameStmt = CreateRenameStatement(address, newName);
const char *sqlRenameStmt = DeparseTreeNode((Node *) renameStmt);
ProcessUtilityParseTree((Node *) renameStmt, sqlRenameStmt,
if (GetOnCollisionAction(address) == ON_COLLISION_DROP)
{
/* drop the existing object */
utilityStmt = (Node *) CreateDropStmt(address);
}
else
{
/* rename the existing object */
char *newName = GenerateBackupNameForCollision(address);
utilityStmt = (Node *) CreateRenameStatement(address, newName);
}
const char *commandString = DeparseTreeNode(utilityStmt);
ProcessUtilityParseTree(utilityStmt, commandString,
PROCESS_UTILITY_QUERY,
NULL, None_Receiver, NULL);
}
@ -286,6 +312,11 @@ CreateStmtListByObjectAddress(const ObjectAddress *address)
return list_make1(GetFunctionDDLCommand(address->objectId, false));
}
case OCLASS_PUBLICATION:
{
return list_make1(CreatePublicationDDLCommand(address->objectId));
}
case OCLASS_TSCONFIG:
{
List *stmts = GetCreateTextSearchConfigStatements(address);
@ -312,6 +343,37 @@ CreateStmtListByObjectAddress(const ObjectAddress *address)
}
/*
* GetOnCollisionAction decides what to do when the object already exists.
*/
static OnCollisionAction
GetOnCollisionAction(const ObjectAddress *address)
{
switch (getObjectClass(address))
{
case OCLASS_PUBLICATION:
{
/*
* We prefer to drop publications because they can be
* harmful (cause update/delete failures) and are relatively
* safe to drop.
*/
return ON_COLLISION_DROP;
}
case OCLASS_COLLATION:
case OCLASS_PROC:
case OCLASS_TSCONFIG:
case OCLASS_TSDICT:
case OCLASS_TYPE:
default:
{
return ON_COLLISION_RENAME;
}
}
}
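For context, a sketch of the path that reaches ON_COLLISION_DROP; the publication DDL is hypothetical:

-- if citus_pub already exists on the worker with a different definition,
-- the existing publication is dropped before the new one is created
SELECT worker_create_or_replace_object('CREATE PUBLICATION citus_pub FOR ALL TABLES;');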
/*
* GenerateBackupNameForCollision calculates a backup name for a given object by its
* address. This name should be used when renaming an existing object before creating the
@ -362,6 +424,64 @@ GenerateBackupNameForCollision(const ObjectAddress *address)
}
/*
* CreateDropPublicationStmt creates a DROP PUBLICATION statement for the
* publication at the given address.
*/
static DropStmt *
CreateDropPublicationStmt(const ObjectAddress *address)
{
Assert(address->classId == PublicationRelationId);
DropStmt *dropStmt = makeNode(DropStmt);
dropStmt->removeType = OBJECT_PUBLICATION;
dropStmt->behavior = DROP_RESTRICT;
HeapTuple publicationTuple =
SearchSysCache1(PUBLICATIONOID, ObjectIdGetDatum(address->objectId));
if (!HeapTupleIsValid(publicationTuple))
{
ereport(ERROR, (errmsg("cannot find publication with oid: %d",
address->objectId)));
}
Form_pg_publication publicationForm =
(Form_pg_publication) GETSTRUCT(publicationTuple);
char *publicationName = NameStr(publicationForm->pubname);
dropStmt->objects = list_make1(makeString(publicationName));
ReleaseSysCache(publicationTuple);
return dropStmt;
}
/*
* CreateDropStmt returns a DROP statement for the given object.
*/
DropStmt *
CreateDropStmt(const ObjectAddress *address)
{
switch (getObjectClass(address))
{
case OCLASS_PUBLICATION:
{
return CreateDropPublicationStmt(address);
}
default:
{
break;
}
}
ereport(ERROR, (errmsg("unsupported object to construct a drop statement"),
errdetail("unable to generate a parsetree for the drop")));
}
/*
* CreateRenameTypeStmt creates a rename statement for a type based on its ObjectAddress.
* The rename statement will rename the existing object on its address to the value
@ -70,6 +70,7 @@ static void AlterSequenceMinMax(Oid sequenceId, char *schemaName, char *sequence
PG_FUNCTION_INFO_V1(worker_apply_shard_ddl_command);
PG_FUNCTION_INFO_V1(worker_apply_inter_shard_ddl_command);
PG_FUNCTION_INFO_V1(worker_apply_sequence_command);
PG_FUNCTION_INFO_V1(worker_adjust_identity_column_seq_ranges);
PG_FUNCTION_INFO_V1(worker_append_table_to_shard);
PG_FUNCTION_INFO_V1(worker_nextval);
@ -133,6 +134,60 @@ worker_apply_inter_shard_ddl_command(PG_FUNCTION_ARGS)
}
/*
* worker_adjust_identity_column_seq_ranges takes a table oid, runs an ALTER SEQUENCE statement
* for each identity column to adjust the minvalue and maxvalue of the sequence owned by
* identity column such that the sequence creates globally unique values.
* We use table oid instead of sequence name to avoid any potential conflicts
* between sequences of different tables. This way, we can safely iterate through
* identity columns on a specific table without any issues. While this may
* introduce a small amount of business logic to workers, it's a much safer
* approach overall.
*/
Datum
worker_adjust_identity_column_seq_ranges(PG_FUNCTION_ARGS)
{
CheckCitusVersion(ERROR);
Oid tableRelationId = PG_GETARG_OID(0);
EnsureTableOwner(tableRelationId);
Relation tableRelation = relation_open(tableRelationId, AccessShareLock);
TupleDesc tableTupleDesc = RelationGetDescr(tableRelation);
bool missingSequenceOk = false;
for (int attributeIndex = 0; attributeIndex < tableTupleDesc->natts;
attributeIndex++)
{
Form_pg_attribute attributeForm = TupleDescAttr(tableTupleDesc,
attributeIndex);
/* skip dropped columns */
if (attributeForm->attisdropped)
{
continue;
}
if (attributeForm->attidentity)
{
Oid sequenceOid = getIdentitySequence(tableRelationId,
attributeForm->attnum,
missingSequenceOk);
Oid sequenceSchemaOid = get_rel_namespace(sequenceOid);
char *sequenceSchemaName = get_namespace_name(sequenceSchemaOid);
char *sequenceName = get_rel_name(sequenceOid);
Oid sequenceTypeId = pg_get_sequencedef(sequenceOid)->seqtypid;
AlterSequenceMinMax(sequenceOid, sequenceSchemaName, sequenceName,
sequenceTypeId);
}
}
relation_close(tableRelation, NoLock);
PG_RETURN_VOID();
}
/*
* worker_apply_sequence_command takes a CREATE SEQUENCE command string, runs the
* CREATE SEQUENCE command then creates and runs an ALTER SEQUENCE statement
@ -351,18 +351,17 @@ ShouldHideShardsInternal(void)
return false;
}
}
else if (MyBackendType != B_BACKEND)
else if (MyBackendType != B_BACKEND && MyBackendType != B_WAL_SENDER)
{
/*
* We are aiming only to hide shards from client
* backends or certain background workers (see above),
* not backends like walsender or checkpointer.
*/
return false;
}
if (IsCitusInternalBackend() || IsRebalancerInternalBackend() ||
IsCitusRunCommandBackend())
IsCitusRunCommandBackend() || IsCitusShardTransferBackend())
{
/* we never hide shards from Citus */
return false;