From d59c93bc504ad32621d66583de6b65f936b0ed13 Mon Sep 17 00:00:00 2001 From: sminux Date: Tue, 5 Mar 2024 10:49:35 +0300 Subject: [PATCH 1/5] fix bad copy-paste rightComparisonLimit (#7547) DESCRIPTION: change for #7543 --- src/backend/columnar/columnar_tableam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/columnar/columnar_tableam.c b/src/backend/columnar/columnar_tableam.c index 40486d08f..ca3a5f4c4 100644 --- a/src/backend/columnar/columnar_tableam.c +++ b/src/backend/columnar/columnar_tableam.c @@ -2946,7 +2946,7 @@ MajorVersionsCompatibleColumnar(char *leftVersion, char *rightVersion) } else { - rightComparisionLimit = strlen(leftVersion); + rightComparisionLimit = strlen(rightVersion); } /* we can error out early if hypens are not in the same position */ From edcdbe67b11c6ec5023dd18e537f3b876f51187d Mon Sep 17 00:00:00 2001 From: eaydingol <60466783+eaydingol@users.noreply.github.com> Date: Wed, 6 Mar 2024 14:46:49 +0300 Subject: [PATCH 2/5] Fix: store the previous shard cost for order verification (#7550) Store the previous shard cost so that the invariant checking performs as expected. --- src/backend/distributed/operations/shard_rebalancer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/distributed/operations/shard_rebalancer.c b/src/backend/distributed/operations/shard_rebalancer.c index d1868d3c4..03dc4c1b8 100644 --- a/src/backend/distributed/operations/shard_rebalancer.c +++ b/src/backend/distributed/operations/shard_rebalancer.c @@ -384,6 +384,7 @@ CheckRebalanceStateInvariants(const RebalanceState *state) Assert(shardCost->cost <= prevShardCost->cost); } totalCost += shardCost->cost; + prevShardCost = shardCost; } /* Check that utilization field is up to date. */ From f0043b64a17382e340bed5c61ec0025d0af34379 Mon Sep 17 00:00:00 2001 From: Karina <55838532+Green-Chan@users.noreply.github.com> Date: Thu, 7 Mar 2024 13:08:19 +0300 Subject: [PATCH 3/5] Fix server crash when trying to execute activate_node_snapshot() on a single-node cluster (#7552) This fixes #7551 reported by Egor Chindyaskin Function activate_node_snapshot() is not meant to be called on a cluster without worker nodes. This commit adds ERROR report for such case to prevent server crash. --- src/backend/distributed/test/metadata_sync.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/distributed/test/metadata_sync.c b/src/backend/distributed/test/metadata_sync.c index dec20c772..ce025cff9 100644 --- a/src/backend/distributed/test/metadata_sync.c +++ b/src/backend/distributed/test/metadata_sync.c @@ -50,6 +50,13 @@ activate_node_snapshot(PG_FUNCTION_ARGS) * so we are using first primary worker node just for test purposes. */ WorkerNode *dummyWorkerNode = GetFirstPrimaryWorkerNode(); + if (dummyWorkerNode == NULL) + { + ereport(ERROR, (errmsg("no worker nodes found"), + errdetail("Function activate_node_snapshot is meant to be " + "used when running tests on a multi-node cluster " + "with workers."))); + } /* * Create MetadataSyncContext which is used throughout nodes' activation. From 12f56438fc85a676a79b1a2886cc69ab872c9d14 Mon Sep 17 00:00:00 2001 From: copetol <40788226+copetol@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:21:42 +0300 Subject: [PATCH 4/5] Fix segfault when using certain DO block in function (#7554) When using a CASE WHEN expression in the body of the function that is used in the DO block, a segmentation fault occured. This fixes that. Fixes #7381 --------- Co-authored-by: Konstantin Morozov --- .../planner/function_call_delegation.c | 4 +++ .../expected/function_with_case_when.out | 32 +++++++++++++++++++ src/test/regress/multi_schedule | 1 + .../regress/sql/function_with_case_when.sql | 27 ++++++++++++++++ 4 files changed, 64 insertions(+) create mode 100644 src/test/regress/expected/function_with_case_when.out create mode 100644 src/test/regress/sql/function_with_case_when.sql diff --git a/src/backend/distributed/planner/function_call_delegation.c b/src/backend/distributed/planner/function_call_delegation.c index f17b02347..4a79dc25a 100644 --- a/src/backend/distributed/planner/function_call_delegation.c +++ b/src/backend/distributed/planner/function_call_delegation.c @@ -91,6 +91,10 @@ bool InDelegatedFunctionCall = false; static bool contain_param_walker(Node *node, void *context) { + if (node == NULL) + { + return false; + } if (IsA(node, Param)) { Param *paramNode = (Param *) node; diff --git a/src/test/regress/expected/function_with_case_when.out b/src/test/regress/expected/function_with_case_when.out new file mode 100644 index 000000000..18df5be0a --- /dev/null +++ b/src/test/regress/expected/function_with_case_when.out @@ -0,0 +1,32 @@ +CREATE SCHEMA function_with_case; +SET search_path TO function_with_case; +-- create function +CREATE OR REPLACE FUNCTION test_err(v1 text) + RETURNS text + LANGUAGE plpgsql + SECURITY DEFINER +AS $function$ + +begin + return v1 || ' - ok'; +END; +$function$; +do $$ declare + lNewValues text; + val text; +begin + val = 'test'; + lNewValues = test_err(v1 => case when val::text = 'test'::text then 'yes' else 'no' end); + raise notice 'lNewValues= %', lNewValues; +end;$$ ; +NOTICE: lNewValues= yes - ok +CONTEXT: PL/pgSQL function inline_code_block line XX at RAISE +-- call function +SELECT test_err('test'); + test_err +--------------------------------------------------------------------- + test - ok +(1 row) + +DROP SCHEMA function_with_case CASCADE; +NOTICE: drop cascades to function test_err(text) diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 3fec50aac..67a6e23a8 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -110,6 +110,7 @@ test: run_command_on_all_nodes test: background_task_queue_monitor test: other_databases grant_role_from_non_maindb role_operations_from_non_maindb seclabel_non_maindb test: citus_internal_access +test: function_with_case_when # Causal clock test test: clock diff --git a/src/test/regress/sql/function_with_case_when.sql b/src/test/regress/sql/function_with_case_when.sql new file mode 100644 index 000000000..03c6678e4 --- /dev/null +++ b/src/test/regress/sql/function_with_case_when.sql @@ -0,0 +1,27 @@ +CREATE SCHEMA function_with_case; +SET search_path TO function_with_case; + +-- create function +CREATE OR REPLACE FUNCTION test_err(v1 text) + RETURNS text + LANGUAGE plpgsql + SECURITY DEFINER +AS $function$ + +begin + return v1 || ' - ok'; +END; +$function$; +do $$ declare + lNewValues text; + val text; +begin + val = 'test'; + lNewValues = test_err(v1 => case when val::text = 'test'::text then 'yes' else 'no' end); + raise notice 'lNewValues= %', lNewValues; +end;$$ ; + +-- call function +SELECT test_err('test'); + +DROP SCHEMA function_with_case CASCADE; From 8afa2d0386ff303be29060b709f88994b9648b6c Mon Sep 17 00:00:00 2001 From: eaydingol <60466783+eaydingol@users.noreply.github.com> Date: Sun, 10 Mar 2024 10:20:08 +0300 Subject: [PATCH 5/5] Change the order in which the locks are acquired (#7542) This PR changes the order in which the locks are acquired (for the target and reference tables), when a modify request is initiated from a worker node that is not the "FirstWorkerNode". To prevent concurrent writes, locks are acquired on the first worker node for the replicated tables. When the update statement originates from the first worker node, it acquires the lock on the reference table(s) first, followed by the target table(s). However, if the update statement is initiated in another worker node, the lock requests are sent to the first worker in a different order. This PR unifies the modification order on the first worker node. With the third commit, independent of the node that received the request, the locks are acquired for the modified table and then the reference tables on the first node. The first commit shows a sample output for the test prior to the fix. Fixes #7477 --------- Co-authored-by: Jelte Fennema-Nio --- src/backend/distributed/utils/resource_lock.c | 28 ++++++--- src/test/regress/expected/issue_7477.out | 62 +++++++++++++++++++ src/test/regress/multi_schedule | 2 +- src/test/regress/sql/issue_7477.sql | 44 +++++++++++++ 4 files changed, 127 insertions(+), 9 deletions(-) create mode 100644 src/test/regress/expected/issue_7477.out create mode 100644 src/test/regress/sql/issue_7477.sql diff --git a/src/backend/distributed/utils/resource_lock.c b/src/backend/distributed/utils/resource_lock.c index 13e88a16e..8ac269e43 100644 --- a/src/backend/distributed/utils/resource_lock.c +++ b/src/backend/distributed/utils/resource_lock.c @@ -707,13 +707,27 @@ SerializeNonCommutativeWrites(List *shardIntervalList, LOCKMODE lockMode) } List *replicatedShardList = NIL; - if (AnyTableReplicated(shardIntervalList, &replicatedShardList)) - { - if (ClusterHasKnownMetadataWorkers() && !IsFirstWorkerNode()) - { - LockShardListResourcesOnFirstWorker(lockMode, replicatedShardList); - } + bool anyTableReplicated = AnyTableReplicated(shardIntervalList, &replicatedShardList); + /* + * Acquire locks on the modified table. + * If the table is replicated, the locks are first acquired on the first worker node then locally. + * But if we're already on the first worker, acquiring on the first worker node and locally are the same operation. + * So we only acquire locally in that case. + */ + if (anyTableReplicated && ClusterHasKnownMetadataWorkers() && !IsFirstWorkerNode()) + { + LockShardListResourcesOnFirstWorker(lockMode, replicatedShardList); + } + LockShardListResources(shardIntervalList, lockMode); + + /* + * Next, acquire locks on the reference tables that are referenced by a foreign key if there are any. + * Note that LockReferencedReferenceShardResources() first acquires locks on the first worker, + * then locally. + */ + if (anyTableReplicated) + { ShardInterval *firstShardInterval = (ShardInterval *) linitial(replicatedShardList); if (ReferenceTableShardId(firstShardInterval->shardId)) @@ -728,8 +742,6 @@ SerializeNonCommutativeWrites(List *shardIntervalList, LOCKMODE lockMode) LockReferencedReferenceShardResources(firstShardInterval->shardId, lockMode); } } - - LockShardListResources(shardIntervalList, lockMode); } diff --git a/src/test/regress/expected/issue_7477.out b/src/test/regress/expected/issue_7477.out new file mode 100644 index 000000000..224d85c6e --- /dev/null +++ b/src/test/regress/expected/issue_7477.out @@ -0,0 +1,62 @@ +--- Test for updating a table that has a foreign key reference to another reference table. +--- Issue #7477: Distributed deadlock after issuing a simple UPDATE statement +--- https://github.com/citusdata/citus/issues/7477 +CREATE TABLE table1 (id INT PRIMARY KEY); +SELECT create_reference_table('table1'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO table1 VALUES (1); +CREATE TABLE table2 ( + id INT, + info TEXT, + CONSTRAINT table1_id_fk FOREIGN KEY (id) REFERENCES table1 (id) + ); +SELECT create_reference_table('table2'); + create_reference_table +--------------------------------------------------------------------- + +(1 row) + +INSERT INTO table2 VALUES (1, 'test'); +--- Runs the update command in parallel on workers. +--- Due to bug #7477, before the fix, the result is non-deterministic +--- and have several rows of the form: +--- localhost | 57638 | f | ERROR: deadlock detected +--- localhost | 57637 | f | ERROR: deadlock detected +--- localhost | 57637 | f | ERROR: canceling the transaction since it was involved in a distributed deadlock +SELECT * FROM master_run_on_worker( + ARRAY['localhost', 'localhost','localhost', 'localhost','localhost', + 'localhost','localhost', 'localhost','localhost', 'localhost']::text[], + ARRAY[57638, 57637, 57637, 57638, 57637, 57638, 57637, 57638, 57638, 57637]::int[], + ARRAY['UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1' + ]::text[], + true); + node_name | node_port | success | result +--------------------------------------------------------------------- + localhost | 57638 | t | UPDATE 1 + localhost | 57637 | t | UPDATE 1 + localhost | 57637 | t | UPDATE 1 + localhost | 57638 | t | UPDATE 1 + localhost | 57637 | t | UPDATE 1 + localhost | 57638 | t | UPDATE 1 + localhost | 57637 | t | UPDATE 1 + localhost | 57638 | t | UPDATE 1 + localhost | 57638 | t | UPDATE 1 + localhost | 57637 | t | UPDATE 1 +(10 rows) + +--- cleanup +DROP TABLE table2; +DROP TABLE table1; diff --git a/src/test/regress/multi_schedule b/src/test/regress/multi_schedule index 67a6e23a8..af5921e60 100644 --- a/src/test/regress/multi_schedule +++ b/src/test/regress/multi_schedule @@ -103,7 +103,7 @@ test: multi_dropped_column_aliases foreign_key_restriction_enforcement test: binary_protocol test: alter_table_set_access_method test: alter_distributed_table -test: issue_5248 issue_5099 issue_5763 issue_6543 issue_6758 +test: issue_5248 issue_5099 issue_5763 issue_6543 issue_6758 issue_7477 test: object_propagation_debug test: undistribute_table test: run_command_on_all_nodes diff --git a/src/test/regress/sql/issue_7477.sql b/src/test/regress/sql/issue_7477.sql new file mode 100644 index 000000000..b9c1578e9 --- /dev/null +++ b/src/test/regress/sql/issue_7477.sql @@ -0,0 +1,44 @@ + +--- Test for updating a table that has a foreign key reference to another reference table. +--- Issue #7477: Distributed deadlock after issuing a simple UPDATE statement +--- https://github.com/citusdata/citus/issues/7477 + +CREATE TABLE table1 (id INT PRIMARY KEY); +SELECT create_reference_table('table1'); +INSERT INTO table1 VALUES (1); + +CREATE TABLE table2 ( + id INT, + info TEXT, + CONSTRAINT table1_id_fk FOREIGN KEY (id) REFERENCES table1 (id) + ); +SELECT create_reference_table('table2'); +INSERT INTO table2 VALUES (1, 'test'); + +--- Runs the update command in parallel on workers. +--- Due to bug #7477, before the fix, the result is non-deterministic +--- and have several rows of the form: +--- localhost | 57638 | f | ERROR: deadlock detected +--- localhost | 57637 | f | ERROR: deadlock detected +--- localhost | 57637 | f | ERROR: canceling the transaction since it was involved in a distributed deadlock + +SELECT * FROM master_run_on_worker( + ARRAY['localhost', 'localhost','localhost', 'localhost','localhost', + 'localhost','localhost', 'localhost','localhost', 'localhost']::text[], + ARRAY[57638, 57637, 57637, 57638, 57637, 57638, 57637, 57638, 57638, 57637]::int[], + ARRAY['UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1', + 'UPDATE table2 SET info = ''test_update'' WHERE id = 1' + ]::text[], + true); + +--- cleanup +DROP TABLE table2; +DROP TABLE table1;