From d127516dc8fabf73116e3fb147645e3f7a3850ab Mon Sep 17 00:00:00 2001 From: Nils Dijk Date: Mon, 25 Jan 2021 15:55:04 +0100 Subject: [PATCH] Mitigate segfault in connection statemachine (#4551) As described in the comment, we have observed crashes in production due to a segfault caused by the dereference of a NULL pointer in our connection statemachine. As a mitigation that prevents system crashes, we raise an error with a short explanation of the issue. Unfortunately, we cannot reliably reproduce the case yet, hence the inability to add tests. DESCRIPTION: Prevent segfaults when SAVEPOINT handling cannot recover from connection failures --- .../distributed/executor/adaptive_executor.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/backend/distributed/executor/adaptive_executor.c b/src/backend/distributed/executor/adaptive_executor.c index 8ad9d7e11..4e4292f8f 100644 --- a/src/backend/distributed/executor/adaptive_executor.c +++ b/src/backend/distributed/executor/adaptive_executor.c @@ -3377,6 +3377,25 @@ TransactionStateMachine(WorkerSession *session) case REMOTE_TRANS_SENT_COMMAND: { TaskPlacementExecution *placementExecution = session->currentTask; + if (placementExecution == NULL) + { + /* + * We have seen accounts in production where the placementExecution + * could inadvertently be not set. Investigation documented on + * https://github.com/citusdata/citus-enterprise/issues/493 + * (due to sensitive data in the initial report it is not discussed + * in our community repository) + * + * Currently we don't have a reliable way of reproducing this issue. + * Erroring here seems to be a more desirable approach compared to a + * SEGFAULT on the dereference of placementExecution, with a possible + * crash recovery as a result. 
+ */ + ereport(ERROR, (errmsg( + "unable to recover from inconsistent state in " + "the connection state machine on coordinator"))); + } + ShardCommandExecution *shardCommandExecution = placementExecution->shardCommandExecution; Task *task = shardCommandExecution->task;