mirror of https://github.com/citusdata/citus.git
Merge pull request #4199 from citusdata/terminate_connection
Forcefully terminate connections after citus.node_connection_timeoutpull/4213/head
commit
f3962fc7f6
|
@ -609,6 +609,7 @@ static int CalculateNewConnectionCount(WorkerPool *workerPool);
|
||||||
static void OpenNewConnections(WorkerPool *workerPool, int newConnectionCount,
|
static void OpenNewConnections(WorkerPool *workerPool, int newConnectionCount,
|
||||||
TransactionProperties *transactionProperties);
|
TransactionProperties *transactionProperties);
|
||||||
static void CheckConnectionTimeout(WorkerPool *workerPool);
|
static void CheckConnectionTimeout(WorkerPool *workerPool);
|
||||||
|
static void MarkEstablishingSessionsTimedOut(WorkerPool *workerPool);
|
||||||
static int UsableConnectionCount(WorkerPool *workerPool);
|
static int UsableConnectionCount(WorkerPool *workerPool);
|
||||||
static long NextEventTimeout(DistributedExecution *execution);
|
static long NextEventTimeout(DistributedExecution *execution);
|
||||||
static WaitEventSet * BuildWaitEventSet(List *sessionList);
|
static WaitEventSet * BuildWaitEventSet(List *sessionList);
|
||||||
|
@ -1663,7 +1664,8 @@ CleanUpSessions(DistributedExecution *execution)
|
||||||
|
|
||||||
if (connection->connectionState == MULTI_CONNECTION_CONNECTING ||
|
if (connection->connectionState == MULTI_CONNECTION_CONNECTING ||
|
||||||
connection->connectionState == MULTI_CONNECTION_FAILED ||
|
connection->connectionState == MULTI_CONNECTION_FAILED ||
|
||||||
connection->connectionState == MULTI_CONNECTION_LOST)
|
connection->connectionState == MULTI_CONNECTION_LOST ||
|
||||||
|
connection->connectionState == MULTI_CONNECTION_TIMED_OUT)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* We want the MultiConnection go away and not used in
|
* We want the MultiConnection go away and not used in
|
||||||
|
@ -2704,6 +2706,16 @@ CheckConnectionTimeout(WorkerPool *workerPool)
|
||||||
"%s:%d after %u ms", workerPool->nodeName,
|
"%s:%d after %u ms", workerPool->nodeName,
|
||||||
workerPool->nodePort,
|
workerPool->nodePort,
|
||||||
NodeConnectionTimeout)));
|
NodeConnectionTimeout)));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We hit the connection timeout. In that case, we should not let the
|
||||||
|
* connection establishment to continue because the execution logic
|
||||||
|
* pretends that failed sessions are not going to be used anymore.
|
||||||
|
*
|
||||||
|
* That's why we mark the connection as timed out to trigger the state
|
||||||
|
* changes in the executor.
|
||||||
|
*/
|
||||||
|
MarkEstablishingSessionsTimedOut(workerPool);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -2714,6 +2726,28 @@ CheckConnectionTimeout(WorkerPool *workerPool)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* MarkEstablishingSessionsTimedOut goes over the sessions in the given
|
||||||
|
* workerPool and marks them timed out. ConnectionStateMachine()
|
||||||
|
* later cleans up the sessions.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
MarkEstablishingSessionsTimedOut(WorkerPool *workerPool)
|
||||||
|
{
|
||||||
|
WorkerSession *session = NULL;
|
||||||
|
foreach_ptr(session, workerPool->sessionList)
|
||||||
|
{
|
||||||
|
MultiConnection *connection = session->connection;
|
||||||
|
|
||||||
|
if (connection->connectionState == MULTI_CONNECTION_CONNECTING ||
|
||||||
|
connection->connectionState == MULTI_CONNECTION_INITIAL)
|
||||||
|
{
|
||||||
|
connection->connectionState = MULTI_CONNECTION_TIMED_OUT;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* UsableConnectionCount returns the number of connections in the worker pool
|
* UsableConnectionCount returns the number of connections in the worker pool
|
||||||
* that are (soon to be) usable for sending commands, this includes both idle
|
* that are (soon to be) usable for sending commands, this includes both idle
|
||||||
|
@ -2845,6 +2879,20 @@ ConnectionStateMachine(WorkerSession *session)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case MULTI_CONNECTION_TIMED_OUT:
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* When the connection timeout happens, the connection
|
||||||
|
* might still be able to successfuly established. However,
|
||||||
|
* the executor should not try to use this connection as
|
||||||
|
* the state machines might have already progressed and used
|
||||||
|
* new pools/sessions instead. That's why we terminate the
|
||||||
|
* connection, clear any state associated with it.
|
||||||
|
*/
|
||||||
|
connection->connectionState = MULTI_CONNECTION_FAILED;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case MULTI_CONNECTION_CONNECTING:
|
case MULTI_CONNECTION_CONNECTING:
|
||||||
{
|
{
|
||||||
ConnStatusType status = PQstatus(connection->pgConn);
|
ConnStatusType status = PQstatus(connection->pgConn);
|
||||||
|
@ -3988,8 +4036,14 @@ WorkerPoolFailed(WorkerPool *workerPool)
|
||||||
bool succeeded = false;
|
bool succeeded = false;
|
||||||
dlist_iter iter;
|
dlist_iter iter;
|
||||||
|
|
||||||
/* a pool cannot fail multiple times */
|
/*
|
||||||
Assert(!workerPool->failed);
|
* A pool cannot fail multiple times, the necessary actions
|
||||||
|
* has already be taken, so bail out.
|
||||||
|
*/
|
||||||
|
if (workerPool->failed)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
dlist_foreach(iter, &workerPool->pendingTaskQueue)
|
dlist_foreach(iter, &workerPool->pendingTaskQueue)
|
||||||
{
|
{
|
||||||
|
@ -4206,8 +4260,18 @@ ScheduleNextPlacementExecution(TaskPlacementExecution *placementExecution, bool
|
||||||
int nextPlacementExecutionIndex =
|
int nextPlacementExecutionIndex =
|
||||||
placementExecution->placementExecutionIndex + 1;
|
placementExecution->placementExecutionIndex + 1;
|
||||||
|
|
||||||
/* if all tasks failed then we should already have errored out */
|
/*
|
||||||
Assert(nextPlacementExecutionIndex < placementExecutionCount);
|
* If all tasks failed then we should already have errored out.
|
||||||
|
* Still, be defensive and throw error instead of crashes.
|
||||||
|
*/
|
||||||
|
if (nextPlacementExecutionIndex >= placementExecutionCount)
|
||||||
|
{
|
||||||
|
WorkerPool *workerPool = placementExecution->workerPool;
|
||||||
|
ereport(ERROR, (errmsg("execution cannot recover from multiple "
|
||||||
|
"connection failures. Last node failed "
|
||||||
|
"%s:%d", workerPool->nodeName,
|
||||||
|
workerPool->nodePort)));
|
||||||
|
}
|
||||||
|
|
||||||
/* get the next placement in the planning order */
|
/* get the next placement in the planning order */
|
||||||
nextPlacementExecution =
|
nextPlacementExecution =
|
||||||
|
|
|
@ -82,7 +82,8 @@ typedef enum MultiConnectionState
|
||||||
MULTI_CONNECTION_CONNECTING,
|
MULTI_CONNECTION_CONNECTING,
|
||||||
MULTI_CONNECTION_CONNECTED,
|
MULTI_CONNECTION_CONNECTED,
|
||||||
MULTI_CONNECTION_FAILED,
|
MULTI_CONNECTION_FAILED,
|
||||||
MULTI_CONNECTION_LOST
|
MULTI_CONNECTION_LOST,
|
||||||
|
MULTI_CONNECTION_TIMED_OUT
|
||||||
} MultiConnectionState;
|
} MultiConnectionState;
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue