mirror of https://github.com/citusdata/citus.git
Merge pull request #563 from citusdata/fix/345_task_cleanup_logic
make task cleanup logic less error pronepull/1938/head
commit
c952a0e006
|
@ -32,6 +32,7 @@
|
||||||
#include "storage/fd.h"
|
#include "storage/fd.h"
|
||||||
#include "utils/builtins.h"
|
#include "utils/builtins.h"
|
||||||
#include "utils/hsearch.h"
|
#include "utils/hsearch.h"
|
||||||
|
#include "utils/timestamp.h"
|
||||||
|
|
||||||
|
|
||||||
int MaxAssignTaskBatchSize = 64; /* maximum number of tasks to assign per round */
|
int MaxAssignTaskBatchSize = 64; /* maximum number of tasks to assign per round */
|
||||||
|
@ -2768,12 +2769,14 @@ TrackerHashCleanupJob(HTAB *taskTrackerHash, Task *jobCleanupTask)
|
||||||
{
|
{
|
||||||
uint64 jobId = jobCleanupTask->jobId;
|
uint64 jobId = jobCleanupTask->jobId;
|
||||||
List *taskTrackerList = NIL;
|
List *taskTrackerList = NIL;
|
||||||
ListCell *taskTrackerCell = NULL;
|
List *remainingTaskTrackerList = NIL;
|
||||||
long sleepInterval = 0;
|
const long timeoutDuration = 4000; /* milliseconds */
|
||||||
const int minimumSleepInterval = 150;
|
const long statusCheckInterval = 10000; /* microseconds */
|
||||||
|
bool timedOut = false;
|
||||||
|
TimestampTz startTime = 0;
|
||||||
TaskTracker *taskTracker = NULL;
|
TaskTracker *taskTracker = NULL;
|
||||||
HASH_SEQ_STATUS status;
|
HASH_SEQ_STATUS status;
|
||||||
|
|
||||||
hash_seq_init(&status, taskTrackerHash);
|
hash_seq_init(&status, taskTrackerHash);
|
||||||
|
|
||||||
/* walk over task trackers and try to issue job clean up requests */
|
/* walk over task trackers and try to issue job clean up requests */
|
||||||
|
@ -2821,15 +2824,33 @@ TrackerHashCleanupJob(HTAB *taskTrackerHash, Task *jobCleanupTask)
|
||||||
taskTracker = (TaskTracker *) hash_seq_search(&status);
|
taskTracker = (TaskTracker *) hash_seq_search(&status);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* give task trackers time to finish their clean up jobs */
|
/* record the time when we start waiting for cleanup jobs to be sent */
|
||||||
sleepInterval = Max(minimumSleepInterval, RemoteTaskCheckInterval * 2) * 1000L;
|
startTime = GetCurrentTimestamp();
|
||||||
pg_usleep(sleepInterval);
|
|
||||||
|
|
||||||
/* walk over task trackers to which we sent clean up requests */
|
/*
|
||||||
taskTrackerCell = NULL;
|
* Walk over task trackers to which we sent clean up requests. Perform
|
||||||
foreach(taskTrackerCell, taskTrackerList)
|
* these checks until it times out.
|
||||||
|
*
|
||||||
|
* We want to determine timedOut flag after the loop start to make sure
|
||||||
|
* we iterate one more time after time out occurs. This is necessary to report
|
||||||
|
* warning messages for timed out cleanup jobs.
|
||||||
|
*/
|
||||||
|
remainingTaskTrackerList = taskTrackerList;
|
||||||
|
while (list_length(remainingTaskTrackerList) > 0 && !timedOut)
|
||||||
{
|
{
|
||||||
TaskTracker *taskTracker = (TaskTracker *) lfirst(taskTrackerCell);
|
List *activeTackTrackerList = remainingTaskTrackerList;
|
||||||
|
ListCell *activeTaskTrackerCell = NULL;
|
||||||
|
TimestampTz currentTime = 0;
|
||||||
|
|
||||||
|
remainingTaskTrackerList = NIL;
|
||||||
|
|
||||||
|
pg_usleep(statusCheckInterval);
|
||||||
|
currentTime = GetCurrentTimestamp();
|
||||||
|
timedOut = TimestampDifferenceExceeds(startTime, currentTime, timeoutDuration);
|
||||||
|
|
||||||
|
foreach(activeTaskTrackerCell, activeTackTrackerList)
|
||||||
|
{
|
||||||
|
TaskTracker *taskTracker = (TaskTracker *) lfirst(activeTaskTrackerCell);
|
||||||
int32 connectionId = taskTracker->connectionId;
|
int32 connectionId = taskTracker->connectionId;
|
||||||
const char *nodeName = taskTracker->workerName;
|
const char *nodeName = taskTracker->workerName;
|
||||||
uint32 nodePort = taskTracker->workerPort;
|
uint32 nodePort = taskTracker->workerPort;
|
||||||
|
@ -2840,25 +2861,42 @@ TrackerHashCleanupJob(HTAB *taskTrackerHash, Task *jobCleanupTask)
|
||||||
QueryStatus queryStatus = MultiClientQueryStatus(connectionId);
|
QueryStatus queryStatus = MultiClientQueryStatus(connectionId);
|
||||||
if (queryStatus == CLIENT_QUERY_DONE)
|
if (queryStatus == CLIENT_QUERY_DONE)
|
||||||
{
|
{
|
||||||
ereport(DEBUG4, (errmsg("completed cleanup query for job " UINT64_FORMAT
|
ereport(DEBUG4, (errmsg("completed cleanup query for job "
|
||||||
" on node \"%s:%u\"", jobId, nodeName,
|
UINT64_FORMAT, jobId)));
|
||||||
nodePort)));
|
|
||||||
|
|
||||||
/* clear connection for future cleanup queries */
|
/* clear connection for future cleanup queries */
|
||||||
taskTracker->connectionBusy = false;
|
taskTracker->connectionBusy = false;
|
||||||
}
|
}
|
||||||
else
|
else if (timedOut)
|
||||||
{
|
{
|
||||||
ereport(WARNING, (errmsg("could not receive response for cleanup query "
|
ereport(WARNING, (errmsg("could not receive response for cleanup "
|
||||||
"for job " UINT64_FORMAT " on node \"%s:%u\"",
|
"query status for job " UINT64_FORMAT " "
|
||||||
jobId, nodeName, nodePort)));
|
"on node \"%s:%u\" with status %d", jobId,
|
||||||
}
|
nodeName, nodePort, (int) queryStatus),
|
||||||
|
errhint("Manually clean job resources on node "
|
||||||
|
"\"%s:%u\" by running \"%s\" ", nodeName,
|
||||||
|
nodePort, jobCleanupTask->queryString)));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
|
remainingTaskTrackerList = lappend(remainingTaskTrackerList,
|
||||||
|
taskTracker);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (timedOut)
|
||||||
{
|
{
|
||||||
ereport(WARNING, (errmsg("could not receive response for cleanup query "
|
ereport(WARNING, (errmsg("could not receive response for cleanup query "
|
||||||
"for job " UINT64_FORMAT " on node \"%s:%u\"",
|
"result for job " UINT64_FORMAT " on node "
|
||||||
jobId, nodeName, nodePort)));
|
"\"%s:%u\" with status %d", jobId, nodeName,
|
||||||
|
nodePort, (int) resultStatus),
|
||||||
|
errhint("Manually clean job resources on node "
|
||||||
|
"\"%s:%u\" by running \"%s\" ", nodeName,
|
||||||
|
nodePort, jobCleanupTask->queryString)));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
remainingTaskTrackerList = lappend(remainingTaskTrackerList, taskTracker);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -104,12 +104,12 @@ DETAIL: Creating dependency on merge taskId 13
|
||||||
DEBUG: assigned task 9 to node localhost:57637
|
DEBUG: assigned task 9 to node localhost:57637
|
||||||
DEBUG: assigned task 6 to node localhost:57638
|
DEBUG: assigned task 6 to node localhost:57638
|
||||||
DEBUG: assigned task 3 to node localhost:57637
|
DEBUG: assigned task 3 to node localhost:57637
|
||||||
DEBUG: completed cleanup query for job 1252 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1252
|
||||||
DEBUG: completed cleanup query for job 1252 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1252
|
||||||
DEBUG: completed cleanup query for job 1251 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1251
|
||||||
DEBUG: completed cleanup query for job 1251 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1251
|
||||||
DEBUG: completed cleanup query for job 1250 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1250
|
||||||
DEBUG: completed cleanup query for job 1250 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1250
|
||||||
DEBUG: CommitTransactionCommand
|
DEBUG: CommitTransactionCommand
|
||||||
l_partkey | o_orderkey | count
|
l_partkey | o_orderkey | count
|
||||||
-----------+------------+-------
|
-----------+------------+-------
|
||||||
|
@ -221,12 +221,12 @@ DEBUG: assigned task 3 to node localhost:57638
|
||||||
DEBUG: assigned task 6 to node localhost:57637
|
DEBUG: assigned task 6 to node localhost:57637
|
||||||
DEBUG: assigned task 9 to node localhost:57638
|
DEBUG: assigned task 9 to node localhost:57638
|
||||||
DEBUG: assigned task 12 to node localhost:57637
|
DEBUG: assigned task 12 to node localhost:57637
|
||||||
DEBUG: completed cleanup query for job 1255 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1255
|
||||||
DEBUG: completed cleanup query for job 1255 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1255
|
||||||
DEBUG: completed cleanup query for job 1253 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1253
|
||||||
DEBUG: completed cleanup query for job 1253 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1253
|
||||||
DEBUG: completed cleanup query for job 1254 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1254
|
||||||
DEBUG: completed cleanup query for job 1254 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1254
|
||||||
DEBUG: CommitTransactionCommand
|
DEBUG: CommitTransactionCommand
|
||||||
l_partkey | o_orderkey | count
|
l_partkey | o_orderkey | count
|
||||||
-----------+------------+-------
|
-----------+------------+-------
|
||||||
|
|
|
@ -104,12 +104,12 @@ DETAIL: Creating dependency on merge taskId 13
|
||||||
DEBUG: assigned task 9 to node localhost:57637
|
DEBUG: assigned task 9 to node localhost:57637
|
||||||
DEBUG: assigned task 6 to node localhost:57638
|
DEBUG: assigned task 6 to node localhost:57638
|
||||||
DEBUG: assigned task 3 to node localhost:57637
|
DEBUG: assigned task 3 to node localhost:57637
|
||||||
DEBUG: completed cleanup query for job 1252 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1252
|
||||||
DEBUG: completed cleanup query for job 1252 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1252
|
||||||
DEBUG: completed cleanup query for job 1251 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1251
|
||||||
DEBUG: completed cleanup query for job 1251 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1251
|
||||||
DEBUG: completed cleanup query for job 1250 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1250
|
||||||
DEBUG: completed cleanup query for job 1250 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1250
|
||||||
DEBUG: CommitTransactionCommand
|
DEBUG: CommitTransactionCommand
|
||||||
l_partkey | o_orderkey | count
|
l_partkey | o_orderkey | count
|
||||||
-----------+------------+-------
|
-----------+------------+-------
|
||||||
|
@ -221,12 +221,12 @@ DEBUG: assigned task 3 to node localhost:57638
|
||||||
DEBUG: assigned task 6 to node localhost:57637
|
DEBUG: assigned task 6 to node localhost:57637
|
||||||
DEBUG: assigned task 9 to node localhost:57638
|
DEBUG: assigned task 9 to node localhost:57638
|
||||||
DEBUG: assigned task 12 to node localhost:57637
|
DEBUG: assigned task 12 to node localhost:57637
|
||||||
DEBUG: completed cleanup query for job 1255 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1255
|
||||||
DEBUG: completed cleanup query for job 1255 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1255
|
||||||
DEBUG: completed cleanup query for job 1253 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1253
|
||||||
DEBUG: completed cleanup query for job 1253 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1253
|
||||||
DEBUG: completed cleanup query for job 1254 on node "localhost:57638"
|
DEBUG: completed cleanup query for job 1254
|
||||||
DEBUG: completed cleanup query for job 1254 on node "localhost:57637"
|
DEBUG: completed cleanup query for job 1254
|
||||||
DEBUG: CommitTransactionCommand
|
DEBUG: CommitTransactionCommand
|
||||||
l_partkey | o_orderkey | count
|
l_partkey | o_orderkey | count
|
||||||
-----------+------------+-------
|
-----------+------------+-------
|
||||||
|
|
Loading…
Reference in New Issue