Merge pull request #563 from citusdata/fix/345_task_cleanup_logic

make task cleanup logic less error prone
pull/1938/head
Murat Tuncer 2016-06-09 17:53:35 +03:00
commit c952a0e006
3 changed files with 95 additions and 57 deletions

View File

@ -32,6 +32,7 @@
#include "storage/fd.h" #include "storage/fd.h"
#include "utils/builtins.h" #include "utils/builtins.h"
#include "utils/hsearch.h" #include "utils/hsearch.h"
#include "utils/timestamp.h"
int MaxAssignTaskBatchSize = 64; /* maximum number of tasks to assign per round */ int MaxAssignTaskBatchSize = 64; /* maximum number of tasks to assign per round */
@ -2768,12 +2769,14 @@ TrackerHashCleanupJob(HTAB *taskTrackerHash, Task *jobCleanupTask)
{ {
uint64 jobId = jobCleanupTask->jobId; uint64 jobId = jobCleanupTask->jobId;
List *taskTrackerList = NIL; List *taskTrackerList = NIL;
ListCell *taskTrackerCell = NULL; List *remainingTaskTrackerList = NIL;
long sleepInterval = 0; const long timeoutDuration = 4000; /* milliseconds */
const int minimumSleepInterval = 150; const long statusCheckInterval = 10000; /* microseconds */
bool timedOut = false;
TimestampTz startTime = 0;
TaskTracker *taskTracker = NULL; TaskTracker *taskTracker = NULL;
HASH_SEQ_STATUS status; HASH_SEQ_STATUS status;
hash_seq_init(&status, taskTrackerHash); hash_seq_init(&status, taskTrackerHash);
/* walk over task trackers and try to issue job clean up requests */ /* walk over task trackers and try to issue job clean up requests */
@ -2821,15 +2824,33 @@ TrackerHashCleanupJob(HTAB *taskTrackerHash, Task *jobCleanupTask)
taskTracker = (TaskTracker *) hash_seq_search(&status); taskTracker = (TaskTracker *) hash_seq_search(&status);
} }
/* give task trackers time to finish their clean up jobs */ /* record the time when we start waiting for cleanup jobs to be sent */
sleepInterval = Max(minimumSleepInterval, RemoteTaskCheckInterval * 2) * 1000L; startTime = GetCurrentTimestamp();
pg_usleep(sleepInterval);
/* walk over task trackers to which we sent clean up requests */ /*
taskTrackerCell = NULL; * Walk over task trackers to which we sent clean up requests. Perform
foreach(taskTrackerCell, taskTrackerList) * these checks until it times out.
*
* We want to determine timedOut flag after the loop start to make sure
* we iterate one more time after time out occurs. This is necessary to report
* warning messages for timed out cleanup jobs.
*/
remainingTaskTrackerList = taskTrackerList;
while (list_length(remainingTaskTrackerList) > 0 && !timedOut)
{ {
TaskTracker *taskTracker = (TaskTracker *) lfirst(taskTrackerCell); List *activeTackTrackerList = remainingTaskTrackerList;
ListCell *activeTaskTrackerCell = NULL;
TimestampTz currentTime = 0;
remainingTaskTrackerList = NIL;
pg_usleep(statusCheckInterval);
currentTime = GetCurrentTimestamp();
timedOut = TimestampDifferenceExceeds(startTime, currentTime, timeoutDuration);
foreach(activeTaskTrackerCell, activeTackTrackerList)
{
TaskTracker *taskTracker = (TaskTracker *) lfirst(activeTaskTrackerCell);
int32 connectionId = taskTracker->connectionId; int32 connectionId = taskTracker->connectionId;
const char *nodeName = taskTracker->workerName; const char *nodeName = taskTracker->workerName;
uint32 nodePort = taskTracker->workerPort; uint32 nodePort = taskTracker->workerPort;
@ -2840,25 +2861,42 @@ TrackerHashCleanupJob(HTAB *taskTrackerHash, Task *jobCleanupTask)
QueryStatus queryStatus = MultiClientQueryStatus(connectionId); QueryStatus queryStatus = MultiClientQueryStatus(connectionId);
if (queryStatus == CLIENT_QUERY_DONE) if (queryStatus == CLIENT_QUERY_DONE)
{ {
ereport(DEBUG4, (errmsg("completed cleanup query for job " UINT64_FORMAT ereport(DEBUG4, (errmsg("completed cleanup query for job "
" on node \"%s:%u\"", jobId, nodeName, UINT64_FORMAT, jobId)));
nodePort)));
/* clear connection for future cleanup queries */ /* clear connection for future cleanup queries */
taskTracker->connectionBusy = false; taskTracker->connectionBusy = false;
} }
else else if (timedOut)
{ {
ereport(WARNING, (errmsg("could not receive response for cleanup query " ereport(WARNING, (errmsg("could not receive response for cleanup "
"for job " UINT64_FORMAT " on node \"%s:%u\"", "query status for job " UINT64_FORMAT " "
jobId, nodeName, nodePort))); "on node \"%s:%u\" with status %d", jobId,
} nodeName, nodePort, (int) queryStatus),
errhint("Manually clean job resources on node "
"\"%s:%u\" by running \"%s\" ", nodeName,
nodePort, jobCleanupTask->queryString)));
} }
else else
{
remainingTaskTrackerList = lappend(remainingTaskTrackerList,
taskTracker);
}
}
else if (timedOut)
{ {
ereport(WARNING, (errmsg("could not receive response for cleanup query " ereport(WARNING, (errmsg("could not receive response for cleanup query "
"for job " UINT64_FORMAT " on node \"%s:%u\"", "result for job " UINT64_FORMAT " on node "
jobId, nodeName, nodePort))); "\"%s:%u\" with status %d", jobId, nodeName,
nodePort, (int) resultStatus),
errhint("Manually clean job resources on node "
"\"%s:%u\" by running \"%s\" ", nodeName,
nodePort, jobCleanupTask->queryString)));
}
else
{
remainingTaskTrackerList = lappend(remainingTaskTrackerList, taskTracker);
}
} }
} }
} }

View File

@ -104,12 +104,12 @@ DETAIL: Creating dependency on merge taskId 13
DEBUG: assigned task 9 to node localhost:57637 DEBUG: assigned task 9 to node localhost:57637
DEBUG: assigned task 6 to node localhost:57638 DEBUG: assigned task 6 to node localhost:57638
DEBUG: assigned task 3 to node localhost:57637 DEBUG: assigned task 3 to node localhost:57637
DEBUG: completed cleanup query for job 1252 on node "localhost:57638" DEBUG: completed cleanup query for job 1252
DEBUG: completed cleanup query for job 1252 on node "localhost:57637" DEBUG: completed cleanup query for job 1252
DEBUG: completed cleanup query for job 1251 on node "localhost:57638" DEBUG: completed cleanup query for job 1251
DEBUG: completed cleanup query for job 1251 on node "localhost:57637" DEBUG: completed cleanup query for job 1251
DEBUG: completed cleanup query for job 1250 on node "localhost:57638" DEBUG: completed cleanup query for job 1250
DEBUG: completed cleanup query for job 1250 on node "localhost:57637" DEBUG: completed cleanup query for job 1250
DEBUG: CommitTransactionCommand DEBUG: CommitTransactionCommand
l_partkey | o_orderkey | count l_partkey | o_orderkey | count
-----------+------------+------- -----------+------------+-------
@ -221,12 +221,12 @@ DEBUG: assigned task 3 to node localhost:57638
DEBUG: assigned task 6 to node localhost:57637 DEBUG: assigned task 6 to node localhost:57637
DEBUG: assigned task 9 to node localhost:57638 DEBUG: assigned task 9 to node localhost:57638
DEBUG: assigned task 12 to node localhost:57637 DEBUG: assigned task 12 to node localhost:57637
DEBUG: completed cleanup query for job 1255 on node "localhost:57638" DEBUG: completed cleanup query for job 1255
DEBUG: completed cleanup query for job 1255 on node "localhost:57637" DEBUG: completed cleanup query for job 1255
DEBUG: completed cleanup query for job 1253 on node "localhost:57638" DEBUG: completed cleanup query for job 1253
DEBUG: completed cleanup query for job 1253 on node "localhost:57637" DEBUG: completed cleanup query for job 1253
DEBUG: completed cleanup query for job 1254 on node "localhost:57638" DEBUG: completed cleanup query for job 1254
DEBUG: completed cleanup query for job 1254 on node "localhost:57637" DEBUG: completed cleanup query for job 1254
DEBUG: CommitTransactionCommand DEBUG: CommitTransactionCommand
l_partkey | o_orderkey | count l_partkey | o_orderkey | count
-----------+------------+------- -----------+------------+-------

View File

@ -104,12 +104,12 @@ DETAIL: Creating dependency on merge taskId 13
DEBUG: assigned task 9 to node localhost:57637 DEBUG: assigned task 9 to node localhost:57637
DEBUG: assigned task 6 to node localhost:57638 DEBUG: assigned task 6 to node localhost:57638
DEBUG: assigned task 3 to node localhost:57637 DEBUG: assigned task 3 to node localhost:57637
DEBUG: completed cleanup query for job 1252 on node "localhost:57638" DEBUG: completed cleanup query for job 1252
DEBUG: completed cleanup query for job 1252 on node "localhost:57637" DEBUG: completed cleanup query for job 1252
DEBUG: completed cleanup query for job 1251 on node "localhost:57638" DEBUG: completed cleanup query for job 1251
DEBUG: completed cleanup query for job 1251 on node "localhost:57637" DEBUG: completed cleanup query for job 1251
DEBUG: completed cleanup query for job 1250 on node "localhost:57638" DEBUG: completed cleanup query for job 1250
DEBUG: completed cleanup query for job 1250 on node "localhost:57637" DEBUG: completed cleanup query for job 1250
DEBUG: CommitTransactionCommand DEBUG: CommitTransactionCommand
l_partkey | o_orderkey | count l_partkey | o_orderkey | count
-----------+------------+------- -----------+------------+-------
@ -221,12 +221,12 @@ DEBUG: assigned task 3 to node localhost:57638
DEBUG: assigned task 6 to node localhost:57637 DEBUG: assigned task 6 to node localhost:57637
DEBUG: assigned task 9 to node localhost:57638 DEBUG: assigned task 9 to node localhost:57638
DEBUG: assigned task 12 to node localhost:57637 DEBUG: assigned task 12 to node localhost:57637
DEBUG: completed cleanup query for job 1255 on node "localhost:57638" DEBUG: completed cleanup query for job 1255
DEBUG: completed cleanup query for job 1255 on node "localhost:57637" DEBUG: completed cleanup query for job 1255
DEBUG: completed cleanup query for job 1253 on node "localhost:57638" DEBUG: completed cleanup query for job 1253
DEBUG: completed cleanup query for job 1253 on node "localhost:57637" DEBUG: completed cleanup query for job 1253
DEBUG: completed cleanup query for job 1254 on node "localhost:57638" DEBUG: completed cleanup query for job 1254
DEBUG: completed cleanup query for job 1254 on node "localhost:57637" DEBUG: completed cleanup query for job 1254
DEBUG: CommitTransactionCommand DEBUG: CommitTransactionCommand
l_partkey | o_orderkey | count l_partkey | o_orderkey | count
-----------+------------+------- -----------+------------+-------