mirror of https://github.com/citusdata/citus.git
Add running state to rebalance job with pid reported
parent
3cf14ee816
commit
836950ff07
|
@ -145,6 +145,7 @@ typedef struct MetadataCacheData
|
||||||
Oid distRebalanceJobsJobsIndexId;
|
Oid distRebalanceJobsJobsIndexId;
|
||||||
Oid distRebalanceJobsStatusJobsIndexId;
|
Oid distRebalanceJobsStatusJobsIndexId;
|
||||||
Oid jobStatusScheduledId;
|
Oid jobStatusScheduledId;
|
||||||
|
Oid jobStatusRunningId;
|
||||||
Oid jobStatusDoneId;
|
Oid jobStatusDoneId;
|
||||||
Oid jobStatusErrorId;
|
Oid jobStatusErrorId;
|
||||||
Oid distRebalanceStrategyRelationId;
|
Oid distRebalanceStrategyRelationId;
|
||||||
|
@ -3135,6 +3136,19 @@ JobStatusScheduledId(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * JobStatusRunningId returns MetadataCache.jobStatusRunningId, the cached Oid
 * for the "running" label of the "citus_job_status" enum. The cache field is
 * filled on first use with the result of
 * LookupStringEnumValueId("citus_job_status", "running"); later calls return
 * the cached value without repeating the lookup.
 */
Oid
|
||||||
|
JobStatusRunningId(void)
|
||||||
|
{
|
||||||
|
/* a zero cache field is treated as "not looked up yet" */
if (!MetadataCache.jobStatusRunningId)
|
||||||
|
{
|
||||||
|
MetadataCache.jobStatusRunningId =
|
||||||
|
LookupStringEnumValueId("citus_job_status", "running");
|
||||||
|
}
|
||||||
|
|
||||||
|
return MetadataCache.jobStatusRunningId;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
Oid
|
Oid
|
||||||
JobStatusDoneId(void)
|
JobStatusDoneId(void)
|
||||||
{
|
{
|
||||||
|
|
|
@ -2230,23 +2230,35 @@ HasScheduledRebalanceJobs()
|
||||||
Relation pgDistRebalanceJobs = table_open(DistRebalanceJobsRelationId(),
|
Relation pgDistRebalanceJobs = table_open(DistRebalanceJobsRelationId(),
|
||||||
AccessShareLock);
|
AccessShareLock);
|
||||||
|
|
||||||
/* pg_dist_rebalance_jobs.status == 'scheduled' */
|
/* find any job in states listed here */
|
||||||
ScanKeyInit(&scanKey[0], Anum_pg_dist_rebalance_jobs_status,
|
RebalanceJobStatus jobs[] = {
|
||||||
BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(JobStatusScheduledId()));
|
REBALANCE_JOB_STATUS_RUNNING,
|
||||||
|
REBALANCE_JOB_STATUS_SCHEDULED
|
||||||
SysScanDesc scanDescriptor = systable_beginscan(pgDistRebalanceJobs,
|
};
|
||||||
DistRebalanceJobsStatusJobsIdIndexId(),
|
|
||||||
indexOK, NULL, scanKeyCount, scanKey);
|
|
||||||
|
|
||||||
HeapTuple jobTuple = systable_getnext(scanDescriptor);
|
|
||||||
|
|
||||||
bool hasScheduledJob = false;
|
bool hasScheduledJob = false;
|
||||||
|
for (int i = 0; !hasScheduledJob && i < sizeof(jobs) / sizeof(jobs[0]); i++)
|
||||||
|
{
|
||||||
|
/* pg_dist_rebalance_jobs.status == jobs[i] */
|
||||||
|
ScanKeyInit(&scanKey[0], Anum_pg_dist_rebalance_jobs_status,
|
||||||
|
BTEqualStrategyNumber, F_OIDEQ,
|
||||||
|
ObjectIdGetDatum(RebalanceJobStatusOid(jobs[i])));
|
||||||
|
|
||||||
|
SysScanDesc scanDescriptor = systable_beginscan(
|
||||||
|
pgDistRebalanceJobs,
|
||||||
|
DistRebalanceJobsStatusJobsIdIndexId(),
|
||||||
|
indexOK, NULL, scanKeyCount,
|
||||||
|
scanKey);
|
||||||
|
|
||||||
|
HeapTuple jobTuple = systable_getnext(scanDescriptor);
|
||||||
if (HeapTupleIsValid(jobTuple))
|
if (HeapTupleIsValid(jobTuple))
|
||||||
{
|
{
|
||||||
hasScheduledJob = true;
|
hasScheduledJob = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
systable_endscan(scanDescriptor);
|
systable_endscan(scanDescriptor);
|
||||||
|
}
|
||||||
|
|
||||||
table_close(pgDistRebalanceJobs, AccessShareLock);
|
table_close(pgDistRebalanceJobs, AccessShareLock);
|
||||||
|
|
||||||
return hasScheduledJob;
|
return hasScheduledJob;
|
||||||
|
@ -2264,6 +2276,10 @@ RebalanceJobStatusByOid(Oid enumOid)
|
||||||
{
|
{
|
||||||
return REBALANCE_JOB_STATUS_SCHEDULED;
|
return REBALANCE_JOB_STATUS_SCHEDULED;
|
||||||
}
|
}
|
||||||
|
else if (enumOid == JobStatusRunningId())
|
||||||
|
{
|
||||||
|
return REBALANCE_JOB_STATUS_RUNNING;
|
||||||
|
}
|
||||||
else if (enumOid == JobStatusErrorId())
|
else if (enumOid == JobStatusErrorId())
|
||||||
{
|
{
|
||||||
return REBALANCE_JOB_STATUS_ERROR;
|
return REBALANCE_JOB_STATUS_ERROR;
|
||||||
|
@ -2292,7 +2308,7 @@ IsRebalanceJobStatusTerminal(RebalanceJobStatus status)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static Oid
|
Oid
|
||||||
RebalanceJobStatusOid(RebalanceJobStatus status)
|
RebalanceJobStatusOid(RebalanceJobStatus status)
|
||||||
{
|
{
|
||||||
switch (status)
|
switch (status)
|
||||||
|
@ -2302,6 +2318,11 @@ RebalanceJobStatusOid(RebalanceJobStatus status)
|
||||||
return JobStatusScheduledId();
|
return JobStatusScheduledId();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case REBALANCE_JOB_STATUS_RUNNING:
|
||||||
|
{
|
||||||
|
return JobStatusRunningId();
|
||||||
|
}
|
||||||
|
|
||||||
case REBALANCE_JOB_STATUS_DONE:
|
case REBALANCE_JOB_STATUS_DONE:
|
||||||
{
|
{
|
||||||
return JobStatusDoneId();
|
return JobStatusDoneId();
|
||||||
|
@ -2385,16 +2406,25 @@ GetScheduledRebalanceJob(void)
|
||||||
Relation pgDistRebalanceJobs = table_open(DistRebalanceJobsRelationId(),
|
Relation pgDistRebalanceJobs = table_open(DistRebalanceJobsRelationId(),
|
||||||
AccessShareLock);
|
AccessShareLock);
|
||||||
|
|
||||||
/* pg_dist_rebalance_jobs.status == 'scheduled' */
|
RebalanceJobStatus jobStatus[] = {
|
||||||
|
REBALANCE_JOB_STATUS_RUNNING,
|
||||||
|
REBALANCE_JOB_STATUS_SCHEDULED
|
||||||
|
};
|
||||||
|
|
||||||
|
RebalanceJob *job = NULL;
|
||||||
|
for (int i = 0; !job && i < sizeof(jobStatus) / sizeof(jobStatus[0]); i++)
|
||||||
|
{
|
||||||
|
/* pg_dist_rebalance_jobs.status == jobStatus[i] */
|
||||||
ScanKeyInit(&scanKey[0], Anum_pg_dist_rebalance_jobs_status,
|
ScanKeyInit(&scanKey[0], Anum_pg_dist_rebalance_jobs_status,
|
||||||
BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(JobStatusScheduledId()));
|
BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(
|
||||||
|
RebalanceJobStatusOid(jobStatus[i])));
|
||||||
|
|
||||||
SysScanDesc scanDescriptor = systable_beginscan(pgDistRebalanceJobs,
|
SysScanDesc scanDescriptor = systable_beginscan(pgDistRebalanceJobs,
|
||||||
DistRebalanceJobsStatusJobsIdIndexId(),
|
DistRebalanceJobsStatusJobsIdIndexId(),
|
||||||
indexOK, NULL, scanKeyCount, scanKey);
|
indexOK, NULL, scanKeyCount,
|
||||||
|
scanKey);
|
||||||
|
|
||||||
HeapTuple jobTuple = systable_getnext(scanDescriptor);
|
HeapTuple jobTuple = systable_getnext(scanDescriptor);
|
||||||
RebalanceJob *job = NULL;
|
|
||||||
if (HeapTupleIsValid(jobTuple))
|
if (HeapTupleIsValid(jobTuple))
|
||||||
{
|
{
|
||||||
Form_pg_dist_rebalance_job jobData = NULL;
|
Form_pg_dist_rebalance_job jobData = NULL;
|
||||||
|
@ -2415,6 +2445,8 @@ GetScheduledRebalanceJob(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
systable_endscan(scanDescriptor);
|
systable_endscan(scanDescriptor);
|
||||||
|
}
|
||||||
|
|
||||||
table_close(pgDistRebalanceJobs, AccessShareLock);
|
table_close(pgDistRebalanceJobs, AccessShareLock);
|
||||||
|
|
||||||
return job;
|
return job;
|
||||||
|
@ -2422,7 +2454,7 @@ GetScheduledRebalanceJob(void)
|
||||||
|
|
||||||
|
|
||||||
RebalanceJob *
|
RebalanceJob *
|
||||||
GetScheduledRebalanceJobyJobID(int64 jobId)
|
GetScheduledRebalanceJobByJobID(int64 jobId)
|
||||||
{
|
{
|
||||||
const int scanKeyCount = 1;
|
const int scanKeyCount = 1;
|
||||||
ScanKeyData scanKey[1];
|
ScanKeyData scanKey[1];
|
||||||
|
@ -2503,6 +2535,21 @@ UpdateJobStatus(RebalanceJob *job, RebalanceJobStatus newStatus)
|
||||||
isnull[Anum_pg_dist_rebalance_jobs_status - 1] = false;
|
isnull[Anum_pg_dist_rebalance_jobs_status - 1] = false;
|
||||||
replace[Anum_pg_dist_rebalance_jobs_status - 1] = true;
|
replace[Anum_pg_dist_rebalance_jobs_status - 1] = true;
|
||||||
|
|
||||||
|
/* TODO: figure out a clean way to update only selected attributes of a tuple */
|
||||||
|
if (newStatus == REBALANCE_JOB_STATUS_RUNNING)
|
||||||
|
{
|
||||||
|
/* update pid for running status */
|
||||||
|
values[Anum_pg_dist_rebalance_jobs_pid - 1] = Int32GetDatum((int32) MyProcPid);
|
||||||
|
isnull[Anum_pg_dist_rebalance_jobs_pid - 1] = false;
|
||||||
|
replace[Anum_pg_dist_rebalance_jobs_pid - 1] = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
values[Anum_pg_dist_rebalance_jobs_pid - 1] = 0;
|
||||||
|
isnull[Anum_pg_dist_rebalance_jobs_pid - 1] = true;
|
||||||
|
replace[Anum_pg_dist_rebalance_jobs_pid - 1] = true;
|
||||||
|
}
|
||||||
|
|
||||||
heapTuple = heap_modify_tuple(heapTuple, tupleDescriptor, values, isnull, replace);
|
heapTuple = heap_modify_tuple(heapTuple, tupleDescriptor, values, isnull, replace);
|
||||||
|
|
||||||
CatalogTupleUpdate(pgDistRebalanceJobs, &heapTuple->t_self, heapTuple);
|
CatalogTupleUpdate(pgDistRebalanceJobs, &heapTuple->t_self, heapTuple);
|
||||||
|
|
|
@ -77,11 +77,12 @@ DROP FUNCTION pg_catalog.get_all_active_transactions(OUT datid oid, OUT process_
|
||||||
DROP FUNCTION pg_catalog.isolate_tenant_to_new_shard(table_name regclass, tenant_id "any", cascade_option text);
|
DROP FUNCTION pg_catalog.isolate_tenant_to_new_shard(table_name regclass, tenant_id "any", cascade_option text);
|
||||||
#include "udfs/isolate_tenant_to_new_shard/11.1-1.sql"
|
#include "udfs/isolate_tenant_to_new_shard/11.1-1.sql"
|
||||||
|
|
||||||
CREATE TYPE citus.citus_job_status AS ENUM ('scheduled', 'done', 'error');
|
CREATE TYPE citus.citus_job_status AS ENUM ('scheduled', 'running', 'done', 'error');
|
||||||
ALTER TYPE citus.citus_job_status SET SCHEMA pg_catalog;
|
ALTER TYPE citus.citus_job_status SET SCHEMA pg_catalog;
|
||||||
|
|
||||||
CREATE TABLE citus.pg_dist_rebalance_jobs(
|
CREATE TABLE citus.pg_dist_rebalance_jobs(
|
||||||
jobid bigserial NOT NULL,
|
jobid bigserial NOT NULL,
|
||||||
|
pid integer,
|
||||||
status pg_catalog.citus_job_status default 'scheduled' NOT NULL,
|
status pg_catalog.citus_job_status default 'scheduled' NOT NULL,
|
||||||
command text NOT NULL,
|
command text NOT NULL,
|
||||||
retry_count integer,
|
retry_count integer,
|
||||||
|
|
|
@ -858,9 +858,18 @@ RebalanceJobsBackgroundWorkerMain(Datum arg)
|
||||||
|
|
||||||
/* pg_usleep(30 * 1000 * 1000); */
|
/* pg_usleep(30 * 1000 * 1000); */
|
||||||
|
|
||||||
|
MemoryContext perJobContext = AllocSetContextCreateExtended(CurrentMemoryContext,
|
||||||
|
"PerJobContext",
|
||||||
|
ALLOCSET_DEFAULT_MINSIZE,
|
||||||
|
ALLOCSET_DEFAULT_INITSIZE,
|
||||||
|
ALLOCSET_DEFAULT_MAXSIZE);
|
||||||
|
|
||||||
|
MemoryContext oldContextPerJob = MemoryContextSwitchTo(perJobContext);
|
||||||
bool hasJobs = true;
|
bool hasJobs = true;
|
||||||
while (hasJobs)
|
while (hasJobs)
|
||||||
{
|
{
|
||||||
|
MemoryContextReset(perJobContext);
|
||||||
|
|
||||||
CHECK_FOR_INTERRUPTS();
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
||||||
InvalidateMetadataSystemCache();
|
InvalidateMetadataSystemCache();
|
||||||
|
@ -875,11 +884,26 @@ RebalanceJobsBackgroundWorkerMain(Datum arg)
|
||||||
}
|
}
|
||||||
else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded())
|
else if (CheckCitusVersion(DEBUG1) && CitusHasBeenLoaded())
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* We need to load the job into the perJobContext as we will switch contexts
|
||||||
|
* later due to the committing and starting of new transactions
|
||||||
|
*/
|
||||||
|
MemoryContext oldContext = MemoryContextSwitchTo(perJobContext);
|
||||||
RebalanceJob *job = GetScheduledRebalanceJob();
|
RebalanceJob *job = GetScheduledRebalanceJob();
|
||||||
|
MemoryContextSwitchTo(oldContext);
|
||||||
|
|
||||||
if (job)
|
if (job)
|
||||||
{
|
{
|
||||||
ereport(LOG, (errmsg("found job with jobid: %ld", job->jobid)));
|
ereport(LOG, (errmsg("found job with jobid: %ld", job->jobid)));
|
||||||
MemoryContext savedContext = CurrentMemoryContext;
|
MemoryContext savedContext = CurrentMemoryContext;
|
||||||
|
|
||||||
|
UpdateJobStatus(job, REBALANCE_JOB_STATUS_RUNNING);
|
||||||
|
PopActiveSnapshot();
|
||||||
|
CommitTransactionCommand();
|
||||||
|
|
||||||
|
StartTransactionCommand();
|
||||||
|
PushActiveSnapshot(GetTransactionSnapshot());
|
||||||
|
|
||||||
BeginInternalSubTransaction(NULL);
|
BeginInternalSubTransaction(NULL);
|
||||||
|
|
||||||
PG_TRY();
|
PG_TRY();
|
||||||
|
@ -919,6 +943,8 @@ RebalanceJobsBackgroundWorkerMain(Datum arg)
|
||||||
CommitTransactionCommand();
|
CommitTransactionCommand();
|
||||||
ProcessCompletedNotifies();
|
ProcessCompletedNotifies();
|
||||||
}
|
}
|
||||||
|
MemoryContextSwitchTo(oldContextPerJob);
|
||||||
|
MemoryContextDelete(perJobContext);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ citus_wait_for_rebalance_job(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
CHECK_FOR_INTERRUPTS();
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
||||||
RebalanceJob *job = GetScheduledRebalanceJobyJobID(jobid);
|
RebalanceJob *job = GetScheduledRebalanceJobByJobID(jobid);
|
||||||
if (!job)
|
if (!job)
|
||||||
{
|
{
|
||||||
ereport(ERROR, (errmsg("unkown job with jobid: %ld", jobid)));
|
ereport(ERROR, (errmsg("unkown job with jobid: %ld", jobid)));
|
||||||
|
|
|
@ -272,6 +272,7 @@ extern Oid CitusCopyFormatTypeId(void);
|
||||||
extern Oid TextCopyFormatId(void);
|
extern Oid TextCopyFormatId(void);
|
||||||
extern Oid BinaryCopyFormatId(void);
|
extern Oid BinaryCopyFormatId(void);
|
||||||
extern Oid JobStatusScheduledId(void);
|
extern Oid JobStatusScheduledId(void);
|
||||||
|
extern Oid JobStatusRunningId(void);
|
||||||
extern Oid JobStatusDoneId(void);
|
extern Oid JobStatusDoneId(void);
|
||||||
extern Oid JobStatusErrorId(void);
|
extern Oid JobStatusErrorId(void);
|
||||||
|
|
||||||
|
|
|
@ -208,6 +208,7 @@ typedef enum RebalanceJobStatus
|
||||||
{
|
{
|
||||||
REBALANCE_JOB_STATUS_UNKNOWN,
|
REBALANCE_JOB_STATUS_UNKNOWN,
|
||||||
REBALANCE_JOB_STATUS_SCHEDULED,
|
REBALANCE_JOB_STATUS_SCHEDULED,
|
||||||
|
REBALANCE_JOB_STATUS_RUNNING,
|
||||||
REBALANCE_JOB_STATUS_DONE,
|
REBALANCE_JOB_STATUS_DONE,
|
||||||
REBALANCE_JOB_STATUS_ERROR
|
REBALANCE_JOB_STATUS_ERROR
|
||||||
} RebalanceJobStatus;
|
} RebalanceJobStatus;
|
||||||
|
@ -332,8 +333,9 @@ extern bool HasScheduledRebalanceJobs(void);
|
||||||
extern int64 GetNextRebalanceJobId(void);
|
extern int64 GetNextRebalanceJobId(void);
|
||||||
extern RebalanceJob * ScheduleBackgrounRebalanceJob(char *command);
|
extern RebalanceJob * ScheduleBackgrounRebalanceJob(char *command);
|
||||||
extern RebalanceJob * GetScheduledRebalanceJob(void);
|
extern RebalanceJob * GetScheduledRebalanceJob(void);
|
||||||
extern RebalanceJob * GetScheduledRebalanceJobyJobID(int64 jobId);
|
extern RebalanceJob * GetScheduledRebalanceJobByJobID(int64 jobId);
|
||||||
extern void UpdateJobStatus(RebalanceJob *job, RebalanceJobStatus newStatus);
|
extern void UpdateJobStatus(RebalanceJob *job, RebalanceJobStatus newStatus);
|
||||||
extern void UpdateJobError(RebalanceJob *job, ErrorData *edata);
|
extern void UpdateJobError(RebalanceJob *job, ErrorData *edata);
|
||||||
extern bool IsRebalanceJobStatusTerminal(RebalanceJobStatus status);
|
extern bool IsRebalanceJobStatusTerminal(RebalanceJobStatus status);
|
||||||
|
extern Oid RebalanceJobStatusOid(RebalanceJobStatus status);
|
||||||
#endif /* METADATA_UTILITY_H */
|
#endif /* METADATA_UTILITY_H */
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
typedef struct FormData_pg_dist_rebalance_job
|
typedef struct FormData_pg_dist_rebalance_job
|
||||||
{
|
{
|
||||||
int64 jobid;
|
int64 jobid;
|
||||||
|
int32 pid;
|
||||||
Oid status;
|
Oid status;
|
||||||
#ifdef CATALOG_VARLEN /* variable-length fields start here */
|
#ifdef CATALOG_VARLEN /* variable-length fields start here */
|
||||||
text command;
|
text command;
|
||||||
|
@ -28,12 +29,13 @@ typedef FormData_pg_dist_rebalance_job *Form_pg_dist_rebalance_job;
|
||||||
* compiler constants for pg_dist_rebalance_jobs
|
* compiler constants for pg_dist_rebalance_jobs
|
||||||
* ----------------
|
* ----------------
|
||||||
*/
|
*/
|
||||||
#define Natts_pg_dist_rebalance_jobs 5
|
#define Natts_pg_dist_rebalance_jobs 6
|
||||||
#define Anum_pg_dist_rebalance_jobs_jobid 1
|
#define Anum_pg_dist_rebalance_jobs_jobid 1
|
||||||
#define Anum_pg_dist_rebalance_jobs_status 2
|
#define Anum_pg_dist_rebalance_jobs_pid 2
|
||||||
#define Anum_pg_dist_rebalance_jobs_command 3
|
#define Anum_pg_dist_rebalance_jobs_status 3
|
||||||
#define Anum_pg_dist_rebalance_jobs_retry_count 4
|
#define Anum_pg_dist_rebalance_jobs_command 4
|
||||||
#define Anum_pg_dist_rebalance_jobs_message 5
|
#define Anum_pg_dist_rebalance_jobs_retry_count 5
|
||||||
|
#define Anum_pg_dist_rebalance_jobs_message 6
|
||||||
|
|
||||||
#define REBALANCE_JOB_JOBID_SEQUENCE_NAME "pg_catalog.pg_dist_rebalance_jobs_jobid_seq"
|
#define REBALANCE_JOB_JOBID_SEQUENCE_NAME "pg_catalog.pg_dist_rebalance_jobs_jobid_seq"
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue