From 49b824e44f8513424220765e457fc1b4df1313e3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Milo=C5=A1=20Mula=C4=8D?= Date: Wed, 13 Jun 2007 13:37:30 +0000 Subject: [PATCH] new implementation of job collections --- org.glite.lb.server/src/jobstat.c | 209 +++++++++++++++++--------------- org.glite.lb.server/src/jobstat.h | 9 ++ org.glite.lb.server/src/process_event.c | 4 +- 3 files changed, 121 insertions(+), 101 deletions(-) diff --git a/org.glite.lb.server/src/jobstat.c b/org.glite.lb.server/src/jobstat.c index d13aada..a8d48b0 100644 --- a/org.glite.lb.server/src/jobstat.c +++ b/org.glite.lb.server/src/jobstat.c @@ -797,119 +797,127 @@ static int log_collectionState_event(edg_wll_Context ctx, edg_wll_JobStatCode st } -/* called only when childen state changed - */ -static edg_wll_ErrorCode update_parent_status(edg_wll_Context ctx, edg_wll_JobStatCode old_state, enum edg_wll_StatDone_code old_done_code, intJobStat *cis, edg_wll_Event *ce) +/* returns state class of subjob of job collection */ +static subjobClassCodes class(edg_wll_JobStat *stat) { - intJobStat *pis = NULL; - int update_hist = 0; - - - /* Easy version, where the whole histogram is evolving... - * not used because of performance reasons - * - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - - pis->pub.children_hist[cis->pub.state+1]++; - if (cis->pub.state == EDG_WLL_JOB_DONE) - pis->children_done_hist[cis->pub.done_code]++; - pis->pub.children_hist[old_state+1]--; - if (old_state == EDG_WLL_JOB_DONE) - pis->children_done_hist[old_done_code]--; - edg_wll_StoreSubjobHistogram(ctx, cis->pub.parent_job, pis); - */ - - - // XXX: if load_parent_intJobStat occure (and survives in future) in each subcase - // load parent status at the beginning of this function - - /* Increment histogram for interesting states and - * cook artificial events to enable parent job state shift - */ - switch (cis->pub.state) { + switch (stat->state) { case EDG_WLL_JOB_RUNNING: - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - pis->pub.children_hist[cis->pub.state+1]++; - - if (pis->pub.jobtype == EDG_WLL_STAT_COLLECTION) { - /* not RUNNING yet? */ - if (pis->pub.state < EDG_WLL_JOB_RUNNING) { - if (log_collectionState_event(ctx, cis->pub.state, 0, cis, pis, ce)) - goto err; - } - } - update_hist = 1; + return(SUBJOB_CLASS_RUNNING); break; case EDG_WLL_JOB_DONE: - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - pis->pub.children_hist[cis->pub.state+1]++; - pis->children_done_hist[cis->pub.done_code]++; - - if (pis->pub.jobtype == EDG_WLL_STAT_COLLECTION) { - if (pis->pub.children_hist[cis->pub.state+1] == pis->pub.children_num) { - /* not DONE yet? */ - if (pis->pub.state < EDG_WLL_JOB_DONE) { - if (log_collectionState_event(ctx, cis->pub.state, - cis->pub.done_code, cis, pis, ce)) - goto err; - } - } - } - update_hist = 1; + if (stat->done_code == EDG_WLL_STAT_OK) + return(SUBJOB_CLASS_DONE); + else + // failed & cancelled + return(SUBJOB_CLASS_ABORTED); break; - case EDG_WLL_JOB_CLEARED: - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - pis->pub.children_hist[cis->pub.state+1]++; - - if (pis->pub.jobtype == EDG_WLL_STAT_COLLECTION) { - if (pis->pub.children_hist[cis->pub.state+1] == pis->pub.children_num) { - /* not CLEARED yet? */ - if (pis->pub.state < EDG_WLL_JOB_CLEARED) { - if (log_collectionState_event(ctx, cis->pub.state, - cis->pub.done_code, cis, pis, ce)) - goto err; - } - } - } - update_hist = 1; + case EDG_WLL_JOB_ABORTED: + return(SUBJOB_CLASS_ABORTED); + break; + case EDG_WLL_JOB_CANCELLED: + return(SUBJOB_CLASS_DONE); break; default: - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - pis->pub.children_hist[EDG_WLL_JOB_UNKNOWN+1]++; - // update_hist = 1; - triggered by the next case or not needed + return(SUBJOB_CLASS_REST); break; } - +} - /* Decrement histogram for interesting states - */ - switch (old_state) { - case EDG_WLL_JOB_RUNNING: - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - pis->pub.children_hist[old_state+1]--; - update_hist = 1; - break; - case EDG_WLL_JOB_DONE: - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - pis->pub.children_hist[old_state+1]--; - pis->children_done_hist[old_done_code]--; - update_hist = 1; - break; - case EDG_WLL_JOB_CLEARED: - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - pis->pub.children_hist[old_state+1]--; - update_hist = 1; - break; - default: - if (load_parent_intJobStat(ctx, cis, &pis)) goto err; - pis->pub.children_hist[EDG_WLL_JOB_UNKNOWN+1]--; - // update_hist = 1; - triggered by the previous case or not needed - break; +/* Mapping of subjob class to some field in childen_hist */ +static edg_wll_JobStatCode class_to_statCode(subjobClassCodes code) +{ + switch (code) { + case SUBJOB_CLASS_RUNNING: return(EDG_WLL_JOB_RUNNING); break; + case SUBJOB_CLASS_DONE: return(EDG_WLL_JOB_DONE); break; + case SUBJOB_CLASS_ABORTED: return(EDG_WLL_JOB_ABORTED); break; + case SUBJOB_CLASS_REST: return(EDG_WLL_JOB_UNKNOWN); break; + default: assert(0); break; } +} + +static edg_wll_ErrorCode update_parent_status(edg_wll_Context ctx, edg_wll_JobStat *subjob_stat_old, intJobStat *cis, edg_wll_Event *ce) +{ + intJobStat *pis = NULL; + subjobClassCodes subjob_class, subjob_class_old; + + + subjob_class = class(&cis->pub); + subjob_class_old = class(subjob_stat_old); + + + if (subjob_class_old != subjob_class) { + if (load_parent_intJobStat(ctx, cis, &pis)) goto err; + + pis->pub.children_hist[class_to_statCode(subjob_class)+1]++; + pis->pub.children_hist[class_to_statCode(subjob_class_old)+1]--; + + /* not needed if DONE_OK and DONE_FAILED mapped on diffrent field in histogram + * if furure proves it, children_done_hist field of intStat may be removed + if (cis->pub.state == EDG_WLL_JOB_DONE) + pis->children_done_hist[cis->pub.done_code]++; + if (coll_stat_old->state == EDG_WLL_JOB_DONE) + pis->children_done_hist[coll_stat_old->done_code]--; + */ - if (update_hist) edg_wll_StoreSubjobHistogram(ctx, cis->pub.parent_job, pis); + + if (pis->pub.jobtype == EDG_WLL_STAT_COLLECTION) { + switch (subjob_class) { + case SUBJOB_CLASS_RUNNING: + if (pis->pub.state < EDG_WLL_JOB_RUNNING) { + // no subjob running yet, this is the very first + if (log_collectionState_event(ctx, EDG_WLL_JOB_RUNNING, 0, cis, pis, ce)) + goto err; + } + else if ((pis->pub.state == EDG_WLL_JOB_CLEARED) || + (pis->pub.state == EDG_WLL_JOB_ABORTED)) { + // done or aborted hist-field loosing one subjob + if (log_collectionState_event(ctx, EDG_WLL_JOB_RUNNING, 0, cis, pis, ce)) + goto err; + } + break; + case SUBJOB_CLASS_DONE: + if (pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_DONE)+1] == pis->pub.children_num) { + if (log_collectionState_event(ctx, EDG_WLL_JOB_CLEARED, EDG_WLL_STAT_OK, cis, pis, ce)) + goto err; + } + else + if (( pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_DONE)+1] + + pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_ABORTED)+1]) + == pis->pub.children_num) { + if (log_collectionState_event(ctx, EDG_WLL_JOB_ABORTED, EDG_WLL_STAT_FAILED, cis, pis, ce)) + goto err; + } + break; + case SUBJOB_CLASS_ABORTED: + // XXX: is it correct semantics? + // what about EDG_WLL_STAT_ code? is it meaningful here? + if (( pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_ABORTED)+1] + + pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_DONE)+1]) + == pis->pub.children_num) { + if (log_collectionState_event(ctx, EDG_WLL_JOB_ABORTED, 0, cis, pis, ce)) + goto err; + } + break; + case SUBJOB_CLASS_REST: + // XXX: is it correct semantics? + if ((pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_RUNNING)+1] == 0) && + (pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_DONE)+1] == 0) && + (pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_ABORTED)+1] == 0) && + (pis->pub.children_hist[class_to_statCode(SUBJOB_CLASS_REST)+1] == 0)) { + if (log_collectionState_event(ctx, EDG_WLL_JOB_SUBMITTED, 0, cis, pis, ce)) + goto err; + } + return(EDG_WLL_JOB_SUBMITTED); + break; + default: + assert(0); + break; + } + } + } + err: edg_wll_UnlockJob(ctx,cis->pub.parent_job); @@ -921,6 +929,7 @@ err: } + /* * update stored state according to the new event * (must be called with the job locked) @@ -1011,7 +1020,7 @@ edg_wll_ErrorCode edg_wll_StepIntState(edg_wll_Context ctx, /* check whether subjob state change does not change parent state */ if ((ijsp->pub.parent_job) && (oldstat.state != ijsp->pub.state)) { - if (update_parent_status(ctx, oldstat.state, oldstat.done_code, ijsp, e)) + if (update_parent_status(ctx, &oldstat, ijsp, e)) return edg_wll_SetError(ctx, EINVAL, "update_parent_status()"); } diff --git a/org.glite.lb.server/src/jobstat.h b/org.glite.lb.server/src/jobstat.h index 64e32b1..a128aac 100644 --- a/org.glite.lb.server/src/jobstat.h +++ b/org.glite.lb.server/src/jobstat.h @@ -81,6 +81,15 @@ typedef enum _edg_wll_CondorEventSource { EDG_WLL_CONDOR_EVENT_SOURCE__LAST } edg_wll_CondorEventSource; +typedef enum _subjobClassCodes { + SUBJOB_CLASS_UNDEF = 0, + SUBJOB_CLASS_RUNNING, + SUBJOB_CLASS_DONE, + SUBJOB_CLASS_ABORTED, + SUBJOB_CLASS_REST +} subjobClassCodes; + + void destroy_intJobStat(intJobStat *); void destroy_intJobStat_extension(intJobStat *p); diff --git a/org.glite.lb.server/src/process_event.c b/org.glite.lb.server/src/process_event.c index 1bce002..e03c6b6 100644 --- a/org.glite.lb.server/src/process_event.c +++ b/org.glite.lb.server/src/process_event.c @@ -272,7 +272,9 @@ static int processEvent_glite(intJobStat *js, edg_wll_Event *e, int ev_seq, int int lm_favour_lrms = 0; - if (old_state == EDG_WLL_JOB_ABORTED || + // Aborted may not be terminal state for collection in some cases + // i.e. if some Done/failed subjob is resubmitted + if ( (old_state == EDG_WLL_JOB_ABORTED && e->any.type != EDG_WLL_EVENT_COLLECTIONSTATE) || old_state == EDG_WLL_JOB_CANCELLED || old_state == EDG_WLL_JOB_CLEARED) { res = RET_LATE; -- 1.8.2.3