From ae44205177d9eaa74217b4389044b1882321f15b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ale=C5=A1=20K=C5=99enek?= Date: Tue, 29 Nov 2005 14:41:31 +0000 Subject: [PATCH] - do not completely ignore LRMS events arriving after first LM ones due to lower sequence code - completely reset the branching machinery if there is a newer NS event --- org.glite.lb.server/src/jobstat.h | 2 ++ org.glite.lb.server/src/jobstat_supp.c | 35 +++++++++++++++++++++++++++++ org.glite.lb.server/src/process_event.c | 40 +++++++++++++++++++++++++++++---- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/org.glite.lb.server/src/jobstat.h b/org.glite.lb.server/src/jobstat.h index 181a85d..352af06 100644 --- a/org.glite.lb.server/src/jobstat.h +++ b/org.glite.lb.server/src/jobstat.h @@ -64,9 +64,11 @@ void write2rgma_status(edg_wll_JobStat *); int before_deep_resubmission(const char *, const char *); int same_branch(const char *, const char *); int component_seqcode(const char *a, edg_wll_Source index); +char * set_component_seqcode(char *s,edg_wll_Source index,int val); int processEvent(intJobStat *, edg_wll_Event *, int, int, char **); int add_stringlist(char ***, const char *); int edg_wll_compare_seq(const char *, const char *); void init_intJobStat(intJobStat *p); + diff --git a/org.glite.lb.server/src/jobstat_supp.c b/org.glite.lb.server/src/jobstat_supp.c index 58d4a51..6fb09ef 100644 --- a/org.glite.lb.server/src/jobstat_supp.c +++ b/org.glite.lb.server/src/jobstat_supp.c @@ -658,6 +658,41 @@ int component_seqcode(const char *a, edg_wll_Source index) return(c[index]); } +char * set_component_seqcode(char *s,edg_wll_Source index,int val) +{ + unsigned int c[EDG_WLL_SOURCE__LAST]; + int res; + char *ret; + + res = sscanf(s, "UI=%d:NS=%d:WM=%d:BH=%d:JSS=%d:LM=%d:LRMS=%d:APP=%d", + &c[EDG_WLL_SOURCE_USER_INTERFACE], + &c[EDG_WLL_SOURCE_NETWORK_SERVER], + &c[EDG_WLL_SOURCE_WORKLOAD_MANAGER], + &c[EDG_WLL_SOURCE_BIG_HELPER], + &c[EDG_WLL_SOURCE_JOB_SUBMISSION], + 
&c[EDG_WLL_SOURCE_LOG_MONITOR], + &c[EDG_WLL_SOURCE_LRMS], + &c[EDG_WLL_SOURCE_APPLICATION]); + if (res != EDG_WLL_SOURCE__LAST-1) { + syslog(LOG_ERR, "unparsable sequence code %s\n", s); + fprintf(stderr, "unparsable sequence code %s\n", s); + return NULL; + } + + c[index] = val; + trio_asprintf(&ret,"UI=%06d:NS=%010d:WM=%06d:BH=%010d:JSS=%06d" + ":LM=%06d:LRMS=%06d:APP=%06d", + c[EDG_WLL_SOURCE_USER_INTERFACE], + c[EDG_WLL_SOURCE_NETWORK_SERVER], + c[EDG_WLL_SOURCE_WORKLOAD_MANAGER], + c[EDG_WLL_SOURCE_BIG_HELPER], + c[EDG_WLL_SOURCE_JOB_SUBMISSION], + c[EDG_WLL_SOURCE_LOG_MONITOR], + c[EDG_WLL_SOURCE_LRMS], + c[EDG_WLL_SOURCE_APPLICATION]); + return ret; +} + int before_deep_resubmission(const char *a, const char *b) { if (component_seqcode(a, EDG_WLL_SOURCE_WORKLOAD_MANAGER) < diff --git a/org.glite.lb.server/src/process_event.c b/org.glite.lb.server/src/process_event.c index c2bfcf6..899ceee 100644 --- a/org.glite.lb.server/src/process_event.c +++ b/org.glite.lb.server/src/process_event.c @@ -175,6 +175,12 @@ static char* location_string(const char *source, const char *host, const char *i return ret; } +static int after_enter_wm(const char *es,const char *js) +{ + return component_seqcode(es,EDG_WLL_SOURCE_NETWORK_SERVER) > + component_seqcode(js,EDG_WLL_SOURCE_NETWORK_SERVER); +} + static int badEvent(intJobStat *js UNUSED_VAR, edg_wll_Event *e, int ev_seq UNUSED_VAR) { @@ -205,7 +211,7 @@ int processEvent(intJobStat *js, edg_wll_Event *e, int ev_seq, int strict, char int res = RET_OK, fine_res = RET_OK; - + int lm_favour_lrms = 0; if (old_state == EDG_WLL_JOB_ABORTED || old_state == EDG_WLL_JOB_CANCELLED || @@ -213,6 +219,15 @@ int processEvent(intJobStat *js, edg_wll_Event *e, int ev_seq, int strict, char res = RET_LATE; } +/* new event coming from NS => forget about any resubmission loops */ + if (e->type != EDG_WLL_EVENT_CANCEL && + js->last_seqcode && + after_enter_wm(e->any.seqcode,js->last_seqcode)) + { + rep(js->branch_tag_seqcode,NULL); + 
rep(js->deep_resubmit_seqcode,NULL); + rep(js->last_branch_seqcode,NULL); + } if (js->deep_resubmit_seqcode && before_deep_resubmission(e->any.seqcode, js->deep_resubmit_seqcode)) { @@ -253,7 +268,9 @@ int processEvent(intJobStat *js, edg_wll_Event *e, int ev_seq, int strict, char e->any.timestamp.tv_sec; res = RET_LATE; } - new_state = EDG_WLL_JOB_SCHEDULED; break; + new_state = EDG_WLL_JOB_SCHEDULED; + lm_favour_lrms = 1; + break; default: goto bad_event; break; } @@ -312,7 +329,9 @@ int processEvent(intJobStat *js, edg_wll_Event *e, int ev_seq, int strict, char new_state = EDG_WLL_JOB_WAITING; break; case EDG_WLL_SOURCE_LOG_MONITOR: if (LRMS_STATE(old_state)) res = RET_LATE; - new_state = EDG_WLL_JOB_READY; break; + new_state = EDG_WLL_JOB_READY; + lm_favour_lrms = 1; + break; case EDG_WLL_SOURCE_LRMS: new_state = EDG_WLL_JOB_SCHEDULED; break; default: @@ -787,7 +806,20 @@ int processEvent(intJobStat *js, edg_wll_Event *e, int ev_seq, int strict, char if (e->any.type == EDG_WLL_EVENT_CANCEL) { rep(js->last_cancel_seqcode, e->any.seqcode); } else { - rep(js->last_seqcode, e->any.seqcode); + +/* the first set of LM events (Accept, Transfer/* -> LRMS) + should shift the state (to Ready, Scheduled) but should NOT + update js->last_seqcode completely, in order not to block following + LRMS events which are likely to arrive later but should still affect + job state (as there may be no more LM events due to the Condor bug). + However, don't ignore the incoming seqcode completely, to catch up + with possibly delayed WM/JSS events */ + + if (lm_favour_lrms) { + free(js->last_seqcode); + js->last_seqcode = set_component_seqcode(e->any.seqcode,EDG_WLL_SOURCE_LOG_MONITOR,0); + } + else rep(js->last_seqcode, e->any.seqcode); } } -- 1.8.2.3