1
1

Revise the launch timeout detection so we don't mistakenly declare "failed to start". Recognize that timeout is at the per-job level, and define the timeout param as a total value instead of seconds/daemon as it otherwise can get to be an enormous (and useless) number.

Resolves problems in loop_spawn where the timer was incorrectly firing and killing the overall job.

cmr=v1.7.4:reviewer=hjelmn

This commit was SVN r29661.
This commit is contained in:
Ralph Castain 2013-11-11 23:50:40 +00:00
parent 46f633883b
commit f1e510154c
4 changed files with 40 additions and 12 deletions

View File

@ -460,16 +460,13 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
/* catch timeout to allow cmds to progress */
static void timer_cb(int fd, short event, void *cbdata)
{
orte_timer_t *tm = (orte_timer_t*)cbdata;
orte_job_t *jdata = (orte_job_t*)tm->payload;
orte_job_t *jdata = (orte_job_t*)cbdata;
if (NULL == jdata || jdata->state < ORTE_JOB_STATE_RUNNING) {
/* declare launch failed */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
}
/* declare launch failed */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
/* free event */
OBJ_RELEASE(tm);
OBJ_RELEASE(jdata->failure_timer);
}
void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
@ -532,11 +529,24 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
*/
caddy->jdata->num_daemons_reported++;
/* setup a timer - if we don't launch within the
/* if requested, setup a timer - if we don't launch within the
* defined time, then we know things have failed
*/
if (0 < orte_startup_timeout) {
ORTE_DETECT_TIMEOUT(orte_startup_timeout, 1000000, 0, timer_cb, jdata);
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:launch defining timeout for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
jdata->failure_timer = OBJ_NEW(orte_timer_t);
jdata->failure_timer->payload = jdata;
opal_event_evtimer_set(orte_event_base,
jdata->failure_timer->ev, timer_cb,
jdata);
opal_event_set_priority(jdata->failure_timer->ev, ORTE_ERROR_PRI);
jdata->failure_timer->tv.tv_sec = orte_startup_timeout;
jdata->failure_timer->tv.tv_usec = 0;
opal_event_evtimer_add(jdata->failure_timer->ev,
&jdata->failure_timer->tv);
}
/* cleanup */
@ -553,6 +563,16 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata)
/* convenience */
jdata = caddy->jdata;
/* if a timer was defined, cancel it */
if (NULL != jdata->failure_timer) {
opal_event_evtimer_del(jdata->failure_timer->ev);
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:launch deleting timeout for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
OBJ_RELEASE(jdata->failure_timer);
}
if (ORTE_JOB_STATE_RUNNING != caddy->job_state) {
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy);
@ -594,8 +614,9 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
jdata = caddy->jdata;
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:launch registered event",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
"%s plm:base:launch %s registered",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
if (ORTE_JOB_STATE_REGISTERED != caddy->job_state) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,

View File

@ -702,6 +702,7 @@ static void orte_job_construct(orte_job_t* job)
2);
job->num_apps = 0;
job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT;
job->failure_timer = NULL;
job->gang_launched = true;
job->stdin_target = ORTE_VPID_INVALID;
job->stdout_target = ORTE_JOBID_INVALID;
@ -767,6 +768,10 @@ static void orte_job_destruct(orte_job_t* job)
}
OBJ_RELEASE(job->apps);
if (NULL != job->failure_timer) {
OBJ_RELEASE(job->failure_timer);
}
if (NULL != job->map) {
OBJ_RELEASE(job->map);
job->map = NULL;

View File

@ -388,6 +388,8 @@ typedef struct {
* for description of supported flags
*/
orte_job_controls_t controls;
/* timer event for failure detect/response if fails to launch */
orte_timer_t *failure_timer;
/* flag to indicate that MPI is allowed on this job - i.e.,
* that all members of the job are being simultaneously
* launched

View File

@ -314,7 +314,7 @@ int orte_register_params(void)
orte_startup_timeout = 0;
(void) mca_base_var_register ("orte", "orte", NULL, "startup_timeout",
"Seconds/daemon to wait for startup before declaring failed_to_start (default: 0 => do not check)",
"Seconds to wait for startup or job launch before declaring failed_to_start (default: 0 => do not check)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_startup_timeout);