From f1e510154c65fba687ee5d260e00ff7c8bc00613 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 11 Nov 2013 23:50:40 +0000 Subject: [PATCH] Revise the launch timeout detection so we don't mistakenly declare "failed to start". Recognize that timeout is at the per-job level, and define the timeout param as a total value instead of seconds/daemon as it otherwise can get to be an enormous (and useless) number. Resolves problems in loop_spawn where the timer was incorrectly firing and killing the overall job. cmr=v1.7.4:reviewer=hjelmn This commit was SVN r29661. --- orte/mca/plm/base/plm_base_launch_support.c | 43 +++++++++++++++------ orte/runtime/orte_globals.c | 5 +++ orte/runtime/orte_globals.h | 2 + orte/runtime/orte_mca_params.c | 2 +- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index d93ccd6ce2..3deee1173f 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -460,16 +460,13 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata) /* catch timeout to allow cmds to progress */ static void timer_cb(int fd, short event, void *cbdata) { - orte_timer_t *tm = (orte_timer_t*)cbdata; - orte_job_t *jdata = (orte_job_t*)tm->payload; + orte_job_t *jdata = (orte_job_t*)cbdata; - if (NULL == jdata || jdata->state < ORTE_JOB_STATE_RUNNING) { - /* declare launch failed */ - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START); - } + /* declare launch failed */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START); /* free event */ - OBJ_RELEASE(tm); + OBJ_RELEASE(jdata->failure_timer); } void orte_plm_base_launch_apps(int fd, short args, void *cbdata) @@ -532,11 +529,24 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata) */ caddy->jdata->num_daemons_reported++; - /* setup a timer - if we don't launch within the + /* if requested, setup a timer - if we don't launch within the * defined time, then we know things have failed */ if (0 < orte_startup_timeout) { - ORTE_DETECT_TIMEOUT(orte_startup_timeout, 1000000, 0, timer_cb, jdata); + OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, + "%s plm:base:launch defining timeout for job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + jdata->failure_timer = OBJ_NEW(orte_timer_t); + jdata->failure_timer->payload = jdata; + opal_event_evtimer_set(orte_event_base, + jdata->failure_timer->ev, timer_cb, + jdata); + opal_event_set_priority(jdata->failure_timer->ev, ORTE_ERROR_PRI); + jdata->failure_timer->tv.tv_sec = orte_startup_timeout; + jdata->failure_timer->tv.tv_usec = 0; + opal_event_evtimer_add(jdata->failure_timer->ev, + &jdata->failure_timer->tv); } /* cleanup */ @@ -553,6 +563,16 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata) /* convenience */ jdata = caddy->jdata; + /* if a timer was defined, cancel it */ + if (NULL != jdata->failure_timer) { + opal_event_evtimer_del(jdata->failure_timer->ev); + OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, + "%s plm:base:launch deleting timeout for job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + OBJ_RELEASE(jdata->failure_timer); + } + if (ORTE_JOB_STATE_RUNNING != caddy->job_state) { ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); @@ -594,8 +614,9 @@ void orte_plm_base_registered(int fd, short args, void *cbdata) jdata = caddy->jdata; OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, - "%s plm:base:launch registered event", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + "%s plm:base:launch %s registered", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); if (ORTE_JOB_STATE_REGISTERED != caddy->job_state) { OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 0bf966828d..384a585a1d 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -702,6 +702,7 @@ static void orte_job_construct(orte_job_t* job) 2); job->num_apps = 0; job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT; + job->failure_timer = NULL; job->gang_launched = true; job->stdin_target = ORTE_VPID_INVALID; job->stdout_target = ORTE_JOBID_INVALID; @@ -767,6 +768,10 @@ static void orte_job_destruct(orte_job_t* job) } OBJ_RELEASE(job->apps); + if (NULL != job->failure_timer) { + OBJ_RELEASE(job->failure_timer); + } + if (NULL != job->map) { OBJ_RELEASE(job->map); job->map = NULL; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 2133dff6dc..8209781549 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -388,6 +388,8 @@ typedef struct { * for description of supported flags */ orte_job_controls_t controls; + /* timer event for failure detect/response if fails to launch */ + orte_timer_t *failure_timer; /* flag to indicate that MPI is allowed on this job - i.e., * that all members of the job are being simultaneously * launched diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 151d88d33e..e45afe9375 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -314,7 +314,7 @@ int orte_register_params(void) orte_startup_timeout = 0; (void) mca_base_var_register ("orte", "orte", NULL, "startup_timeout", - "Seconds/daemon to wait for startup before declaring failed_to_start (default: 0 => do not check)", + "Seconds to wait for startup or job launch before declaring failed_to_start (default: 0 => do not check)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &orte_startup_timeout);