1
1

Revise the launch timeout detection so we don't mistakenly declare "failed to start". Recognize that timeout is at the per-job level, and define the timeout param as a total value instead of seconds/daemon as it otherwise can get to be an enormous (and useless) number.

Resolves problems in loop_spawn where the timer was incorrectly firing and killing the overall job.

cmr=v1.7.4:reviewer=hjelmn

This commit was SVN r29661.
Этот коммит содержится в:
Ralph Castain 2013-11-11 23:50:40 +00:00
родитель 46f633883b
Коммит f1e510154c
4 изменённых файлов: 40 добавлений и 12 удалений

Просмотреть файл

@ -460,16 +460,13 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
/* catch timeout to allow cmds to progress */ /* catch timeout to allow cmds to progress */
static void timer_cb(int fd, short event, void *cbdata) static void timer_cb(int fd, short event, void *cbdata)
{ {
orte_timer_t *tm = (orte_timer_t*)cbdata; orte_job_t *jdata = (orte_job_t*)cbdata;
orte_job_t *jdata = (orte_job_t*)tm->payload;
if (NULL == jdata || jdata->state < ORTE_JOB_STATE_RUNNING) {
/* declare launch failed */ /* declare launch failed */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
}
/* free event */ /* free event */
OBJ_RELEASE(tm); OBJ_RELEASE(jdata->failure_timer);
} }
void orte_plm_base_launch_apps(int fd, short args, void *cbdata) void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
@ -532,11 +529,24 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
*/ */
caddy->jdata->num_daemons_reported++; caddy->jdata->num_daemons_reported++;
/* setup a timer - if we don't launch within the /* if requested, setup a timer - if we don't launch within the
* defined time, then we know things have failed * defined time, then we know things have failed
*/ */
if (0 < orte_startup_timeout) { if (0 < orte_startup_timeout) {
ORTE_DETECT_TIMEOUT(orte_startup_timeout, 1000000, 0, timer_cb, jdata); OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:launch defining timeout for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
jdata->failure_timer = OBJ_NEW(orte_timer_t);
jdata->failure_timer->payload = jdata;
opal_event_evtimer_set(orte_event_base,
jdata->failure_timer->ev, timer_cb,
jdata);
opal_event_set_priority(jdata->failure_timer->ev, ORTE_ERROR_PRI);
jdata->failure_timer->tv.tv_sec = orte_startup_timeout;
jdata->failure_timer->tv.tv_usec = 0;
opal_event_evtimer_add(jdata->failure_timer->ev,
&jdata->failure_timer->tv);
} }
/* cleanup */ /* cleanup */
@ -553,6 +563,16 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata)
/* convenience */ /* convenience */
jdata = caddy->jdata; jdata = caddy->jdata;
/* if a timer was defined, cancel it */
if (NULL != jdata->failure_timer) {
opal_event_evtimer_del(jdata->failure_timer->ev);
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:launch deleting timeout for job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
OBJ_RELEASE(jdata->failure_timer);
}
if (ORTE_JOB_STATE_RUNNING != caddy->job_state) { if (ORTE_JOB_STATE_RUNNING != caddy->job_state) {
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
OBJ_RELEASE(caddy); OBJ_RELEASE(caddy);
@ -594,8 +614,9 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
jdata = caddy->jdata; jdata = caddy->jdata;
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
"%s plm:base:launch registered event", "%s plm:base:launch %s registered",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
if (ORTE_JOB_STATE_REGISTERED != caddy->job_state) { if (ORTE_JOB_STATE_REGISTERED != caddy->job_state) {
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,

Просмотреть файл

@ -702,6 +702,7 @@ static void orte_job_construct(orte_job_t* job)
2); 2);
job->num_apps = 0; job->num_apps = 0;
job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT; job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT;
job->failure_timer = NULL;
job->gang_launched = true; job->gang_launched = true;
job->stdin_target = ORTE_VPID_INVALID; job->stdin_target = ORTE_VPID_INVALID;
job->stdout_target = ORTE_JOBID_INVALID; job->stdout_target = ORTE_JOBID_INVALID;
@ -767,6 +768,10 @@ static void orte_job_destruct(orte_job_t* job)
} }
OBJ_RELEASE(job->apps); OBJ_RELEASE(job->apps);
if (NULL != job->failure_timer) {
OBJ_RELEASE(job->failure_timer);
}
if (NULL != job->map) { if (NULL != job->map) {
OBJ_RELEASE(job->map); OBJ_RELEASE(job->map);
job->map = NULL; job->map = NULL;

Просмотреть файл

@ -388,6 +388,8 @@ typedef struct {
* for description of supported flags * for description of supported flags
*/ */
orte_job_controls_t controls; orte_job_controls_t controls;
/* timer event for failure detect/response if fails to launch */
orte_timer_t *failure_timer;
/* flag to indicate that MPI is allowed on this job - i.e., /* flag to indicate that MPI is allowed on this job - i.e.,
* that all members of the job are being simultaneously * that all members of the job are being simultaneously
* launched * launched

Просмотреть файл

@ -314,7 +314,7 @@ int orte_register_params(void)
orte_startup_timeout = 0; orte_startup_timeout = 0;
(void) mca_base_var_register ("orte", "orte", NULL, "startup_timeout", (void) mca_base_var_register ("orte", "orte", NULL, "startup_timeout",
"Seconds/daemon to wait for startup before declaring failed_to_start (default: 0 => do not check)", "Seconds to wait for startup or job launch before declaring failed_to_start (default: 0 => do not check)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_startup_timeout); &orte_startup_timeout);