Revise the launch timeout detection so we don't mistakenly declare "failed to start". Recognize that timeout is at the per-job level, and define the timeout param as a total value instead of seconds/daemon as it otherwise can get to be an enormous (and useless) number.
Resolves problems in loop_spawn where the timer was incorrectly firing and killing the overall job. cmr=v1.7.4:reviewer=hjelmn This commit was SVN r29661.
Этот коммит содержится в:
родитель
46f633883b
Коммит
f1e510154c
@ -460,16 +460,13 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
|
||||
/* catch timeout to allow cmds to progress */
|
||||
static void timer_cb(int fd, short event, void *cbdata)
|
||||
{
|
||||
orte_timer_t *tm = (orte_timer_t*)cbdata;
|
||||
orte_job_t *jdata = (orte_job_t*)tm->payload;
|
||||
orte_job_t *jdata = (orte_job_t*)cbdata;
|
||||
|
||||
if (NULL == jdata || jdata->state < ORTE_JOB_STATE_RUNNING) {
|
||||
/* declare launch failed */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FAILED_TO_START);
|
||||
}
|
||||
|
||||
/* free event */
|
||||
OBJ_RELEASE(tm);
|
||||
OBJ_RELEASE(jdata->failure_timer);
|
||||
}
|
||||
|
||||
void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
|
||||
@ -532,11 +529,24 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
|
||||
*/
|
||||
caddy->jdata->num_daemons_reported++;
|
||||
|
||||
/* setup a timer - if we don't launch within the
|
||||
/* if requested, setup a timer - if we don't launch within the
|
||||
* defined time, then we know things have failed
|
||||
*/
|
||||
if (0 < orte_startup_timeout) {
|
||||
ORTE_DETECT_TIMEOUT(orte_startup_timeout, 1000000, 0, timer_cb, jdata);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:launch defining timeout for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
jdata->failure_timer = OBJ_NEW(orte_timer_t);
|
||||
jdata->failure_timer->payload = jdata;
|
||||
opal_event_evtimer_set(orte_event_base,
|
||||
jdata->failure_timer->ev, timer_cb,
|
||||
jdata);
|
||||
opal_event_set_priority(jdata->failure_timer->ev, ORTE_ERROR_PRI);
|
||||
jdata->failure_timer->tv.tv_sec = orte_startup_timeout;
|
||||
jdata->failure_timer->tv.tv_usec = 0;
|
||||
opal_event_evtimer_add(jdata->failure_timer->ev,
|
||||
&jdata->failure_timer->tv);
|
||||
}
|
||||
|
||||
/* cleanup */
|
||||
@ -553,6 +563,16 @@ void orte_plm_base_post_launch(int fd, short args, void *cbdata)
|
||||
/* convenience */
|
||||
jdata = caddy->jdata;
|
||||
|
||||
/* if a timer was defined, cancel it */
|
||||
if (NULL != jdata->failure_timer) {
|
||||
opal_event_evtimer_del(jdata->failure_timer->ev);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:launch deleting timeout for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
OBJ_RELEASE(jdata->failure_timer);
|
||||
}
|
||||
|
||||
if (ORTE_JOB_STATE_RUNNING != caddy->job_state) {
|
||||
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
OBJ_RELEASE(caddy);
|
||||
@ -594,8 +614,9 @@ void orte_plm_base_registered(int fd, short args, void *cbdata)
|
||||
jdata = caddy->jdata;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
"%s plm:base:launch registered event",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
"%s plm:base:launch %s registered",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid)));
|
||||
|
||||
if (ORTE_JOB_STATE_REGISTERED != caddy->job_state) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
|
||||
|
@ -702,6 +702,7 @@ static void orte_job_construct(orte_job_t* job)
|
||||
2);
|
||||
job->num_apps = 0;
|
||||
job->controls = ORTE_JOB_CONTROL_FORWARD_OUTPUT;
|
||||
job->failure_timer = NULL;
|
||||
job->gang_launched = true;
|
||||
job->stdin_target = ORTE_VPID_INVALID;
|
||||
job->stdout_target = ORTE_JOBID_INVALID;
|
||||
@ -767,6 +768,10 @@ static void orte_job_destruct(orte_job_t* job)
|
||||
}
|
||||
OBJ_RELEASE(job->apps);
|
||||
|
||||
if (NULL != job->failure_timer) {
|
||||
OBJ_RELEASE(job->failure_timer);
|
||||
}
|
||||
|
||||
if (NULL != job->map) {
|
||||
OBJ_RELEASE(job->map);
|
||||
job->map = NULL;
|
||||
|
@ -388,6 +388,8 @@ typedef struct {
|
||||
* for description of supported flags
|
||||
*/
|
||||
orte_job_controls_t controls;
|
||||
/* timer event for failure detect/response if fails to launch */
|
||||
orte_timer_t *failure_timer;
|
||||
/* flag to indicate that MPI is allowed on this job - i.e.,
|
||||
* that all members of the job are being simultaneously
|
||||
* launched
|
||||
|
@ -314,7 +314,7 @@ int orte_register_params(void)
|
||||
|
||||
orte_startup_timeout = 0;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "startup_timeout",
|
||||
"Seconds/daemon to wait for startup before declaring failed_to_start (default: 0 => do not check)",
|
||||
"Seconds to wait for startup or job launch before declaring failed_to_start (default: 0 => do not check)",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_startup_timeout);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user