diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c
index dc841d891d..c849de87e2 100644
--- a/orte/mca/state/base/state_base_fns.c
+++ b/orte/mca/state/base/state_base_fns.c
@@ -783,6 +783,13 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
     OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                          "%s state:base:check_job_completed all jobs terminated",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    /* stop the job timeout event, if set */
+    if (NULL != orte_mpiexec_timeout) {
+        OBJ_RELEASE(orte_mpiexec_timeout);
+        orte_mpiexec_timeout = NULL;
+    }
+
     /* set the exit status to 0 - this will only happen if it
      * wasn't already set by an error condition
      */
diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c
index 131b5292e0..f3e3029b05 100644
--- a/orte/runtime/orte_globals.c
+++ b/orte/runtime/orte_globals.c
@@ -123,9 +123,9 @@ bool orte_orteds_term_ordered = false;
 bool orte_allowed_exit_without_sync = false;
 
 int orte_startup_timeout;
-
 int orte_timeout_usec_per_proc;
 float orte_max_timeout;
+orte_timer_t *orte_mpiexec_timeout = NULL;
 
 opal_buffer_t *orte_tree_launch_cmd = NULL;
 
diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h
index e3deb87e39..f284045bf7 100644
--- a/orte/runtime/orte_globals.h
+++ b/orte/runtime/orte_globals.h
@@ -668,7 +668,7 @@ ORTE_DECLSPEC extern int orte_startup_timeout;
 
 ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;
 ORTE_DECLSPEC extern float orte_max_timeout;
-
+ORTE_DECLSPEC extern orte_timer_t *orte_mpiexec_timeout;
 ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
 
 /* global arrays for data storage */
diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c
index beae15cf27..c357298386 100644
--- a/orte/tools/orterun/orterun.c
+++ b/orte/tools/orterun/orterun.c
@@ -137,6 +137,8 @@ static void open_fifo (void);
 
 ORTE_DECLSPEC void* MPIR_Breakpoint(void);
 
+static void orte_timeout_wakeup(int sd, short args, void *cbdata);
+
 /*
  * Breakpoint function for parallel debuggers
  */
@@ -1022,6 +1024,36 @@ int orterun(int argc, char *argv[])
         }
     }
 
+    /* check for a job timeout specification, to be provided in seconds
+     * as that is what MPICH used
+     */
+    if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) {
+        char *endptr = NULL;
+        long timeout_secs = strtol(param, &endptr, 10);
+
+        /* insist on a complete, strictly positive integer - strtol
+         * returns 0 for non-numeric text, and a zero or negative
+         * interval is not a usable job time limit
+         */
+        if (endptr == param || '\0' != *endptr || timeout_secs <= 0) {
+            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
+            ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_BAD_PARAM);
+            goto DONE;
+        }
+        /* allocate only after validation so a bad value leaks nothing */
+        if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) {
+            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
+            ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE);
+            goto DONE;
+        }
+        orte_mpiexec_timeout->tv.tv_sec = timeout_secs;
+        orte_mpiexec_timeout->tv.tv_usec = 0;
+        opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev,
+                               orte_timeout_wakeup, jdata);
+        opal_event_set_priority(orte_mpiexec_timeout->ev, ORTE_ERROR_PRI);
+        opal_event_evtimer_add(orte_mpiexec_timeout->ev, &orte_mpiexec_timeout->tv);
+    }
+
     /* spawn the job and its daemons */
     rc = orte_plm.spawn(jdata);
 
@@ -3037,3 +3069,22 @@ static void build_debugger_args(orte_app_context_t *debugger)
         }
     }
 }
+
+/*
+ * Timer callback armed in orterun() when MPIEXEC_TIMEOUT is set.
+ * cbdata is the orte_job_t registered with the timer event; sd and
+ * args are part of the event callback signature and are unused here.
+ */
+static void orte_timeout_wakeup(int sd, short args, void *cbdata)
+{
+    orte_job_t *jdata = (orte_job_t*)cbdata;
+
+    /* this function gets called when the job execution time
+     * has hit a prescribed limit - so just abort
+     */
+    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
+    /* abort the job */
+    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FORCED_EXIT);
+    /* set the global abnormal exit flag */
+    orte_abnormal_term_ordered = true;
+}