1
1

Per request from Dave Goodell, add support for MPIEXEC_TIMEOUT - if set in the environment, terminate the job after the specified number of seconds has passed. Equivalent to MPICH functionality.

cmr=v1.7.4:reviewer=dgoodell:subject=add support for MPIEXEC_TIMEOUT

This commit was SVN r29831.
Этот коммит содержится в:
Ralph Castain 2013-12-07 01:58:32 +00:00
родитель 3bd9c603ff
Коммит d44e4a311f
4 изменённых файлов: 42 добавлений и 2 удалений

Просмотреть файл

@ -783,6 +783,13 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
"%s state:base:check_job_completed all jobs terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* stop the job timeout event, if set */
if (NULL != orte_mpiexec_timeout) {
OBJ_RELEASE(orte_mpiexec_timeout);
orte_mpiexec_timeout = NULL;
}
/* set the exit status to 0 - this will only happen if it
* wasn't already set by an error condition
*/

Просмотреть файл

@ -123,9 +123,9 @@ bool orte_orteds_term_ordered = false;
bool orte_allowed_exit_without_sync = false;
int orte_startup_timeout;
int orte_timeout_usec_per_proc;
float orte_max_timeout;
orte_timer_t *orte_mpiexec_timeout = NULL;
opal_buffer_t *orte_tree_launch_cmd = NULL;

Просмотреть файл

@ -668,7 +668,7 @@ ORTE_DECLSPEC extern int orte_startup_timeout;
ORTE_DECLSPEC extern int orte_timeout_usec_per_proc;
ORTE_DECLSPEC extern float orte_max_timeout;
ORTE_DECLSPEC extern orte_timer_t *orte_mpiexec_timeout;
ORTE_DECLSPEC extern opal_buffer_t *orte_tree_launch_cmd;
/* global arrays for data storage */

Просмотреть файл

@ -137,6 +137,8 @@ static void open_fifo (void);
ORTE_DECLSPEC void* MPIR_Breakpoint(void);
static void orte_timeout_wakeup(int sd, short args, void *cbdata);
/*
* Breakpoint function for parallel debuggers
*/
@ -1022,6 +1024,23 @@ int orterun(int argc, char *argv[])
}
}
/* check for a job timeout specification, to be provided in seconds
* as that is what MPICH used
*/
if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) {
if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE);
goto DONE;
}
orte_mpiexec_timeout->tv.tv_sec = strtol(param, NULL, 10);
orte_mpiexec_timeout->tv.tv_usec = 0;
opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev,
orte_timeout_wakeup, jdata);
opal_event_set_priority(orte_mpiexec_timeout->ev, ORTE_ERROR_PRI);
opal_event_evtimer_add(orte_mpiexec_timeout->ev, &orte_mpiexec_timeout->tv);
}
/* spawn the job and its daemons */
rc = orte_plm.spawn(jdata);
@ -3037,3 +3056,17 @@ static void build_debugger_args(orte_app_context_t *debugger)
}
}
}
/*
 * Event callback that aborts a job once its execution time limit
 * has been reached.
 *
 * sd     - unused by this handler
 * args   - unused by this handler
 * cbdata - the orte_job_t to abort, handed over as an opaque pointer
 */
void orte_timeout_wakeup(int sd, short args, void *cbdata)
{
    orte_job_t *timed_out_job = (orte_job_t *) cbdata;

    /* the time limit has expired, so report the default error exit code */
    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);

    /* push the job into the forced-exit state to abort it */
    ORTE_ACTIVATE_JOB_STATE(timed_out_job, ORTE_JOB_STATE_FORCED_EXIT);

    /* flag globally that an abnormal termination has been ordered */
    orte_abnormal_term_ordered = true;
}