1
1

Provide a "progress meter" for launch that reports progress while a job is launching, which is especially useful on large jobs. Also, provide a timeout mechanism so that we abort cleanly if we don't get a response from the next daemon within a specified time.

This commit was SVN r21359.
Этот коммит содержится в:
Ralph Castain 2009-06-02 23:52:02 +00:00
родитель 4d3aa5a8a4
Коммит 30a357bd8d
4 изменённых файлов: 62 добавлений и 2 удалений

Просмотреть файл

@ -646,6 +646,20 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
*/
static bool app_launch_failed;
static struct timeval max_daemon_launch_msg_recvd = {0,0};
static orte_vpid_t num_daemons_reported=0;
static opal_event_t *dmn_report_ev=NULL;
/*
 * Timer callback for the daemon-report timeout. When it runs, the
 * launch is marked as failed so that waiting commands can progress.
 * The arguments are not used.
 */
static void timer_cb(int fd, short event, void *cbdata)
{
    /* time is up - record the failure */
    app_launch_failed = true;

    /* release the timer event; free() accepts NULL, so no guard is needed */
    free(dmn_report_ev);
    dmn_report_ev = NULL;
}
/* since the HNP also reports launch of procs, we need to separate out
* the processing of the message vs its receipt so that the HNP
@ -670,6 +684,13 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&mev->sender)));
/* got a response - cancel the timer */
if (NULL != dmn_report_ev) {
opal_event_del(dmn_report_ev);
free(dmn_report_ev);
dmn_report_ev = NULL;
}
/* unpack the jobid being reported */
cnt = 1;
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &cnt, ORTE_JOBID))) {
@ -689,6 +710,8 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
goto CLEANUP;
}
num_daemons_reported++;
/* get the job data object */
if (NULL == (jdata = orte_get_job_data_object(jobid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
@ -827,6 +850,14 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
ORTE_ERROR_LOG(rc);
}
if (orte_report_launch_progress) {
if (0 == num_daemons_reported % 100 || num_daemons_reported == orte_process_info.num_procs) {
opal_output(orte_clean_output, "Reported: %d(%d) daemons %d(%d) procs",
(int)num_daemons_reported, (int)orte_process_info.num_procs,
(int)jdata->num_launched, (int)jdata->num_procs);
}
}
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
"%s plm:base:app_report_launch completed processing",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -842,6 +873,10 @@ CLEANUP:
}
}
/* restart the timer, if necessary */
if (jdata->num_launched < jdata->num_procs && 0 < orte_startup_timeout) {
ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
}
}
@ -892,6 +927,13 @@ static int orte_plm_base_report_launched(orte_jobid_t job)
return ORTE_ERR_NOT_FOUND;
}
/* setup a timer - if we don't hear back from a daemon in the
* defined time, then we know things have failed
*/
if (0 < orte_startup_timeout) {
ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
}
/* we should get a callback from every daemon that is involved in
* the launch. Fortunately, the mapper keeps track of this number
* for us since num_nodes = num_participating_daemons

Просмотреть файл

@ -127,6 +127,9 @@ bool orte_assume_same_shell = true;
/* orted exit with barrier */
bool orte_orted_exit_with_barrier = true;
/* report launch progress */
bool orte_report_launch_progress = false;
#endif /* !ORTE_DISABLE_FULL_RTE */
int orte_debug_output = -1;

Просмотреть файл

@ -514,6 +514,9 @@ ORTE_DECLSPEC extern bool orte_assume_same_shell;
/* whether or not to barrier the orteds upon exit */
ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
/* whether or not to report launch progress */
ORTE_DECLSPEC extern bool orte_report_launch_progress;
#endif /* ORTE_DISABLE_FULL_SUPPORT */
END_C_DECLS

Просмотреть файл

@ -284,6 +284,18 @@ int orte_register_params(void)
mca_base_param_lookup_int(tmp, &value);
orte_assume_same_shell = OPAL_INT_TO_BOOL(value);
/* whether or not to report launch progress */
mca_base_param_reg_int_name("orte", "report_launch_progress",
"Output a brief periodic report on launch progress [default: no]",
false, false,
(int) false, &value);
orte_report_launch_progress = OPAL_INT_TO_BOOL(value);
if (orte_report_launch_progress) {
/* ensure the startup timeout is set to something reasonable */
if (0 == orte_startup_timeout) {
orte_startup_timeout = 2000; /* default to 2 seconds */
}
}
#endif /* ORTE_DISABLE_FULL_SUPPORT */
return ORTE_SUCCESS;