Provide a "progress meter" for launch that outputs progress as we are launching, especially on large jobs. Also, provide a timeout mechanism so that we cleanly abort if we don't get a response from the next daemon in a specified time.
This commit was SVN r21359.
Этот коммит содержится в:
родитель
4d3aa5a8a4
Коммит
30a357bd8d
@ -646,6 +646,20 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
|
||||
*/
|
||||
static bool app_launch_failed;
|
||||
static struct timeval max_daemon_launch_msg_recvd = {0,0};
|
||||
static orte_vpid_t num_daemons_reported=0;
|
||||
static opal_event_t *dmn_report_ev=NULL;
|
||||
|
||||
/* timer callback for the daemon-report timer: releases the timer event recorded in dmn_report_ev and flags the launch as failed; the fd, event and cbdata arguments are not used */
|
||||
static void timer_cb(int fd, short event, void *cbdata)
|
||||
{
|
||||
/* free the timer event and clear the pointer so the response path sees no timer to cancel */
|
||||
if (NULL != dmn_report_ev) {
|
||||
free(dmn_report_ev);
|
||||
dmn_report_ev = NULL;
|
||||
}
|
||||
/* declare time is up: mark the launch as failed */
|
||||
app_launch_failed = true;
|
||||
}
|
||||
|
||||
/* since the HNP also reports launch of procs, we need to separate out
|
||||
* the processing of the message vs its receipt so that the HNP
|
||||
@ -670,6 +684,13 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&mev->sender)));
|
||||
|
||||
/* got a response - cancel the timer */
|
||||
if (NULL != dmn_report_ev) {
|
||||
opal_event_del(dmn_report_ev);
|
||||
free(dmn_report_ev);
|
||||
dmn_report_ev = NULL;
|
||||
}
|
||||
|
||||
/* unpack the jobid being reported */
|
||||
cnt = 1;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &cnt, ORTE_JOBID))) {
|
||||
@ -689,6 +710,8 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
num_daemons_reported++;
|
||||
|
||||
/* get the job data object */
|
||||
if (NULL == (jdata = orte_get_job_data_object(jobid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
@ -827,6 +850,14 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
|
||||
if (orte_report_launch_progress) {
|
||||
if (0 == num_daemons_reported % 100 || num_daemons_reported == orte_process_info.num_procs) {
|
||||
opal_output(orte_clean_output, "Reported: %d(%d) daemons %d(%d) procs",
|
||||
(int)num_daemons_reported, (int)orte_process_info.num_procs,
|
||||
(int)jdata->num_launched, (int)jdata->num_procs);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:app_report_launch completed processing",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
@ -841,7 +872,11 @@ CLEANUP:
|
||||
orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
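/* restart the timer if some procs have not yet launched and a startup timeout is set. NOTE(review): jdata is dereferenced here after the CLEANUP label, but an earlier goto CLEANUP precedes the jdata lookup - confirm jdata cannot be NULL/unset on that path */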
|
||||
if (jdata->num_launched < jdata->num_procs && 0 < orte_startup_timeout) {
|
||||
ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -880,7 +915,7 @@ static int orte_plm_base_report_launched(orte_jobid_t job)
|
||||
{
|
||||
int rc;
|
||||
orte_job_t *jdata;
|
||||
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||
"%s plm:base:report_launched for job %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -892,6 +927,13 @@ static int orte_plm_base_report_launched(orte_jobid_t job)
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* setup a timer - if we don't hear back from a daemon in the
|
||||
* defined time, then we know things have failed
|
||||
*/
|
||||
if (0 < orte_startup_timeout) {
|
||||
ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
|
||||
}
|
||||
|
||||
/* we should get a callback from every daemon that is involved in
|
||||
* the launch. Fortunately, the mapper keeps track of this number
|
||||
* for us since num_nodes = num_participating_daemons
|
||||
|
@ -127,6 +127,9 @@ bool orte_assume_same_shell = true;
|
||||
/* orted exit with barrier */
|
||||
bool orte_orted_exit_with_barrier = true;
|
||||
|
||||
/* report launch progress */
|
||||
bool orte_report_launch_progress = false;
|
||||
|
||||
#endif /* !ORTE_DISABLE_FULL_RTE */
|
||||
|
||||
int orte_debug_output = -1;
|
||||
|
@ -514,6 +514,9 @@ ORTE_DECLSPEC extern bool orte_assume_same_shell;
|
||||
/* whether or not to barrier the orteds upon exit */
|
||||
ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
|
||||
|
||||
/* whether or not to report launch progress */
|
||||
ORTE_DECLSPEC extern bool orte_report_launch_progress;
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
END_C_DECLS
|
||||
|
@ -284,6 +284,18 @@ int orte_register_params(void)
|
||||
mca_base_param_lookup_int(tmp, &value);
|
||||
orte_assume_same_shell = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
/* whether or not to report launch progress */
|
||||
mca_base_param_reg_int_name("orte", "report_launch_progress",
|
||||
"Output a brief periodic report on launch progress [default: no]",
|
||||
false, false,
|
||||
(int) false, &value);
|
||||
orte_report_launch_progress = OPAL_INT_TO_BOOL(value);
|
||||
if (orte_report_launch_progress) {
|
||||
/* ensure the startup timeout is set to something reasonable */
|
||||
if (0 == orte_startup_timeout) {
|
||||
orte_startup_timeout = 2000; /* default to 2 seconds */
|
||||
}
|
||||
}
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user