Provide a "progress meter" for launch that outputs progress as we are launching, especially on large jobs. Also, provide a timeout mechanism so that we cleanly abort if we don't get a response from the next daemon in a specified time.
This commit was SVN r21359.
Этот коммит содержится в:
родитель
4d3aa5a8a4
Коммит
30a357bd8d
@ -646,6 +646,20 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons)
|
|||||||
*/
|
*/
|
||||||
static bool app_launch_failed;
|
static bool app_launch_failed;
|
||||||
static struct timeval max_daemon_launch_msg_recvd = {0,0};
|
static struct timeval max_daemon_launch_msg_recvd = {0,0};
|
||||||
|
static orte_vpid_t num_daemons_reported=0;
|
||||||
|
static opal_event_t *dmn_report_ev=NULL;
|
||||||
|
|
||||||
|
/* Timeout callback for the daemon-report timer armed via ORTE_DETECT_TIMEOUT: releases the timer event and flags the launch as failed. The fd, event and cbdata arguments are not used. */
|
||||||
|
static void timer_cb(int fd, short event, void *cbdata)
|
||||||
|
{
|
||||||
|
/* release the timer event and clear the pointer so the NULL checks made elsewhere before cancelling the timer see that none is pending */
|
||||||
|
if (NULL != dmn_report_ev) {
|
||||||
|
free(dmn_report_ev);
|
||||||
|
dmn_report_ev = NULL;
|
||||||
|
}
|
||||||
|
/* the timer fired before it was cancelled - declare time is up by flagging the launch as failed */
|
||||||
|
app_launch_failed = true;
|
||||||
|
}
|
||||||
|
|
||||||
/* since the HNP also reports launch of procs, we need to separate out
|
/* since the HNP also reports launch of procs, we need to separate out
|
||||||
* the processing of the message vs its receipt so that the HNP
|
* the processing of the message vs its receipt so that the HNP
|
||||||
@ -670,6 +684,13 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
|
|||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(&mev->sender)));
|
ORTE_NAME_PRINT(&mev->sender)));
|
||||||
|
|
||||||
|
/* got a response - cancel the timer */
|
||||||
|
if (NULL != dmn_report_ev) {
|
||||||
|
opal_event_del(dmn_report_ev);
|
||||||
|
free(dmn_report_ev);
|
||||||
|
dmn_report_ev = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* unpack the jobid being reported */
|
/* unpack the jobid being reported */
|
||||||
cnt = 1;
|
cnt = 1;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &cnt, ORTE_JOBID))) {
|
if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &cnt, ORTE_JOBID))) {
|
||||||
@ -689,6 +710,8 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
|
|||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
num_daemons_reported++;
|
||||||
|
|
||||||
/* get the job data object */
|
/* get the job data object */
|
||||||
if (NULL == (jdata = orte_get_job_data_object(jobid))) {
|
if (NULL == (jdata = orte_get_job_data_object(jobid))) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||||
@ -827,6 +850,14 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data)
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (orte_report_launch_progress) {
|
||||||
|
if (0 == num_daemons_reported % 100 || num_daemons_reported == orte_process_info.num_procs) {
|
||||||
|
opal_output(orte_clean_output, "Reported: %d(%d) daemons %d(%d) procs",
|
||||||
|
(int)num_daemons_reported, (int)orte_process_info.num_procs,
|
||||||
|
(int)jdata->num_launched, (int)jdata->num_procs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
"%s plm:base:app_report_launch completed processing",
|
"%s plm:base:app_report_launch completed processing",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||||
@ -841,7 +872,11 @@ CLEANUP:
|
|||||||
orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
|
orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* restart the timer, if necessary */
|
||||||
|
if (jdata->num_launched < jdata->num_procs && 0 < orte_startup_timeout) {
|
||||||
|
ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -880,7 +915,7 @@ static int orte_plm_base_report_launched(orte_jobid_t job)
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
|
||||||
"%s plm:base:report_launched for job %s",
|
"%s plm:base:report_launched for job %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
@ -892,6 +927,13 @@ static int orte_plm_base_report_launched(orte_jobid_t job)
|
|||||||
return ORTE_ERR_NOT_FOUND;
|
return ORTE_ERR_NOT_FOUND;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* setup a timer - if we don't hear back from a daemon in the
|
||||||
|
* defined time, then we know things have failed
|
||||||
|
*/
|
||||||
|
if (0 < orte_startup_timeout) {
|
||||||
|
ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb);
|
||||||
|
}
|
||||||
|
|
||||||
/* we should get a callback from every daemon that is involved in
|
/* we should get a callback from every daemon that is involved in
|
||||||
* the launch. Fortunately, the mapper keeps track of this number
|
* the launch. Fortunately, the mapper keeps track of this number
|
||||||
* for us since num_nodes = num_participating_daemons
|
* for us since num_nodes = num_participating_daemons
|
||||||
|
@ -127,6 +127,9 @@ bool orte_assume_same_shell = true;
|
|||||||
/* orted exit with barrier */
|
/* orted exit with barrier */
|
||||||
bool orte_orted_exit_with_barrier = true;
|
bool orte_orted_exit_with_barrier = true;
|
||||||
|
|
||||||
|
/* report launch progress */
|
||||||
|
bool orte_report_launch_progress = false;
|
||||||
|
|
||||||
#endif /* !ORTE_DISABLE_FULL_RTE */
|
#endif /* !ORTE_DISABLE_FULL_RTE */
|
||||||
|
|
||||||
int orte_debug_output = -1;
|
int orte_debug_output = -1;
|
||||||
|
@ -514,6 +514,9 @@ ORTE_DECLSPEC extern bool orte_assume_same_shell;
|
|||||||
/* whether or not to barrier the orteds upon exit */
|
/* whether or not to barrier the orteds upon exit */
|
||||||
ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
|
ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier;
|
||||||
|
|
||||||
|
/* whether or not to report launch progress */
|
||||||
|
ORTE_DECLSPEC extern bool orte_report_launch_progress;
|
||||||
|
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
@ -284,6 +284,18 @@ int orte_register_params(void)
|
|||||||
mca_base_param_lookup_int(tmp, &value);
|
mca_base_param_lookup_int(tmp, &value);
|
||||||
orte_assume_same_shell = OPAL_INT_TO_BOOL(value);
|
orte_assume_same_shell = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
|
/* whether or not to report launch progress */
|
||||||
|
mca_base_param_reg_int_name("orte", "report_launch_progress",
|
||||||
|
"Output a brief periodic report on launch progress [default: no]",
|
||||||
|
false, false,
|
||||||
|
(int) false, &value);
|
||||||
|
orte_report_launch_progress = OPAL_INT_TO_BOOL(value);
|
||||||
|
if (orte_report_launch_progress) {
|
||||||
|
/* ensure the startup timeout is set to something reasonable */
|
||||||
|
if (0 == orte_startup_timeout) {
|
||||||
|
orte_startup_timeout = 2000; /* default to 2 seconds */
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user