diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 460d5c649f..7c0f3f29b3 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -646,6 +646,20 @@ int orte_plm_base_daemon_callback(orte_std_cntr_t num_daemons) */ static bool app_launch_failed; static struct timeval max_daemon_launch_msg_recvd = {0,0}; +static orte_vpid_t num_daemons_reported=0; +static opal_event_t *dmn_report_ev=NULL; + +/* catch timeout: release the timer event and flag the launch as failed to allow cmds to progress */ +static void timer_cb(int fd, short event, void *cbdata) +{ + /* free event */ + if (NULL != dmn_report_ev) { + free(dmn_report_ev); + dmn_report_ev = NULL; + } + /* declare time is up */ + app_launch_failed = true; +} /* since the HNP also reports launch of procs, we need to separate out * the processing of the message vs its receipt so that the HNP @@ -670,6 +684,13 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mev->sender))); + /* got a response - cancel the timer */ + if (NULL != dmn_report_ev) { + opal_event_del(dmn_report_ev); + free(dmn_report_ev); + dmn_report_ev = NULL; + } + /* unpack the jobid being reported */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jobid, &cnt, ORTE_JOBID))) { @@ -689,6 +710,8 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data) goto CLEANUP; } + num_daemons_reported++; + /* get the job data object */ if (NULL == (jdata = orte_get_job_data_object(jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); @@ -827,6 +850,14 @@ void orte_plm_base_app_report_launch(int fd, short event, void *data) ORTE_ERROR_LOG(rc); } + if (orte_report_launch_progress) { + if (0 == num_daemons_reported % 100 || num_daemons_reported == orte_process_info.num_procs) { + opal_output(orte_clean_output, "Reported: %d(%d) daemons %d(%d) procs", + (int)num_daemons_reported, (int)orte_process_info.num_procs, 
(int)jdata->num_launched, (int)jdata->num_procs); + } + } + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:app_report_launch completed processing", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -841,7 +872,11 @@ CLEANUP: orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code); } } - + + /* restart the timer if procs are still outstanding - jdata may be NULL (failed job lookup) */ + if (NULL != jdata && jdata->num_launched < jdata->num_procs && 0 < orte_startup_timeout) { + ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb); + } } @@ -880,7 +915,7 @@ static int orte_plm_base_report_launched(orte_jobid_t job) { int rc; orte_job_t *jdata; - + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:report_launched for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -892,6 +927,13 @@ static int orte_plm_base_report_launched(orte_jobid_t job) return ORTE_ERR_NOT_FOUND; } + /* set up a timer - if we don't hear back from a daemon in the + * defined time, then we know things have failed + */ + if (0 < orte_startup_timeout) { + ORTE_DETECT_TIMEOUT(&dmn_report_ev, orte_startup_timeout, 1000, 10000000, timer_cb); + } + /* we should get a callback from every daemon that is involved in * the launch. 
Fortunately, the mapper keeps track of this number * for us since num_nodes = num_participating_daemons diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 2eea384d05..248342632c 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -127,6 +127,9 @@ bool orte_assume_same_shell = true; /* orted exit with barrier */ bool orte_orted_exit_with_barrier = true; +/* report launch progress */ +bool orte_report_launch_progress = false; + #endif /* !ORTE_DISABLE_FULL_RTE */ int orte_debug_output = -1; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index e790b12263..e4c23f82dd 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -514,6 +514,9 @@ ORTE_DECLSPEC extern bool orte_assume_same_shell; /* whether or not to barrier the orteds upon exit */ ORTE_DECLSPEC extern bool orte_orted_exit_with_barrier; +/* whether or not to report launch progress */ +ORTE_DECLSPEC extern bool orte_report_launch_progress; + #endif /* ORTE_DISABLE_FULL_SUPPORT */ END_C_DECLS diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 07746b89e9..fb02abc093 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -284,6 +284,18 @@ int orte_register_params(void) mca_base_param_lookup_int(tmp, &value); orte_assume_same_shell = OPAL_INT_TO_BOOL(value); + /* whether or not to report launch progress */ + mca_base_param_reg_int_name("orte", "report_launch_progress", + "Output a brief periodic report on launch progress [default: no]", + false, false, + (int) false, &value); + orte_report_launch_progress = OPAL_INT_TO_BOOL(value); + if (orte_report_launch_progress) { + /* ensure the startup timeout is set to something reasonable */ + if (0 == orte_startup_timeout) { + orte_startup_timeout = 2000; /* default to 2 seconds */ + } + } #endif /* ORTE_DISABLE_FULL_SUPPORT */ return ORTE_SUCCESS;