Fix reporting of launch progress so the numbers are correct and appear when they should
This commit was SVN r26342.
Этот коммит содержится в:
родитель
3b5b185c86
Коммит
3461809341
@ -372,6 +372,11 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
|
||||
return;
|
||||
}
|
||||
|
||||
/* track that we are automatically considered to have reported - used
|
||||
* only to report out launch progress
|
||||
*/
|
||||
caddy->jdata->num_daemons_reported++;
|
||||
|
||||
/* setup a timer - if we don't launch within the
|
||||
* defined time, then we know things have failed
|
||||
*/
|
||||
|
@ -304,9 +304,7 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
|
||||
if (orte_report_launch_progress) {
|
||||
if (0 == jdata->num_daemons_reported % 100 ||
|
||||
jdata->num_daemons_reported == orte_process_info.num_procs) {
|
||||
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
|
||||
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
||||
(int)jdata->num_launched, (int)jdata->num_procs);
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -148,6 +148,8 @@ typedef int32_t orte_job_state_t;
|
||||
#define ORTE_JOB_STATE_FORCED_EXIT (ORTE_JOB_STATE_ERROR + 14)
|
||||
#define ORTE_JOB_STATE_SILENT_ABORT (ORTE_JOB_STATE_ERROR + 16) /* an error occurred and was reported elsewhere, so error out quietly */
|
||||
|
||||
#define ORTE_JOB_STATE_REPORT_PROGRESS (ORTE_JOB_STATE_ERROR + 17) /* report launch progress - not an error */
|
||||
|
||||
/* Define a boundary so that external developers
|
||||
* have a starting point for defining their own
|
||||
* job states
|
||||
|
@ -63,14 +63,23 @@ orte_state_base_module_t orte_state_hnp_module = {
|
||||
orte_state_base_remove_proc_state
|
||||
};
|
||||
|
||||
static void ignore_cbfunc(int fd, short argc, void *cbdata)
|
||||
static void local_launch_complete(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = state->jdata;
|
||||
|
||||
if (orte_report_launch_progress) {
|
||||
if (0 == jdata->num_daemons_reported % 100 ||
|
||||
jdata->num_daemons_reported == orte_process_info.num_procs) {
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
|
||||
}
|
||||
}
|
||||
OBJ_RELEASE(state);
|
||||
}
|
||||
|
||||
static void track_procs(int fd, short argc, void *cbdata);
|
||||
static void check_all_complete(int fd, short argc, void *cbdata);
|
||||
static void report_progress(int fd, short argc, void *cbdata);
|
||||
|
||||
/* defined default state machine sequence - individual
|
||||
* plm's must add a state for launching daemons
|
||||
@ -99,7 +108,7 @@ static orte_state_cbfunc_t launch_callbacks[] = {
|
||||
orte_rmaps_base_map_job,
|
||||
orte_plm_base_complete_setup,
|
||||
orte_plm_base_launch_apps,
|
||||
ignore_cbfunc, /* HNP doesn't need to process local_launch_complete */
|
||||
local_launch_complete,
|
||||
orte_plm_base_post_launch,
|
||||
orte_plm_base_registered,
|
||||
check_all_complete,
|
||||
@ -148,6 +157,11 @@ static int init(void)
|
||||
orte_quit, ORTE_ERROR_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* add callback to report progress, if requested */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_REPORT_PROGRESS,
|
||||
report_progress, ORTE_INFO_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
if (5 < opal_output_get_verbosity(orte_state_base_output)) {
|
||||
orte_state_base_print_job_state_machine();
|
||||
}
|
||||
@ -207,6 +221,17 @@ static void cleanup_node(orte_proc_t *proc)
|
||||
}
|
||||
}
|
||||
|
||||
static void report_progress(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata = caddy->jdata;
|
||||
|
||||
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
|
||||
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
|
||||
(int)jdata->num_launched, (int)jdata->num_procs);
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static void track_procs(int fd, short argc, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
@ -267,6 +267,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
||||
return "DAEMONS TERMINATED";
|
||||
case ORTE_JOB_STATE_SILENT_ABORT:
|
||||
return "ERROR REPORTED ELSEWHERE";
|
||||
case ORTE_JOB_STATE_REPORT_PROGRESS:
|
||||
return "REPORT PROGRESS";
|
||||
case ORTE_JOB_STATE_ANY:
|
||||
return "ANY";
|
||||
default:
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user