1
1

Fix reporting of launch progress so the numbers are correct and appear when they should

This commit was SVN r26342.
Этот коммит содержится в:
Ralph Castain 2012-04-26 00:10:09 +00:00
родитель 3b5b185c86
Коммит 3461809341
5 изменённых файлов: 37 добавлений и 5 удалений

Просмотреть файл

@ -372,6 +372,11 @@ void orte_plm_base_launch_apps(int fd, short args, void *cbdata)
return;
}
/* track that we automatically are considered to have reported - used
* only to report out launch progress
*/
caddy->jdata->num_daemons_reported++;
/* setup a timer - if we don't launch within the
* defined time, then we know things have failed
*/

Просмотреть файл

@ -304,9 +304,7 @@ void orte_plm_base_recv(int status, orte_process_name_t* sender,
if (orte_report_launch_progress) {
if (0 == jdata->num_daemons_reported % 100 ||
jdata->num_daemons_reported == orte_process_info.num_procs) {
opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
(int)jdata->num_daemons_reported, (int)orte_process_info.num_procs,
(int)jdata->num_launched, (int)jdata->num_procs);
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
}
}
}

Просмотреть файл

@ -148,6 +148,8 @@ typedef int32_t orte_job_state_t;
#define ORTE_JOB_STATE_FORCED_EXIT (ORTE_JOB_STATE_ERROR + 14)
#define ORTE_JOB_STATE_SILENT_ABORT (ORTE_JOB_STATE_ERROR + 16) /* an error occurred and was reported elsewhere, so error out quietly */
#define ORTE_JOB_STATE_REPORT_PROGRESS (ORTE_JOB_STATE_ERROR + 17) /* report launch progress - not an error */
/* Define a boundary so that external developers
* have a starting point for defining their own
* job states

Просмотреть файл

@ -63,14 +63,23 @@ orte_state_base_module_t orte_state_hnp_module = {
orte_state_base_remove_proc_state
};
static void ignore_cbfunc(int fd, short argc, void *cbdata)
/*
 * State-machine callback for the local-launch-complete step.
 *
 * If launch-progress reporting is enabled (orte_report_launch_progress)
 * and the number of daemons that have reported for this job is either a
 * multiple of 100 or equal to orte_process_info.num_procs, activate the
 * ORTE_JOB_STATE_REPORT_PROGRESS job state for the job.
 *
 * fd and argc are unused; cbdata is the orte_state_caddy_t carrying the
 * job, and it is released before returning on every path.
 */
static void local_launch_complete(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = caddy->jdata;

    /* report only at every 100th daemon, or once the count matches num_procs */
    if (orte_report_launch_progress &&
        (0 == jdata->num_daemons_reported % 100 ||
         jdata->num_daemons_reported == orte_process_info.num_procs)) {
        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
    }

    OBJ_RELEASE(caddy);
}
static void track_procs(int fd, short argc, void *cbdata);
static void check_all_complete(int fd, short argc, void *cbdata);
static void report_progress(int fd, short argc, void *cbdata);
/* defined default state machine sequence - individual
* plm's must add a state for launching daemons
@ -99,7 +108,7 @@ static orte_state_cbfunc_t launch_callbacks[] = {
orte_rmaps_base_map_job,
orte_plm_base_complete_setup,
orte_plm_base_launch_apps,
ignore_cbfunc, /* HNP doesn't need to process local_launch_complete */
local_launch_complete,
orte_plm_base_post_launch,
orte_plm_base_registered,
check_all_complete,
@ -148,6 +157,11 @@ static int init(void)
orte_quit, ORTE_ERROR_PRI))) {
ORTE_ERROR_LOG(rc);
}
/* add callback to report progress, if requested */
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_REPORT_PROGRESS,
report_progress, ORTE_INFO_PRI))) {
ORTE_ERROR_LOG(rc);
}
if (5 < opal_output_get_verbosity(orte_state_base_output)) {
orte_state_base_print_job_state_machine();
}
@ -207,6 +221,17 @@ static void cleanup_node(orte_proc_t *proc)
}
}
/*
 * Print one launch-progress line for a job to orte_clean_output.
 *
 * The line shows the job's reported-daemon count against
 * orte_process_info.num_procs, and the job's launched-proc count against
 * the job's num_procs. All four values are cast to int for printing.
 *
 * fd and argc are unused; cbdata is the orte_state_caddy_t carrying the
 * job, and it is released once the line has been written.
 */
static void report_progress(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = state->jdata;
    int daemons_reported = (int)jdata->num_daemons_reported;
    int daemons_total = (int)orte_process_info.num_procs;
    int procs_launched = (int)jdata->num_launched;
    int procs_total = (int)jdata->num_procs;

    opal_output(orte_clean_output, "App launch reported: %d (out of %d) daemons - %d (out of %d) procs",
                daemons_reported, daemons_total,
                procs_launched, procs_total);

    OBJ_RELEASE(state);
}
static void track_procs(int fd, short argc, void *cbdata)
{
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

Просмотреть файл

@ -267,6 +267,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
return "DAEMONS TERMINATED";
case ORTE_JOB_STATE_SILENT_ABORT:
return "ERROR REPORTED ELSEWHERE";
case ORTE_JOB_STATE_REPORT_PROGRESS:
return "REPORT PROGRESS";
case ORTE_JOB_STATE_ANY:
return "ANY";
default: