diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index 231d8d088b..d03cc2eeda 100644 --- a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -64,7 +64,7 @@ int orte_errmgr_base_update_state(orte_jobid_t job, orte_proc_state_t state, orte_exit_code_t exit_code) { - int rc; + int rc=ORTE_SUCCESS; int i; orte_errmgr_stack_state_t stack_state; orte_errmgr_base_module_t *module; diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index 99cd267943..e277fb6006 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -1053,6 +1053,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) orte_node_t *node; orte_app_context_t *app; orte_job_map_t *map; + char *app_name; int rc, i, n; /* get the proc_t object for this process */ @@ -1067,12 +1068,13 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) if (jdata->max_global_restarts < pdata->relocates) { return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED; } - + /* if it is a daemon that died, we need to flag all of its procs * to be relocated */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { map = jdata->map; + app_name = "orted"; for (n=0; n < map->nodes->size; n++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { continue; @@ -1103,7 +1105,11 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) OBJ_RELEASE(node); break; } + } else { + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); + app_name = app->app; } + /* reset the job params for restart */ orte_plm_base_reset_job(jdata); @@ -1117,7 +1123,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) ORTE_NAME_PRINT(proc))); if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { - opal_output(0, "FAILED TO RESTART APP %s on error %s", app->app, ORTE_ERROR_NAME(rc)); + opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); return rc; } diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index 8f098d1cc3..26c35bd068 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -116,7 +116,7 @@ static int update_state(orte_jobid_t job, orte_odls_child_t *child; opal_buffer_t alert; orte_plm_cmd_flag_t cmd; - int rc; + int rc=ORTE_SUCCESS; orte_vpid_t null=ORTE_VPID_INVALID; /* indicate that this is the end of the line */ @@ -281,26 +281,22 @@ static int update_state(orte_jobid_t job, if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; child->exit_code = exit_code; + /* Decrement the number of local procs */ + jobdat->num_local_procs--; + /* kill this proc */ + killprocs(proc->jobid, proc->vpid); } + if (jobdat->enable_recovery && child->restarts < jobdat->max_local_restarts) { + child->restarts++; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:orted restarting proc %s for the %d time", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), child->restarts)); + rc = orte_odls.restart_proc(child); + } + return rc; } } - if (jobdat->enable_recovery && child->restarts < jobdat->max_local_restarts) { - child->restarts++; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted restarting proc %s for the %d time", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(proc), child->restarts)); - if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { - return ORTE_SUCCESS; - } - /* let it fall thru to abort */ - } - /* Decrement the number of local procs */ - jobdat->num_local_procs--; - /* kill this proc */ - killprocs(proc->jobid, proc->vpid); - /* let the proc be reported back when terminated */ - return ORTE_SUCCESS; } if (ORTE_PROC_STATE_TERMINATED < state) { diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 88bc655d7b..171d061d6e 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -85,9 +85,9 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data, orte_jobid_t job) { int rc; - orte_job_t *jdata; + orte_job_t *jdata=NULL; orte_proc_t *proc; - orte_job_map_t *map; + orte_job_map_t *map=NULL; opal_buffer_t *wireup; opal_byte_object_t bo, *boptr; int32_t numbytes, *restarts; @@ -2917,6 +2917,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, ORTE_NAME_PRINT(child->name))); /* find this child's jobdat */ + jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { @@ -2925,6 +2926,12 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, break; } } + if (NULL == jobdat) { + /* not found */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + child->state = ORTE_PROC_STATE_FAILED_TO_START; child->exit_code = 0; child->waitpid_recvd = false;