1
1
This commit was SVN r23150.
Этот коммит содержится в:
Ralph Castain 2010-05-16 20:23:26 +00:00
родитель de85049477
Коммит 7e6985edbf
4 изменённых файлов: 32 добавлений и 23 удалений

Просмотреть файл

@@ -64,7 +64,7 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
orte_proc_state_t state,
orte_exit_code_t exit_code)
{
-int rc;
+int rc=ORTE_SUCCESS;
int i;
orte_errmgr_stack_state_t stack_state;
orte_errmgr_base_module_t *module;

Просмотреть файл

@@ -1053,6 +1053,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
orte_node_t *node;
orte_app_context_t *app;
orte_job_map_t *map;
char *app_name;
int rc, i, n;
/* get the proc_t object for this process */
@@ -1067,12 +1068,13 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
if (jdata->max_global_restarts < pdata->relocates) {
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
}
/* if it is a daemon that died, we need to flag all of its procs
* to be relocated
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
map = jdata->map;
app_name = "orted";
for (n=0; n < map->nodes->size; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
@@ -1103,7 +1105,11 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
OBJ_RELEASE(node);
break;
}
} else {
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
app_name = app->app;
}
/* reset the job params for restart */
orte_plm_base_reset_job(jdata);
@@ -1117,7 +1123,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
ORTE_NAME_PRINT(proc)));
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
-opal_output(0, "FAILED TO RESTART APP %s on error %s", app->app, ORTE_ERROR_NAME(rc));
+opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc));
return rc;
}

Просмотреть файл

@@ -116,7 +116,7 @@ static int update_state(orte_jobid_t job,
orte_odls_child_t *child;
opal_buffer_t alert;
orte_plm_cmd_flag_t cmd;
-int rc;
+int rc=ORTE_SUCCESS;
orte_vpid_t null=ORTE_VPID_INVALID;
/* indicate that this is the end of the line */
@@ -281,26 +281,22 @@ static int update_state(orte_jobid_t job,
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
/* Decrement the number of local procs */
jobdat->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid);
}
if (jobdat->enable_recovery && child->restarts < jobdat->max_local_restarts) {
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc), child->restarts));
rc = orte_odls.restart_proc(child);
}
return rc;
}
}
if (jobdat->enable_recovery && child->restarts < jobdat->max_local_restarts) {
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc), child->restarts));
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
return ORTE_SUCCESS;
}
/* let it fall thru to abort */
}
/* Decrement the number of local procs */
jobdat->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid);
/* let the proc be reported back when terminated */
return ORTE_SUCCESS;
}
if (ORTE_PROC_STATE_TERMINATED < state) {

Просмотреть файл

@@ -85,9 +85,9 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *data,
orte_jobid_t job)
{
int rc;
-orte_job_t *jdata;
+orte_job_t *jdata=NULL;
orte_proc_t *proc;
-orte_job_map_t *map;
+orte_job_map_t *map=NULL;
opal_buffer_t *wireup;
opal_byte_object_t bo, *boptr;
int32_t numbytes, *restarts;
@@ -2917,6 +2917,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
ORTE_NAME_PRINT(child->name)));
/* find this child's jobdat */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
@@ -2925,6 +2926,12 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
break;
}
}
if (NULL == jobdat) {
/* not found */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
child->state = ORTE_PROC_STATE_FAILED_TO_START;
child->exit_code = 0;
child->waitpid_recvd = false;