1
1

Restore the global restart capability

This commit was SVN r23089.
Этот коммит содержится в:
Ralph Castain 2010-05-04 02:40:29 +00:00
родитель 3ca0b4138b
Коммит cd569f8a79

Просмотреть файл

@ -32,7 +32,8 @@
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h" #include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h" #include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/base/base.h" #include "orte/mca/plm/base/plm_private.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rmaps/rmaps_types.h"
#if ORTE_ENABLE_SENSORS #if ORTE_ENABLE_SENSORS
#include "orte/mca/sensor/sensor.h" #include "orte/mca/sensor/sensor.h"
@ -51,6 +52,8 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
orte_exit_code_t exit_code); orte_exit_code_t exit_code);
static void check_job_complete(orte_job_t *jdata); static void check_job_complete(orte_job_t *jdata);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid); static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc);
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
/* /*
* Module functions: Global * Module functions: Global
@ -80,7 +83,7 @@ static int ft_event(int state);
/****************** /******************
* ORCM module * HNP module
******************/ ******************/
orte_errmgr_base_module_t orte_errmgr_hnp_module = { orte_errmgr_base_module_t orte_errmgr_hnp_module = {
init, init,
@ -114,7 +117,6 @@ static int update_state(orte_jobid_t job,
orte_job_t *jdata; orte_job_t *jdata;
orte_exit_code_t sts; orte_exit_code_t sts;
orte_odls_child_t *child; orte_odls_child_t *child;
opal_list_item_t *item;
int rc; int rc;
/* indicate that this is the end of the line */ /* indicate that this is the end of the line */
@ -256,25 +258,27 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_COMM_FAILED: case ORTE_PROC_STATE_COMM_FAILED:
if (jdata->enable_recovery) { if (jdata->enable_recovery) {
/* is this a local proc */ /* is this a local proc */
child = NULL; if (NULL != (child = proc_is_local(proc))) {
for (item = opal_list_get_first(&orte_local_children); /* local proc - see if it has reached its local restart limit */
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
break;
}
}
if (NULL != child) {
/* see if this child has reached its local restart limit */
if (child->restarts < jdata->max_local_restarts) { if (child->restarts < jdata->max_local_restarts) {
child->restarts++; child->restarts++;
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* let it fall thru to abort */ /* let it fall thru to abort */
} else {
/* see if we can relocate it somewhere else */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
return ORTE_SUCCESS;
}
/* let it fall thru to abort */
} }
} else {
/* this is a remote process - see if we can relocate it */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
return ORTE_SUCCESS;
}
/* guess not - let it fall thru to abort */
} }
} }
update_proc(jdata, proc, state, exit_code); update_proc(jdata, proc, state, exit_code);
@ -910,3 +914,67 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
OBJ_DESTRUCT(&proc); OBJ_DESTRUCT(&proc);
} }
/*
 * Attempt to relocate a failed process by resetting its job and
 * re-spawning it.
 *
 * @param jdata  Job that owns the failed process
 * @param proc   Name of the failed process; its vpid is used as the
 *               index into jdata->procs
 *
 * @return ORTE_SUCCESS if the spawn request succeeded,
 *         ORTE_ERR_NOT_FOUND if no proc object exists for this vpid,
 *         ORTE_ERR_RELOCATE_LIMIT_EXCEEDED once the proc has been
 *         relocated more than jdata->max_global_restarts times,
 *         or the error code returned by orte_plm.spawn.
 */
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
{
    orte_proc_t *pdata;
    orte_node_t *node, *newnode;
    orte_app_context_t *app;
    const char *app_name;
    int rc;

    /* get the proc_t object for this process */
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
    if (NULL == pdata) {
        opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc));
        return ORTE_ERR_NOT_FOUND;
    }

    /* track that we are attempting to relocate - note that the attempt
     * is counted even when it is then rejected by the limit check
     */
    pdata->relocates++;
    /* have we exceeded the number of relocates for this proc? */
    if (jdata->max_global_restarts < pdata->relocates) {
        return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
    }

    /* look up the app context so the diagnostics below have a valid
     * name to print - previously "app" was dereferenced without ever
     * being assigned.
     * NOTE(review): assumes pdata->app_idx is the index of this proc's
     * app context within jdata->apps - confirm against the
     * orte_proc_t/orte_job_t definitions
     */
    app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
    /* the app context is only needed for messages, so a missing one
     * must not prevent the relocation attempt
     */
    app_name = (NULL != app && NULL != app->app) ? app->app : "(unknown)";

    /* proc just died - save the node where this proc was located */
    node = pdata->node;

    /* reset the job params for restart */
    orte_plm_base_reset_job(jdata);

    /* restart the job - the spawn function will remap and
     * launch the replacement proc(s)
     */
    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s RESTARTING APP: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
        opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc));
        return rc;
    }

    /* get the new node - read after spawn so it reflects the remapping */
    newnode = pdata->node;

    /* report what we did */
    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s Proc %s:%s aborted on node %s and was restarted on node %s\n\n",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         app_name, ORTE_NAME_PRINT(proc), node->name, newnode->name));

    return ORTE_SUCCESS;
}
/*
 * Look for the named process among this node's local children.
 *
 * Walks orte_local_children and compares each entry's jobid and vpid
 * against those of the given name.
 *
 * @param proc  Name of the process to look for
 *
 * @return the matching child object, or NULL if no entry in
 *         orte_local_children has the same jobid and vpid.
 */
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc)
{
    opal_list_item_t *cursor = opal_list_get_first(&orte_local_children);

    while (cursor != opal_list_get_end(&orte_local_children)) {
        /* list entries are child objects, so the item pointer is
         * reinterpreted as the child it belongs to
         */
        orte_odls_child_t *candidate = (orte_odls_child_t*)cursor;

        if (candidate->name->jobid == proc->jobid &&
            candidate->name->vpid == proc->vpid) {
            return candidate;
        }
        cursor = opal_list_get_next(cursor);
    }

    /* walked the whole list without a match */
    return NULL;
}