1
1

Restore process recovery for procs local to mpirun (first step towards restoring full capability). Define three new MCA params:

1. orte_enable_recovery - default recovery policy, can be overridden on a per-job basis

2. orte_max_local_restarts - default max number of local restarts, can be overridden

3. orte_max_global_restarts - default max number of relocates, can be overridden

Implement the restart_proc API for the ODLS framework, and reorganize the default functions a little to avoid duplicating code.

This commit was SVN r23057.
Этот коммит содержится в:
Ralph Castain 2010-04-28 04:06:57 +00:00
родитель f064056a07
Коммит 319758e3e0
10 изменённых файлов: 314 добавлений и 168 удалений

Просмотреть файл

@ -173,17 +173,6 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
int i, rc;
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
/*
* If the user did not ask for recovery, then do not process recovery events
*/
if( !orte_errmgr_base.enable_recovery ) {
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:base:suggest_map_targets() %s) "
"------- Recovery currently disabled! Skipping...",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:base:suggest_map_targets() %s) "
"------- Notifying components... (%3d active components)",

Просмотреть файл

@ -39,6 +39,7 @@
#include "opal/util/trace.h"
#include "opal/util/output.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
@ -47,10 +48,6 @@
/*
* Globals
*/
int orte_errmgr_base_output = -1;
bool orte_errmgr_base_enable_recovery = false;
bool orte_errmgr_base_shutting_down = false;
bool orte_errmgr_initialized = false;
opal_list_t orte_errmgr_base_components_available;
orte_errmgr_base_t orte_errmgr_base;
@ -70,8 +67,6 @@ orte_errmgr_API_t orte_errmgr = {
*/
int orte_errmgr_base_open(void)
{
int value;
OPAL_TRACE(5);
/* Only pass this way once */
@ -84,40 +79,6 @@ int orte_errmgr_base_open(void)
orte_errmgr_base.output = opal_output_open(NULL);
mca_base_param_reg_int_name("errmgr",
"base_enable_recovery",
"If the ErrMgr recovery components should be enabled."
" [Default = disabled]",
false, false,
0, &value);
orte_errmgr_base.enable_recovery = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("errmgr",
"max_global_restarts",
"Max number of times to relocate a failed process to a new node",
false, false,
-1, &orte_errmgr_base.max_global_restarts);
mca_base_param_reg_int_name("errmgr",
"max_local_restarts",
"Max number of times to locally restart a failed process before relocating it to a new node",
false, false,
-1, &orte_errmgr_base.max_local_restarts);
if (orte_errmgr_base.enable_recovery) {
if (orte_errmgr_base.max_global_restarts < 0 ) {
orte_errmgr_base.max_global_restarts = 3;
}
if (orte_errmgr_base.max_local_restarts < 0) {
orte_errmgr_base.max_local_restarts = 3;
}
} else {
if (orte_errmgr_base.max_local_restarts > 0 ||
orte_errmgr_base.max_global_restarts > 0) {
orte_errmgr_base.enable_recovery = true;
}
}
/*
* A flag to indicate that orterun is shutting down, so skip the recovery
* logic.

Просмотреть файл

@ -43,11 +43,8 @@ BEGIN_C_DECLS
typedef struct {
int output;
bool shutting_down;
bool enable_recovery;
opal_pointer_array_t modules;
bool initialized;
int max_global_restarts;
int max_local_restarts;
} orte_errmgr_base_t;
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;

Просмотреть файл

@ -113,6 +113,9 @@ static int update_state(orte_jobid_t job,
{
orte_job_t *jdata;
orte_exit_code_t sts;
orte_odls_child_t *child;
opal_list_item_t *item;
int rc;
/* indicate that this is the end of the line */
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
@ -251,7 +254,29 @@ static int update_state(orte_jobid_t job,
case ORTE_PROC_STATE_ABORTED_BY_SIG:
case ORTE_PROC_STATE_TERM_WO_SYNC:
case ORTE_PROC_STATE_COMM_FAILED:
case ORTE_PROC_STATE_CALLED_ABORT:
if (jdata->enable_recovery) {
    /* Is this a local proc? Search the list of local children for an
     * entry whose name matches the failed proc. "child" is only set on
     * a confirmed match - assigning it on every iteration would leave it
     * pointing at the last list entry when nothing matches, and we would
     * then restart an unrelated process.
     */
    child = NULL;
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        orte_odls_child_t *candidate = (orte_odls_child_t*)item;
        if (candidate->name->jobid == proc->jobid &&
            candidate->name->vpid == proc->vpid) {
            child = candidate;
            break;
        }
    }
    if (NULL != child) {
        /* see if this child has reached its local restart limit */
        if (child->restarts < jdata->max_local_restarts) {
            /* count the attempt before making it, so a failed restart
             * still consumes one of the allowed attempts
             */
            child->restarts++;
            if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
                return ORTE_SUCCESS;
            }
            /* let it fall thru to abort */
        }
    }
}
update_proc(jdata, proc, state, exit_code);
check_job_complete(jdata); /* need to set the job state */
/* the job object for this job will have been NULL'd
@ -264,6 +289,7 @@ static int update_state(orte_jobid_t job,
break;
case ORTE_PROC_STATE_FAILED_TO_START:
case ORTE_PROC_STATE_CALLED_ABORT:
update_proc(jdata, proc, state, exit_code);
check_job_complete(jdata);
/* the job object for this job will have been NULL'd
@ -466,7 +492,8 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
}
}
static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
static void update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
orte_exit_code_t exit_code)
{
@ -489,8 +516,11 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
proct->state = state;
proct->exit_code = exit_code;
if (ORTE_PROC_STATE_UNTERMINATED < state) {
opal_list_remove_item(&orte_local_children, &child->super);
OBJ_RELEASE(child);
if (!jdata->enable_recovery) {
opal_output(0, "JDATA NOT ENABLED FOR RECOVERY");
opal_list_remove_item(&orte_local_children, &child->super);
OBJ_RELEASE(child);
}
jdata->num_terminated++;
} else if (ORTE_PROC_STATE_RUNNING == state) {
jdata->num_launched++;

Просмотреть файл

@ -1166,6 +1166,111 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
return ORTE_SUCCESS;
}
/*
 * Add the per-process values a local child needs to the given environment.
 *
 * The following are written into *env (overwriting any existing value):
 *   - the vpid, under the name returned by
 *     mca_base_param_environ_variable("orte","ess","vpid") and as
 *     OMPI_COMM_WORLD_RANK
 *   - the local rank, as OMPI_COMM_WORLD_LOCAL_RANK
 *   - the node rank, as OMPI_COMM_WORLD_NODE_RANK and under the name
 *     returned by mca_base_param_environ_variable("orte","ess","node_rank")
 *   - the number of restarts of this child
 *   - a "do not barrier" flag, when the child requests it or is a restart
 *
 * As a side effect, child->iof_complete is set to true when the job is not
 * forwarding output.
 *
 * @param child   the child being prepared; its name, restarts and
 *                do_not_barrier fields are read
 * @param jobdat  job data for the child; only its controls field is read
 * @param env     environment array to update
 *
 * @return ORTE_SUCCESS on success; the error returned by
 *         orte_util_convert_vpid_to_string; ORTE_ERR_OUT_OF_RESOURCE if a
 *         name or value string could not be allocated;
 *         ORTE_ERR_VALUE_OUT_OF_BOUNDS if the local or node rank is invalid.
 *         No temporary strings are leaked on any error path.
 */
static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env)
{
    char *vpid_str = NULL, *param = NULL, *value = NULL;
    orte_node_rank_t node_rank;
    orte_local_rank_t local_rank;
    int rc;

    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(vpid_str);  /* don't leak the converted vpid */
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    opal_setenv(param, vpid_str, true, env);
    free(param);

    /* although the vpid IS the process' rank within the job, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, env);
    free(vpid_str);  /* done with this now */

    /* users would appreciate being given a public environmental variable
     * that also represents the local rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
    }
    /* asprintf leaves its output pointer undefined on failure, so the
     * return value must be checked before "value" is used or freed
     */
    if (0 > asprintf(&value, "%lu", (unsigned long) local_rank)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, env);
    free(value);

    /* users would appreciate being given a public environmental variable
     * that also represents the node rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        return ORTE_ERR_VALUE_OUT_OF_BOUNDS;
    }
    if (0 > asprintf(&value, "%lu", (unsigned long) node_rank)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, env);
    /* set an mca param for it too */
    if (NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(value);  /* don't leak the formatted node rank */
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    opal_setenv(param, value, true, env);
    free(param);
    free(value);

    /* pass the number of restarts for this proc - will be zero for
     * an initial start, but procs would like to know if they are being
     * restarted so they can take appropriate action
     */
    if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    if (0 > asprintf(&value, "%d", child->restarts)) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        free(param);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    opal_setenv(param, value, true, env);
    free(param);
    free(value);

    /* if the proc should not barrier in orte_init, tell it - a restarted
     * proc is always told not to barrier
     */
    if (child->do_not_barrier || 0 < child->restarts) {
        if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        opal_setenv(param, "1", true, env);
        free(param);
    }

    /* if the proc isn't going to forward IO, then we need to flag that
     * it has "completed" iof termination as otherwise it will never fire
     */
    if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
        child->iof_complete = true;
    }

    return ORTE_SUCCESS;
}
/* define a timer release point so that we can wait for
* file descriptors to come available, if necessary
*/
@ -1186,7 +1291,7 @@ static void timer_cb(int fd, short event, void *cbdata)
int orte_odls_base_default_launch_local(orte_jobid_t job,
orte_odls_base_fork_local_proc_fn_t fork_local)
{
char *job_str, *vpid_str, *param, *value;
char *job_str, *param;
opal_list_item_t *item;
orte_app_context_t *app, **apps;
orte_app_idx_t i, num_apps;
@ -1198,8 +1303,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
opal_buffer_t alert;
orte_std_cntr_t proc_rank;
orte_odls_job_t *jobdat;
orte_local_rank_t local_rank;
orte_node_rank_t node_rank;
char *pathenv = NULL, *mpiexec_pathenv = NULL;
char basedir[MAXPATHLEN];
char dir[MAXPATHLEN];
@ -1639,10 +1742,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
@ -1652,97 +1751,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
free(param);
free(job_str);
if (NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
opal_setenv(param, vpid_str, true, &app->env);
free(param);
/* although the vpid IS the process' rank within the job, users
* would appreciate being given a public environmental variable
* that also represents this value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, &app->env);
free(vpid_str); /* done with this now */
/* users would appreciate being given a public environmental variable
* that also represents the local rank value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) {
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
goto CLEANUP;
}
asprintf(&value, "%lu", (unsigned long) local_rank);
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
free(value);
/* users would appreciate being given a public environmental variable
* that also represents the node rank value - something MPI specific - so
* do that here.
*
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) {
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
goto CLEANUP;
}
asprintf(&value, "%lu", (unsigned long) node_rank);
opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
/* set an mca param for it too */
if(NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto CLEANUP;
}
opal_setenv(param, value, true, &app->env);
free(param);
free(value);
/* pass the number of restarts for this proc - will be zero for
* an initial start, but procs would like to know if they are being
* restarted so they can take appropriate action
*/
if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto CLEANUP;
}
asprintf(&value, "%d", child->restarts);
opal_setenv(param, value, true, &app->env);
free(param);
free(value);
/* if the proc should not barrier in orte_init, tell it */
if (child->do_not_barrier || 0 < child->restarts) {
if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto CLEANUP;
}
opal_setenv(param, "1", true, &app->env);
free(param);
}
/* if the proc isn't going to forward IO, then we need to flag that
* it has "completed" iof termination as otherwise it will never fire
*/
if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
child->iof_complete = true;
}
/* if we are timing things, record when we are going to launch this proc */
if (orte_timing) {
gettimeofday(&child->starttime, NULL);
@ -2857,5 +2870,63 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
/*
 * Restart a local child process in place.
 *
 * Finds the job data for the child's job, resets the child's bookkeeping
 * fields to their pre-launch values, rebuilds the child-specific
 * environment variables via setup_child(), and launches the process again
 * with the supplied fork function. On a successful fork, a waitpid
 * callback is registered for the new pid.
 *
 * @param child       the child to restart; its state fields are overwritten
 * @param fork_local  function used to fork/exec the process
 *
 * @return ORTE_SUCCESS if the process was forked; ORTE_ERR_NOT_FOUND if no
 *         local job data exists for the child's job (the child is left
 *         untouched in that case); otherwise the error returned by
 *         setup_child() or fork_local().
 */
int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
                                        orte_odls_base_fork_local_proc_fn_t fork_local)
{
    int rc;
    orte_app_context_t *app;
    opal_list_item_t *item;
    orte_odls_job_t *jobdat = NULL;

    /* protect operations involving the global list of children */
    OPAL_THREAD_LOCK(&orte_odls_globals.mutex);

    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
                         "%s odls:restart_proc for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(child->name)));

    /* find this child's jobdat - only record it on a confirmed match so
     * that a failed search is detectable instead of silently using the
     * last (or an uninitialized) entry
     */
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        orte_odls_job_t *candidate = (orte_odls_job_t*)item;
        if (candidate->jobid == child->name->jobid) {
            jobdat = candidate;
            break;
        }
    }
    if (NULL == jobdat) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto CLEANUP;
    }

    /* reset the child's bookkeeping so it looks like a proc that has
     * not yet been started
     */
    child->state = ORTE_PROC_STATE_FAILED_TO_START;
    child->exit_code = 0;
    child->waitpid_recvd = false;
    child->iof_complete = false;
    child->coll_recvd = false;
    child->pid = 0;
    child->init_recvd = false;
    child->fini_recvd = false;
    if (NULL != child->rml_uri) {
        free(child->rml_uri);
        child->rml_uri = NULL;
    }

    app = jobdat->apps[child->app_idx];

    /* reset envars to match this child */
    if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    rc = fork_local(app, child, app->env, jobdat);
    if (ORTE_SUCCESS == rc) {
        /* the lock is dropped while the wait callback is registered and
         * re-taken afterwards
         * NOTE(review): presumably because the callback path may itself
         * take this mutex - confirm
         */
        OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
        orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
        OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
    }

CLEANUP:
    OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
                         "%s odls:restart of proc %s %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(child->name),
                         (ORTE_SUCCESS == rc) ? "succeeded" : "failed"));

    opal_condition_signal(&orte_odls_globals.cond);
    OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
    return rc;
}

Просмотреть файл

@ -91,6 +91,23 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
/* store it on the global job data pool */
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
/* if its restart limits have not been set, set them to the defaults */
if (jdata->max_global_restarts < 0) {
jdata->max_global_restarts = orte_max_global_restarts;
}
if (jdata->max_local_restarts < 0) {
jdata->max_local_restarts = orte_max_local_restarts;
}
/* consistency check */
if (jdata->max_global_restarts <= 0 &&
jdata->max_local_restarts <= 0) {
jdata->enable_recovery = false;
} else {
jdata->enable_recovery = true;
}
}
/* get the allocation */

Просмотреть файл

@ -174,6 +174,11 @@ bool orte_report_bindings = false;
/* barrier control */
bool orte_do_not_barrier = false;
/* process recovery */
bool orte_enable_recovery;
int32_t orte_max_global_restarts;
int32_t orte_max_local_restarts;
/* comm fn for updating state */
orte_default_comm_fn_t orte_comm;
@ -266,16 +271,6 @@ int orte_dt_init(void)
}
#if !ORTE_DISABLE_FULL_SUPPORT
/* get a clean output channel too */
{
opal_output_stream_t lds;
OBJ_CONSTRUCT(&lds, opal_output_stream_t);
lds.lds_want_stdout = true;
orte_clean_output = opal_output_open(&lds);
OBJ_DESTRUCT(&lds);
}
tmp = ORTE_JOB;
if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_job,
orte_dt_unpack_job,
@ -640,8 +635,9 @@ static void orte_job_construct(orte_job_t* job)
OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
job->not_reported = true;
job->max_local_restarts = 0;
job->max_global_restarts = 0;
job->enable_recovery = false;
job->max_local_restarts = -1;
job->max_global_restarts = -1;
job->launch_msg_sent.tv_sec = 0;
job->launch_msg_sent.tv_usec = 0;

Просмотреть файл

@ -390,6 +390,8 @@ typedef struct {
bool abort;
/* proc that caused that to happen */
struct orte_proc_t *aborted_proc;
/* enable recovery of these processes */
bool enable_recovery;
/* max number of times a process can be restarted locally */
int32_t max_local_restarts;
/* max number of times a process can be relocated to another node */
@ -665,6 +667,11 @@ ORTE_DECLSPEC extern bool orte_report_bindings;
/* barrier control */
ORTE_DECLSPEC extern bool orte_do_not_barrier;
/* process recovery */
ORTE_DECLSPEC extern bool orte_enable_recovery;
ORTE_DECLSPEC extern int32_t orte_max_global_restarts;
ORTE_DECLSPEC extern int32_t orte_max_local_restarts;
/* comm interface */
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);

Просмотреть файл

@ -54,6 +54,20 @@ int orte_register_params(void)
}
passed_thru = true;
#if !ORTE_DISABLE_FULL_SUPPORT
/* get a clean output channel too - need to do this here because
* we use it below, and orterun and some other tools call this
* function prior to calling orte_init
*/
{
opal_output_stream_t lds;
OBJ_CONSTRUCT(&lds, opal_output_stream_t);
lds.lds_want_stdout = true;
orte_clean_output = opal_output_open(&lds);
OBJ_DESTRUCT(&lds);
}
#endif /* !ORTE_DISABLE_FULL_SUPPORT */
mca_base_param_reg_int_name("orte", "base_help_aggregate",
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
false, false,
@ -448,6 +462,58 @@ int orte_register_params(void)
(int) false, &value);
orte_do_not_barrier = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "enable_recovery",
"Enable recovery from process failure [Default = disabled]",
false, false,
(int)false, &value);
orte_enable_recovery = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "max_global_restarts",
"Max number of times to relocate a failed process to a new node",
false, false,
-1, &orte_max_global_restarts);
mca_base_param_reg_int_name("orte", "max_local_restarts",
"Max number of times to locally restart a failed process before relocating it to a new node",
false, false,
-1, &orte_max_local_restarts);
/* Consistency check between the recovery flag and the restart limits:
 * recovery only makes sense if at least one limit is positive, and a
 * positive limit implies the user wants recovery. Only the HNP prints
 * the warning; every process applies the same correction.
 */
if (orte_enable_recovery) {
    if (orte_max_global_restarts <= 0 &&
        orte_max_local_restarts <= 0) {
        if (ORTE_PROC_IS_HNP) {
            opal_output(orte_clean_output,
                        "------------------------------------------------------------\n"
                        "Although the MCA param orte_enable_recovery was set to true,\n"
                        "values for the max number of restarts was not provided:\n\n"
                        "Max global restarts: %d\n"
                        "Max local restarts: %d\n\n"
                        "At least one of these must be a positive value. We are disabling\n"
                        "process recovery, but continuing execution.\n"
                        "------------------------------------------------------------",
                        orte_max_global_restarts, orte_max_local_restarts);
        }
        orte_enable_recovery = false;
    }
} else if (orte_max_global_restarts > 0 ||
           orte_max_local_restarts > 0) {
    if (ORTE_PROC_IS_HNP) {
        /* name the param that is actually registered above
         * (orte_enable_recovery) so the user can act on the advice
         */
        opal_output(orte_clean_output,
                    "------------------------------------------------------------------\n"
                    "The MCA param orte_enable_recovery was not set to true, but\n"
                    "positive value(s) were provided for the number of restarts:\n\n"
                    "Max global restarts: %d\n"
                    "Max local restarts: %d\n\n"
                    "We are enabling process recovery and continuing execution. To avoid\n"
                    "this warning in the future, please set the orte_enable_recovery\n"
                    "param to non-zero.\n"
                    "------------------------------------------------------------------",
                    orte_max_global_restarts, orte_max_local_restarts);
    }
    orte_enable_recovery = true;
}
#endif /* ORTE_DISABLE_FULL_SUPPORT */
return ORTE_SUCCESS;

Просмотреть файл

@ -425,6 +425,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
NULL, OPAL_CMD_LINE_TYPE_STRING,
"Report events to a tool listening at the specified URI" },
{ "orte", "enable", "recovery", '\0', "enable-recovery", "enable-recovery", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
"Enable recovery from process failure [Default = disabled]" },
{ "orte", "max", "global_restarts", '\0', "max-global-restarts", "max-global-restarts", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Max number of times to relocate a failed process to a new node" },
{ "orte", "max", "local_restarts", '\0', "max-local-restarts", "max-local-restarts", 1,
NULL, OPAL_CMD_LINE_TYPE_INT,
"Max number of times to locally restart a failed process before relocating it to a new node" },
/* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
@ -1135,11 +1147,11 @@ static void abort_exit_callback(int fd, short ign, void *arg)
orte_debugger_finalize();
/*
* Turn off the errmgr recovery functionality, if it was enabled.
* Turn off the process recovery functionality, if it was enabled.
* This keeps the errmgr from trying to recover from the shutdown
* procedure.
*/
orte_errmgr_base.enable_recovery = false;
orte_enable_recovery = false;
orte_errmgr_base.shutting_down = true;
/* terminate the orteds - they will automatically kill