From 319758e3e037a616c5550e01136c57d18c7efb59 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 28 Apr 2010 04:06:57 +0000 Subject: [PATCH] Restore process recovery for procs local to mpirun (first step towards restoring full capability). Define three new MCA params: 1. orte_enable_recovery - default recovery policy, can be overridden on a per-job basis 2. orte_max_local_restarts - default max number of local restarts, can be overridden 3. orte_max_global_restarts - default max number of relocates, can be overridden Implement the restart_proc API for the ODLS framework, reorganize the default fns a little to avoid copying code. This commit was SVN r23057. --- orte/mca/errmgr/base/errmgr_base_fns.c | 11 - orte/mca/errmgr/base/errmgr_base_open.c | 41 +-- orte/mca/errmgr/base/errmgr_private.h | 3 - orte/mca/errmgr/hnp/errmgr_hnp.c | 38 ++- orte/mca/odls/base/odls_base_default_fns.c | 263 +++++++++++++------- orte/mca/plm/base/plm_base_launch_support.c | 17 ++ orte/runtime/orte_globals.c | 20 +- orte/runtime/orte_globals.h | 7 + orte/runtime/orte_mca_params.c | 66 +++++ orte/tools/orterun/orterun.c | 16 +- 10 files changed, 314 insertions(+), 168 deletions(-) diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index ad58420f8c..89d0943a79 100644 --- a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -173,17 +173,6 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc, int i, rc; orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE; - /* - * If the user did not ask for recovery, then do not process recovery events - */ - if( !orte_errmgr_base.enable_recovery ) { - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:base:suggest_map_targets() %s) " - "------- Recovery currently disabled! Skipping...", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) )); - return ORTE_SUCCESS; - } - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, "errmgr:base:suggest_map_targets() %s) " "------- Notifying components... (%3d active components)", diff --git a/orte/mca/errmgr/base/errmgr_base_open.c b/orte/mca/errmgr/base/errmgr_base_open.c index 8182d1dd6c..a5d59ca027 100644 --- a/orte/mca/errmgr/base/errmgr_base_open.c +++ b/orte/mca/errmgr/base/errmgr_base_open.c @@ -39,6 +39,7 @@ #include "opal/util/trace.h" #include "opal/util/output.h" +#include "orte/util/show_help.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" @@ -47,10 +48,6 @@ /* * Globals */ -int orte_errmgr_base_output = -1; -bool orte_errmgr_base_enable_recovery = false; -bool orte_errmgr_base_shutting_down = false; -bool orte_errmgr_initialized = false; opal_list_t orte_errmgr_base_components_available; orte_errmgr_base_t orte_errmgr_base; @@ -70,8 +67,6 @@ orte_errmgr_API_t orte_errmgr = { */ int orte_errmgr_base_open(void) { - int value; - OPAL_TRACE(5); /* Only pass this way once */ @@ -84,40 +79,6 @@ int orte_errmgr_base_open(void) orte_errmgr_base.output = opal_output_open(NULL); - mca_base_param_reg_int_name("errmgr", - "base_enable_recovery", - "If the ErrMgr recovery components should be enabled." - " [Default = disabled]", - false, false, - 0, &value); - orte_errmgr_base.enable_recovery = OPAL_INT_TO_BOOL(value); - - mca_base_param_reg_int_name("errmgr", - "max_global_restarts", - "Max number of times to relocate a failed process to a new node", - false, false, - -1, &orte_errmgr_base.max_global_restarts); - - mca_base_param_reg_int_name("errmgr", - "max_local_restarts", - "Max number of times to locally restart a failed process before relocating it to a new node", - false, false, - -1, &orte_errmgr_base.max_local_restarts); - - if (orte_errmgr_base.enable_recovery) { - if (orte_errmgr_base.max_global_restarts < 0 ) { - orte_errmgr_base.max_global_restarts = 3; - } - if (orte_errmgr_base.max_local_restarts < 0) { - orte_errmgr_base.max_local_restarts = 3; - } - } else { - if (orte_errmgr_base.max_local_restarts > 0 || - orte_errmgr_base.max_global_restarts > 0) { - orte_errmgr_base.enable_recovery = true; - } - } - /* * A flag to indicate that orterun is shutting down, so skip the recovery * logic. diff --git a/orte/mca/errmgr/base/errmgr_private.h b/orte/mca/errmgr/base/errmgr_private.h index 02318c9582..8b7d11465e 100644 --- a/orte/mca/errmgr/base/errmgr_private.h +++ b/orte/mca/errmgr/base/errmgr_private.h @@ -43,11 +43,8 @@ BEGIN_C_DECLS typedef struct { int output; bool shutting_down; - bool enable_recovery; opal_pointer_array_t modules; bool initialized; - int max_global_restarts; - int max_local_restarts; } orte_errmgr_base_t; ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base; diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index 9467375fca..1a8ecf0a6b 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -113,6 +113,9 @@ static int update_state(orte_jobid_t job, { orte_job_t *jdata; orte_exit_code_t sts; + orte_odls_child_t *child; + opal_list_item_t *item; + int rc; /* indicate that this is the end of the line */ *stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE; @@ -251,7 +254,29 @@ static int update_state(orte_jobid_t job, case ORTE_PROC_STATE_ABORTED_BY_SIG: case ORTE_PROC_STATE_TERM_WO_SYNC: case ORTE_PROC_STATE_COMM_FAILED: - case ORTE_PROC_STATE_CALLED_ABORT: + if (jdata->enable_recovery) { + /* is this a local proc */ + child = NULL; + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { + break; + } + } + if (NULL != child) { + /* see if this child has reached its local restart limit */ + if (child->restarts < jdata->max_local_restarts) { + child->restarts++; + if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { + return ORTE_SUCCESS; + } + /* let it fall thru to abort */ + } + } + } update_proc(jdata, proc, state, exit_code); check_job_complete(jdata); /* need to set the job state */ /* the job object for this job will have been NULL'd @@ -264,6 +289,7 @@ static int update_state(orte_jobid_t job, break; case ORTE_PROC_STATE_FAILED_TO_START: + case ORTE_PROC_STATE_CALLED_ABORT: update_proc(jdata, proc, state, exit_code); check_job_complete(jdata); /* the job object for this job will have been NULL'd @@ -466,7 +492,8 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta } } -static void update_proc(orte_job_t *jdata, orte_process_name_t *proc, +static void update_proc(orte_job_t *jdata, + orte_process_name_t *proc, orte_proc_state_t state, orte_exit_code_t exit_code) { @@ -489,8 +516,11 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc, proct->state = state; proct->exit_code = exit_code; if (ORTE_PROC_STATE_UNTERMINATED < state) { - opal_list_remove_item(&orte_local_children, &child->super); - OBJ_RELEASE(child); + if (!jdata->enable_recovery) { + opal_output(0, "JDATA NOT ENABLED FOR RECOVERY"); + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + } jdata->num_terminated++; } else if (ORTE_PROC_STATE_RUNNING == state) { jdata->num_launched++; diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 923adaedd3..3c3026a5e9 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -1166,6 +1166,111 @@ static int odls_base_default_setup_fork(orte_app_context_t *context, return ORTE_SUCCESS; } +static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env) +{ + char *vpid_str, *param, *value; + orte_node_rank_t node_rank; + orte_local_rank_t local_rank; + int rc; + + if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + return rc; + } + opal_setenv(param, vpid_str, true, env); + free(param); + + /* although the vpid IS the process' rank within the job, users + * would appreciate being given a public environmental variable + * that also represents this value - something MPI specific - so + * do that here. + * + * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. + * We know - just live with it + */ + opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, env); + free(vpid_str); /* done with this now */ + + /* users would appreciate being given a public environmental variable + * that also represents the local rank value - something MPI specific - so + * do that here. + * + * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. + * We know - just live with it + */ + if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) { + ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); + rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; + return rc; + } + asprintf(&value, "%lu", (unsigned long) local_rank); + opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, env); + free(value); + + /* users would appreciate being given a public environmental variable + * that also represents the node rank value - something MPI specific - so + * do that here. + * + * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. + * We know - just live with it + */ + if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) { + ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); + rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; + return rc; + } + asprintf(&value, "%lu", (unsigned long) node_rank); + opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, env); + /* set an mca param for it too */ + if(NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + return rc; + } + opal_setenv(param, value, true, env); + free(param); + free(value); + + /* pass the number of restarts for this proc - will be zero for + * an initial start, but procs would like to know if they are being + * restarted so they can take appropriate action + */ + if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + return rc; + } + asprintf(&value, "%d", child->restarts); + opal_setenv(param, value, true, env); + free(param); + free(value); + + /* if the proc should not barrier in orte_init, tell it */ + if (child->do_not_barrier || 0 < child->restarts) { + if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + return rc; + } + opal_setenv(param, "1", true, env); + free(param); + } + + /* if the proc isn't going to forward IO, then we need to flag that + * it has "completed" iof termination as otherwise it will never fire + */ + if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { + child->iof_complete = true; + } + + return ORTE_SUCCESS; +} + /* define a timer release point so that we can wait for * file descriptors to come available, if necessary */ @@ -1186,7 +1291,7 @@ static void timer_cb(int fd, short event, void *cbdata) int orte_odls_base_default_launch_local(orte_jobid_t job, orte_odls_base_fork_local_proc_fn_t fork_local) { - char *job_str, *vpid_str, *param, *value; + char *job_str, *param; opal_list_item_t *item; orte_app_context_t *app, **apps; orte_app_idx_t i, num_apps; @@ -1198,8 +1303,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, opal_buffer_t alert; orte_std_cntr_t proc_rank; orte_odls_job_t *jobdat; - orte_local_rank_t local_rank; - orte_node_rank_t node_rank; char *pathenv = NULL, *mpiexec_pathenv = NULL; char basedir[MAXPATHLEN]; char dir[MAXPATHLEN]; @@ -1639,10 +1742,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, ORTE_ERROR_LOG(rc); goto CLEANUP; } - if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); rc = ORTE_ERR_OUT_OF_RESOURCE; @@ -1652,97 +1751,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, free(param); free(job_str); - if (NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; + if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { + ORTE_ERROR_LOG(rc); goto CLEANUP; } - opal_setenv(param, vpid_str, true, &app->env); - free(param); - /* although the vpid IS the process' rank within the job, users - * would appreciate being given a public environmental variable - * that also represents this value - something MPI specific - so - * do that here. - * - * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. - * We know - just live with it - */ - opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, &app->env); - free(vpid_str); /* done with this now */ - - /* users would appreciate being given a public environmental variable - * that also represents the local rank value - something MPI specific - so - * do that here. - * - * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. - * We know - just live with it - */ - if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) { - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; - goto CLEANUP; - } - asprintf(&value, "%lu", (unsigned long) local_rank); - opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env); - free(value); - - /* users would appreciate being given a public environmental variable - * that also represents the node rank value - something MPI specific - so - * do that here. - * - * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. - * We know - just live with it - */ - if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) { - ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS); - rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS; - goto CLEANUP; - } - asprintf(&value, "%lu", (unsigned long) node_rank); - opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env); - /* set an mca param for it too */ - if(NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto CLEANUP; - } - opal_setenv(param, value, true, &app->env); - free(param); - free(value); - - /* pass the number of restarts for this proc - will be zero for - * an initial start, but procs would like to know if they are being - * restarted so they can take appropriate action - */ - if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto CLEANUP; - } - asprintf(&value, "%d", child->restarts); - opal_setenv(param, value, true, &app->env); - free(param); - free(value); - - /* if the proc should not barrier in orte_init, tell it */ - if (child->do_not_barrier || 0 < child->restarts) { - if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto CLEANUP; - } - opal_setenv(param, "1", true, &app->env); - free(param); - } - - /* if the proc isn't going to forward IO, then we need to flag that - * it has "completed" iof termination as otherwise it will never fire - */ - if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { - child->iof_complete = true; - } - /* if we are timing things, record when we are going to launch this proc */ if (orte_timing) { gettimeofday(&child->starttime, NULL); @@ -2857,5 +2870,63 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer, int orte_odls_base_default_restart_proc(orte_odls_child_t *child, orte_odls_base_fork_local_proc_fn_t fork_local) { - return ORTE_SUCCESS; + int rc; + orte_app_context_t *app; + opal_list_item_t *item; + orte_odls_job_t *jobdat; + + /* protect operations involving the global list of children */ + OPAL_THREAD_LOCK(&orte_odls_globals.mutex); + + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:restart_proc for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name))); + + /* find this child's jobdat */ + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + if (jobdat->jobid == child->name->jobid) { + break; + } + } + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->exit_code = 0; + child->waitpid_recvd = false; + child->iof_complete = false; + child->coll_recvd = false; + child->pid = 0; + child->init_recvd = false; + child->fini_recvd = false; + if (NULL != child->rml_uri) { + free(child->rml_uri); + child->rml_uri = NULL; + } + app = jobdat->apps[child->app_idx]; + /* reset envars to match this child */ + + if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { + ORTE_ERROR_LOG(rc); + goto CLEANUP; + } + rc = fork_local(app, child, app->env, jobdat); + if (ORTE_SUCCESS == rc) { + OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); + orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL); + OPAL_THREAD_LOCK(&orte_odls_globals.mutex); + + } + +CLEANUP: + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:restart of proc %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), + (ORTE_SUCCESS == rc) ? "succeeded" : "failed")); + + opal_condition_signal(&orte_odls_globals.cond); + OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); + return rc; } diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 09cba7a010..d0e98fd396 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -91,6 +91,23 @@ int orte_plm_base_setup_job(orte_job_t *jdata) /* store it on the global job data pool */ ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); + + /* if its restart limits have not been set, set them to the defaults */ + if (jdata->max_global_restarts < 0) { + jdata->max_global_restarts = orte_max_global_restarts; + } + if (jdata->max_local_restarts < 0) { + jdata->max_local_restarts = orte_max_local_restarts; + } + /* consistency check */ + if (jdata->max_global_restarts <= 0 && + jdata->max_local_restarts <= 0) { + jdata->enable_recovery = false; + + } else { + jdata->enable_recovery = true; + } + } /* get the allocation */ diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 05a5b59719..770baba0fb 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -174,6 +174,11 @@ bool orte_report_bindings = false; /* barrier control */ bool orte_do_not_barrier = false; +/* process recovery */ +bool orte_enable_recovery; +int32_t orte_max_global_restarts; +int32_t orte_max_local_restarts; + /* comm fn for updating state */ orte_default_comm_fn_t orte_comm; @@ -266,16 +271,6 @@ int orte_dt_init(void) } #if !ORTE_DISABLE_FULL_SUPPORT - /* get a clean output channel too */ - { - opal_output_stream_t lds; - OBJ_CONSTRUCT(&lds, opal_output_stream_t); - lds.lds_want_stdout = true; - orte_clean_output = opal_output_open(&lds); - OBJ_DESTRUCT(&lds); - - } - tmp = ORTE_JOB; if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_job, orte_dt_unpack_job, @@ -640,8 +635,9 @@ static void orte_job_construct(orte_job_t* job) OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t); job->not_reported = true; - job->max_local_restarts = 0; - job->max_global_restarts = 0; + job->enable_recovery = false; + job->max_local_restarts = -1; + job->max_global_restarts = -1; job->launch_msg_sent.tv_sec = 0; job->launch_msg_sent.tv_usec = 0; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index aa3fc6cb61..10b937aefb 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -390,6 +390,8 @@ typedef struct { bool abort; /* proc that caused that to happen */ struct orte_proc_t *aborted_proc; + /* enable recovery of these processes */ + bool enable_recovery; /* max number of times a process can be restarted locally */ int32_t max_local_restarts; /* max number of times a process can be relocated to another node */ @@ -665,6 +667,11 @@ ORTE_DECLSPEC extern bool orte_report_bindings; /* barrier control */ ORTE_DECLSPEC extern bool orte_do_not_barrier; +/* process recovery */ +ORTE_DECLSPEC extern bool orte_enable_recovery; +ORTE_DECLSPEC extern int32_t orte_max_global_restarts; +ORTE_DECLSPEC extern int32_t orte_max_local_restarts; + /* comm interface */ typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data); diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index fcf62121dd..5efd4fd76a 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -54,6 +54,20 @@ int orte_register_params(void) } passed_thru = true; +#if !ORTE_DISABLE_FULL_SUPPORT + /* get a clean output channel too - need to do this here because + * we use it below, and orterun and some other tools call this + * function prior to calling orte_init + */ + { + opal_output_stream_t lds; + OBJ_CONSTRUCT(&lds, opal_output_stream_t); + lds.lds_want_stdout = true; + orte_clean_output = opal_output_open(&lds); + OBJ_DESTRUCT(&lds); + } +#endif /* !ORTE_DISABLE_FULL_SUPPORT */ + mca_base_param_reg_int_name("orte", "base_help_aggregate", "If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.", false, false, @@ -448,6 +462,58 @@ int orte_register_params(void) (int) false, &value); orte_do_not_barrier = OPAL_INT_TO_BOOL(value); + mca_base_param_reg_int_name("orte", "enable_recovery", + "Enable recovery from process failure [Default = disabled]", + false, false, + (int)false, &value); + orte_enable_recovery = OPAL_INT_TO_BOOL(value); + + mca_base_param_reg_int_name("orte", "max_global_restarts", + "Max number of times to relocate a failed process to a new node", + false, false, + -1, &orte_max_global_restarts); + + mca_base_param_reg_int_name("orte", "max_local_restarts", + "Max number of times to locally restart a failed process before relocating it to a new node", + false, false, + -1, &orte_max_local_restarts); + if (orte_enable_recovery) { + if (orte_max_global_restarts <= 0 && + orte_max_local_restarts <= 0) { + if (ORTE_PROC_IS_HNP) { + opal_output(orte_clean_output, + "------------------------------------------------------------\n" + "Although the MCA param orte_enable_recovery was set to true,\n" + "values for the max number of restarts was not provided:\n\n" + "Max global restarts: %d\n" + "Max local restarts: %d\n\n" + "At least one of these must be a positive value. We are disabling\n" + "process recovery, but continuing execution.\n" + "------------------------------------------------------------", + orte_max_global_restarts, orte_max_local_restarts); + } + orte_enable_recovery = false; + } + } else if (orte_max_global_restarts > 0 || + orte_max_local_restarts > 0) { + if (ORTE_PROC_IS_HNP) { + opal_output(orte_clean_output, + "------------------------------------------------------------------\n" + "The MCA param errmgr_base_enable_recovery was not set to true, but\n" + "positive value(s) were provided for the number of restarts:\n\n" + "Max global restarts: %d\n" + "Max local restarts: %d\n\n" + "We are enabling process recovery and continuing execution. To avoid\n" + "this warning in the future, please set the errmgr_base_enable_recovery\n" + "param to non-zero.\n" + "------------------------------------------------------------------", + orte_max_global_restarts, orte_max_local_restarts); + } + orte_enable_recovery = true; + } + + + #endif /* ORTE_DISABLE_FULL_SUPPORT */ return ORTE_SUCCESS; diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index b048fff5d2..ffc89ee947 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -425,6 +425,18 @@ static opal_cmd_line_init_t cmd_line_init[] = { NULL, OPAL_CMD_LINE_TYPE_STRING, "Report events to a tool listening at the specified URI" }, + { "orte", "enable", "recovery", '\0', "enable-recovery", "enable-recovery", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable recovery from process failure [Default = disabled]" }, + + { "orte", "max", "global_restarts", '\0', "max-global-restarts", "max-global-restarts", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Max number of times to relocate a failed process to a new node" }, + + { "orte", "max", "local_restarts", '\0', "max-local-restarts", "max-local-restarts", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Max number of times to locally restart a failed process before relocating it to a new node" }, + /* End of list */ { NULL, NULL, NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } @@ -1135,11 +1147,11 @@ static void abort_exit_callback(int fd, short ign, void *arg) orte_debugger_finalize(); /* - * Turn off the errmgr recovery functionality, if it was enabled. + * Turn off the process recovery functionality, if it was enabled. * This keeps the errmgr from trying to recover from the shutdown * procedure. */ - orte_errmgr_base.enable_recovery = false; + orte_enable_recovery = false; orte_errmgr_base.shutting_down = true; /* terminate the orteds - they will automatically kill