Restore process recovery for procs local to mpirun (first step towards restoring full capability). Define three new MCA params:
1. orte_enable_recovery - default recovery policy; can be overridden on a per-job basis
2. orte_max_local_restarts - default max number of local restarts; can be overridden
3. orte_max_global_restarts - default max number of relocations; can be overridden

Implement the restart_proc API for the ODLS framework, and reorganize the default functions a little to avoid duplicating code.

This commit was SVN r23057.
Этот коммит содержится в:
родитель
f064056a07
Коммит
319758e3e0
@ -173,17 +173,6 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
||||
int i, rc;
|
||||
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
|
||||
|
||||
/*
|
||||
* If the user did not ask for recovery, then do not process recovery events
|
||||
*/
|
||||
if( !orte_errmgr_base.enable_recovery ) {
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:suggest_map_targets() %s) "
|
||||
"------- Recovery currently disabled! Skipping...",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:base:suggest_map_targets() %s) "
|
||||
"------- Notifying components... (%3d active components)",
|
||||
|
@ -39,6 +39,7 @@
|
||||
#include "opal/util/trace.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
@ -47,10 +48,6 @@
|
||||
/*
|
||||
* Globals
|
||||
*/
|
||||
int orte_errmgr_base_output = -1;
|
||||
bool orte_errmgr_base_enable_recovery = false;
|
||||
bool orte_errmgr_base_shutting_down = false;
|
||||
bool orte_errmgr_initialized = false;
|
||||
opal_list_t orte_errmgr_base_components_available;
|
||||
|
||||
orte_errmgr_base_t orte_errmgr_base;
|
||||
@ -70,8 +67,6 @@ orte_errmgr_API_t orte_errmgr = {
|
||||
*/
|
||||
int orte_errmgr_base_open(void)
|
||||
{
|
||||
int value;
|
||||
|
||||
OPAL_TRACE(5);
|
||||
|
||||
/* Only pass this way once */
|
||||
@ -84,40 +79,6 @@ int orte_errmgr_base_open(void)
|
||||
|
||||
orte_errmgr_base.output = opal_output_open(NULL);
|
||||
|
||||
mca_base_param_reg_int_name("errmgr",
|
||||
"base_enable_recovery",
|
||||
"If the ErrMgr recovery components should be enabled."
|
||||
" [Default = disabled]",
|
||||
false, false,
|
||||
0, &value);
|
||||
orte_errmgr_base.enable_recovery = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("errmgr",
|
||||
"max_global_restarts",
|
||||
"Max number of times to relocate a failed process to a new node",
|
||||
false, false,
|
||||
-1, &orte_errmgr_base.max_global_restarts);
|
||||
|
||||
mca_base_param_reg_int_name("errmgr",
|
||||
"max_local_restarts",
|
||||
"Max number of times to locally restart a failed process before relocating it to a new node",
|
||||
false, false,
|
||||
-1, &orte_errmgr_base.max_local_restarts);
|
||||
|
||||
if (orte_errmgr_base.enable_recovery) {
|
||||
if (orte_errmgr_base.max_global_restarts < 0 ) {
|
||||
orte_errmgr_base.max_global_restarts = 3;
|
||||
}
|
||||
if (orte_errmgr_base.max_local_restarts < 0) {
|
||||
orte_errmgr_base.max_local_restarts = 3;
|
||||
}
|
||||
} else {
|
||||
if (orte_errmgr_base.max_local_restarts > 0 ||
|
||||
orte_errmgr_base.max_global_restarts > 0) {
|
||||
orte_errmgr_base.enable_recovery = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* A flag to indicate that orterun is shutting down, so skip the recovery
|
||||
* logic.
|
||||
|
@ -43,11 +43,8 @@ BEGIN_C_DECLS
|
||||
/* Shared state of the errmgr base framework; a single instance is exported
 * as the global orte_errmgr_base. */
typedef struct {
|
||||
int output; /* output stream handle obtained from opal_output_open() in orte_errmgr_base_open; used for verbose messages */
|
||||
bool shutting_down; /* set once orterun begins shutting down so that recovery logic is skipped */
|
||||
bool enable_recovery; /* when false, recovery events are not processed (suggest_map_targets returns early) */
|
||||
opal_pointer_array_t modules; /* NOTE(review): presumably the active errmgr modules - confirm against the select code */
|
||||
bool initialized; /* NOTE(review): looks like a "framework opened" flag - confirm where it is set */
|
||||
int max_global_restarts; /* max number of times to relocate a failed process to a new node */
|
||||
int max_local_restarts; /* max number of local restarts of a failed process before relocating it */
|
||||
} orte_errmgr_base_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
||||
|
@ -113,6 +113,9 @@ static int update_state(orte_jobid_t job,
|
||||
{
|
||||
orte_job_t *jdata;
|
||||
orte_exit_code_t sts;
|
||||
orte_odls_child_t *child;
|
||||
opal_list_item_t *item;
|
||||
int rc;
|
||||
|
||||
/* indicate that this is the end of the line */
|
||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||
@ -251,7 +254,29 @@ static int update_state(orte_jobid_t job,
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
case ORTE_PROC_STATE_COMM_FAILED:
|
||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||
if (jdata->enable_recovery) {
|
||||
/* is this a local proc */
|
||||
child = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (NULL != child) {
|
||||
/* see if this child has reached its local restart limit */
|
||||
if (child->restarts < jdata->max_local_restarts) {
|
||||
child->restarts++;
|
||||
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* let it fall thru to abort */
|
||||
}
|
||||
}
|
||||
}
|
||||
update_proc(jdata, proc, state, exit_code);
|
||||
check_job_complete(jdata); /* need to set the job state */
|
||||
/* the job object for this job will have been NULL'd
|
||||
@ -264,6 +289,7 @@ static int update_state(orte_jobid_t job,
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||
update_proc(jdata, proc, state, exit_code);
|
||||
check_job_complete(jdata);
|
||||
/* the job object for this job will have been NULL'd
|
||||
@ -466,7 +492,8 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
|
||||
}
|
||||
}
|
||||
|
||||
static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
static void update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
@ -489,8 +516,11 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
proct->state = state;
|
||||
proct->exit_code = exit_code;
|
||||
if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
OBJ_RELEASE(child);
|
||||
if (!jdata->enable_recovery) {
|
||||
opal_output(0, "JDATA NOT ENABLED FOR RECOVERY");
|
||||
opal_list_remove_item(&orte_local_children, &child->super);
|
||||
OBJ_RELEASE(child);
|
||||
}
|
||||
jdata->num_terminated++;
|
||||
} else if (ORTE_PROC_STATE_RUNNING == state) {
|
||||
jdata->num_launched++;
|
||||
|
@ -1166,6 +1166,111 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Prepare the environment of one local child process before it is forked.
 *
 * Exports into *env:
 *   - the MCA environment variable for orte/ess/vpid and OMPI_COMM_WORLD_RANK
 *     (both carry the child's vpid),
 *   - OMPI_COMM_WORLD_LOCAL_RANK (local rank reported by orte_ess),
 *   - OMPI_COMM_WORLD_NODE_RANK and the MCA environment variable for
 *     orte/ess/node_rank (node rank reported by orte_ess),
 *   - the MCA environment variable for orte/num/restarts (child->restarts),
 *   - the MCA environment variable for orte/do_not/barrier, set to "1" when
 *     the child is flagged do_not_barrier or has been restarted at least once.
 *
 * Side effect: marks child->iof_complete when the job does not forward output.
 *
 * @param child   child whose name, restart count and flags are consulted
 * @param jobdat  job data of the child; only its `controls` field is read
 * @param env     address of the environment array handed to opal_setenv()
 *
 * @return ORTE_SUCCESS on success; the error from the vpid conversion,
 *         ORTE_ERR_OUT_OF_RESOURCE when a variable name or value cannot be
 *         allocated, or ORTE_ERR_VALUE_OUT_OF_BOUNDS when the local/node
 *         rank is invalid.  No temporary string is leaked on any path.
 */
static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env)
{
    char *vpid_str = NULL, *param = NULL, *value = NULL;
    orte_node_rank_t node_rank;
    orte_local_rank_t local_rank;
    int rc;

    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;   /* releases vpid_str, which used to leak here */
    }
    opal_setenv(param, vpid_str, true, env);
    free(param);
    param = NULL;

    /* although the vpid IS the process' rank within the job, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, env);
    free(vpid_str);  /* done with this now */
    vpid_str = NULL;

    /* same public-variable courtesy for the local rank */
    if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        goto CLEANUP;
    }
    if (0 > asprintf(&value, "%lu", (unsigned long) local_rank)) {
        value = NULL;   /* contents are unspecified after a failed asprintf */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, env);
    free(value);
    value = NULL;

    /* ... and for the node rank */
    if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        goto CLEANUP;
    }
    if (0 > asprintf(&value, "%lu", (unsigned long) node_rank)) {
        value = NULL;
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, env);
    /* set an mca param for it too */
    if (NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;   /* releases value, which used to leak here */
    }
    opal_setenv(param, value, true, env);
    free(param);
    param = NULL;
    free(value);
    value = NULL;

    /* pass the number of restarts for this proc - will be zero for
     * an initial start, but procs would like to know if they are being
     * restarted so they can take appropriate action
     */
    if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }
    if (0 > asprintf(&value, "%d", child->restarts)) {
        value = NULL;
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        rc = ORTE_ERR_OUT_OF_RESOURCE;
        goto CLEANUP;
    }
    opal_setenv(param, value, true, env);
    free(param);
    param = NULL;
    free(value);
    value = NULL;

    /* if the proc should not barrier in orte_init, tell it */
    if (child->do_not_barrier || 0 < child->restarts) {
        if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto CLEANUP;
        }
        opal_setenv(param, "1", true, env);
        free(param);
        param = NULL;
    }

    /* if the proc isn't going to forward IO, then we need to flag that
     * it has "completed" iof termination as otherwise it will never fire
     */
    if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
        child->iof_complete = true;
    }

    rc = ORTE_SUCCESS;

CLEANUP:
    /* every temporary is either NULL or still owned here */
    free(vpid_str);
    free(param);
    free(value);
    return rc;
}
|
||||
|
||||
/* define a timer release point so that we can wait for
|
||||
* file descriptors to come available, if necessary
|
||||
*/
|
||||
@ -1186,7 +1291,7 @@ static void timer_cb(int fd, short event, void *cbdata)
|
||||
int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
orte_odls_base_fork_local_proc_fn_t fork_local)
|
||||
{
|
||||
char *job_str, *vpid_str, *param, *value;
|
||||
char *job_str, *param;
|
||||
opal_list_item_t *item;
|
||||
orte_app_context_t *app, **apps;
|
||||
orte_app_idx_t i, num_apps;
|
||||
@ -1198,8 +1303,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
opal_buffer_t alert;
|
||||
orte_std_cntr_t proc_rank;
|
||||
orte_odls_job_t *jobdat;
|
||||
orte_local_rank_t local_rank;
|
||||
orte_node_rank_t node_rank;
|
||||
char *pathenv = NULL, *mpiexec_pathenv = NULL;
|
||||
char basedir[MAXPATHLEN];
|
||||
char dir[MAXPATHLEN];
|
||||
@ -1639,10 +1742,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
@ -1652,97 +1751,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
free(param);
|
||||
free(job_str);
|
||||
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
opal_setenv(param, vpid_str, true, &app->env);
|
||||
free(param);
|
||||
|
||||
/* although the vpid IS the process' rank within the job, users
|
||||
* would appreciate being given a public environmental variable
|
||||
* that also represents this value - something MPI specific - so
|
||||
* do that here.
|
||||
*
|
||||
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
||||
* We know - just live with it
|
||||
*/
|
||||
opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, &app->env);
|
||||
free(vpid_str); /* done with this now */
|
||||
|
||||
/* users would appreciate being given a public environmental variable
|
||||
* that also represents the local rank value - something MPI specific - so
|
||||
* do that here.
|
||||
*
|
||||
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
||||
* We know - just live with it
|
||||
*/
|
||||
if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
goto CLEANUP;
|
||||
}
|
||||
asprintf(&value, "%lu", (unsigned long) local_rank);
|
||||
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
|
||||
free(value);
|
||||
|
||||
/* users would appreciate being given a public environmental variable
|
||||
* that also represents the node rank value - something MPI specific - so
|
||||
* do that here.
|
||||
*
|
||||
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
||||
* We know - just live with it
|
||||
*/
|
||||
if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
goto CLEANUP;
|
||||
}
|
||||
asprintf(&value, "%lu", (unsigned long) node_rank);
|
||||
opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
|
||||
/* set an mca param for it too */
|
||||
if(NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto CLEANUP;
|
||||
}
|
||||
opal_setenv(param, value, true, &app->env);
|
||||
free(param);
|
||||
free(value);
|
||||
|
||||
/* pass the number of restarts for this proc - will be zero for
|
||||
* an initial start, but procs would like to know if they are being
|
||||
* restarted so they can take appropriate action
|
||||
*/
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto CLEANUP;
|
||||
}
|
||||
asprintf(&value, "%d", child->restarts);
|
||||
opal_setenv(param, value, true, &app->env);
|
||||
free(param);
|
||||
free(value);
|
||||
|
||||
/* if the proc should not barrier in orte_init, tell it */
|
||||
if (child->do_not_barrier || 0 < child->restarts) {
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto CLEANUP;
|
||||
}
|
||||
opal_setenv(param, "1", true, &app->env);
|
||||
free(param);
|
||||
}
|
||||
|
||||
/* if the proc isn't going to forward IO, then we need to flag that
|
||||
* it has "completed" iof termination as otherwise it will never fire
|
||||
*/
|
||||
if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
|
||||
child->iof_complete = true;
|
||||
}
|
||||
|
||||
/* if we are timing things, record when we are going to launch this proc */
|
||||
if (orte_timing) {
|
||||
gettimeofday(&child->starttime, NULL);
|
||||
@ -2857,5 +2870,63 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
|
||||
int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||
orte_odls_base_fork_local_proc_fn_t fork_local)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
int rc;
|
||||
orte_app_context_t *app;
|
||||
opal_list_item_t *item;
|
||||
orte_odls_job_t *jobdat;
|
||||
|
||||
/* protect operations involving the global list of children */
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:restart_proc for proc %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
|
||||
/* find this child's jobdat */
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
item != opal_list_get_end(&orte_local_jobdata);
|
||||
item = opal_list_get_next(item)) {
|
||||
jobdat = (orte_odls_job_t*)item;
|
||||
if (jobdat->jobid == child->name->jobid) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
child->exit_code = 0;
|
||||
child->waitpid_recvd = false;
|
||||
child->iof_complete = false;
|
||||
child->coll_recvd = false;
|
||||
child->pid = 0;
|
||||
child->init_recvd = false;
|
||||
child->fini_recvd = false;
|
||||
if (NULL != child->rml_uri) {
|
||||
free(child->rml_uri);
|
||||
child->rml_uri = NULL;
|
||||
}
|
||||
app = jobdat->apps[child->app_idx];
|
||||
/* reset envars to match this child */
|
||||
|
||||
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
rc = fork_local(app, child, app->env, jobdat);
|
||||
if (ORTE_SUCCESS == rc) {
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:restart of proc %s %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name),
|
||||
(ORTE_SUCCESS == rc) ? "succeeded" : "failed"));
|
||||
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
return rc;
|
||||
}
|
||||
|
@ -91,6 +91,23 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
||||
/* store it on the global job data pool */
|
||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||
|
||||
/* if its restart limits have not been set, set them to the defaults */
|
||||
if (jdata->max_global_restarts < 0) {
|
||||
jdata->max_global_restarts = orte_max_global_restarts;
|
||||
}
|
||||
if (jdata->max_local_restarts < 0) {
|
||||
jdata->max_local_restarts = orte_max_local_restarts;
|
||||
}
|
||||
/* consistency check */
|
||||
if (jdata->max_global_restarts <= 0 &&
|
||||
jdata->max_local_restarts <= 0) {
|
||||
jdata->enable_recovery = false;
|
||||
|
||||
} else {
|
||||
jdata->enable_recovery = true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* get the allocation */
|
||||
|
@ -174,6 +174,11 @@ bool orte_report_bindings = false;
|
||||
/* barrier control */
|
||||
bool orte_do_not_barrier = false;
|
||||
|
||||
/* process recovery */
|
||||
bool orte_enable_recovery;
|
||||
int32_t orte_max_global_restarts;
|
||||
int32_t orte_max_local_restarts;
|
||||
|
||||
/* comm fn for updating state */
|
||||
orte_default_comm_fn_t orte_comm;
|
||||
|
||||
@ -266,16 +271,6 @@ int orte_dt_init(void)
|
||||
}
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
/* get a clean output channel too */
|
||||
{
|
||||
opal_output_stream_t lds;
|
||||
OBJ_CONSTRUCT(&lds, opal_output_stream_t);
|
||||
lds.lds_want_stdout = true;
|
||||
orte_clean_output = opal_output_open(&lds);
|
||||
OBJ_DESTRUCT(&lds);
|
||||
|
||||
}
|
||||
|
||||
tmp = ORTE_JOB;
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_job,
|
||||
orte_dt_unpack_job,
|
||||
@ -640,8 +635,9 @@ static void orte_job_construct(orte_job_t* job)
|
||||
OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
|
||||
job->not_reported = true;
|
||||
|
||||
job->max_local_restarts = 0;
|
||||
job->max_global_restarts = 0;
|
||||
job->enable_recovery = false;
|
||||
job->max_local_restarts = -1;
|
||||
job->max_global_restarts = -1;
|
||||
|
||||
job->launch_msg_sent.tv_sec = 0;
|
||||
job->launch_msg_sent.tv_usec = 0;
|
||||
|
@ -390,6 +390,8 @@ typedef struct {
|
||||
bool abort;
|
||||
/* proc that caused that to happen */
|
||||
struct orte_proc_t *aborted_proc;
|
||||
/* enable recovery of these processes */
|
||||
bool enable_recovery;
|
||||
/* max number of times a process can be restarted locally */
|
||||
int32_t max_local_restarts;
|
||||
/* max number of times a process can be relocated to another node */
|
||||
@ -665,6 +667,11 @@ ORTE_DECLSPEC extern bool orte_report_bindings;
|
||||
/* barrier control */
|
||||
ORTE_DECLSPEC extern bool orte_do_not_barrier;
|
||||
|
||||
/* process recovery */
|
||||
ORTE_DECLSPEC extern bool orte_enable_recovery;
|
||||
ORTE_DECLSPEC extern int32_t orte_max_global_restarts;
|
||||
ORTE_DECLSPEC extern int32_t orte_max_local_restarts;
|
||||
|
||||
/* comm interface */
|
||||
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);
|
||||
|
||||
|
@ -54,6 +54,20 @@ int orte_register_params(void)
|
||||
}
|
||||
passed_thru = true;
|
||||
|
||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||
/* get a clean output channel too - need to do this here because
|
||||
* we use it below, and orterun and some other tools call this
|
||||
* function prior to calling orte_init
|
||||
*/
|
||||
{
|
||||
opal_output_stream_t lds;
|
||||
OBJ_CONSTRUCT(&lds, opal_output_stream_t);
|
||||
lds.lds_want_stdout = true;
|
||||
orte_clean_output = opal_output_open(&lds);
|
||||
OBJ_DESTRUCT(&lds);
|
||||
}
|
||||
#endif /* !ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
mca_base_param_reg_int_name("orte", "base_help_aggregate",
|
||||
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
|
||||
false, false,
|
||||
@ -448,6 +462,58 @@ int orte_register_params(void)
|
||||
(int) false, &value);
|
||||
orte_do_not_barrier = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "enable_recovery",
|
||||
"Enable recovery from process failure [Default = disabled]",
|
||||
false, false,
|
||||
(int)false, &value);
|
||||
orte_enable_recovery = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "max_global_restarts",
|
||||
"Max number of times to relocate a failed process to a new node",
|
||||
false, false,
|
||||
-1, &orte_max_global_restarts);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "max_local_restarts",
|
||||
"Max number of times to locally restart a failed process before relocating it to a new node",
|
||||
false, false,
|
||||
-1, &orte_max_local_restarts);
|
||||
if (orte_enable_recovery) {
|
||||
if (orte_max_global_restarts <= 0 &&
|
||||
orte_max_local_restarts <= 0) {
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
opal_output(orte_clean_output,
|
||||
"------------------------------------------------------------\n"
|
||||
"Although the MCA param orte_enable_recovery was set to true,\n"
|
||||
"values for the max number of restarts was not provided:\n\n"
|
||||
"Max global restarts: %d\n"
|
||||
"Max local restarts: %d\n\n"
|
||||
"At least one of these must be a positive value. We are disabling\n"
|
||||
"process recovery, but continuing execution.\n"
|
||||
"------------------------------------------------------------",
|
||||
orte_max_global_restarts, orte_max_local_restarts);
|
||||
}
|
||||
orte_enable_recovery = false;
|
||||
}
|
||||
} else if (orte_max_global_restarts > 0 ||
|
||||
orte_max_local_restarts > 0) {
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
opal_output(orte_clean_output,
|
||||
"------------------------------------------------------------------\n"
|
||||
"The MCA param errmgr_base_enable_recovery was not set to true, but\n"
|
||||
"positive value(s) were provided for the number of restarts:\n\n"
|
||||
"Max global restarts: %d\n"
|
||||
"Max local restarts: %d\n\n"
|
||||
"We are enabling process recovery and continuing execution. To avoid\n"
|
||||
"this warning in the future, please set the errmgr_base_enable_recovery\n"
|
||||
"param to non-zero.\n"
|
||||
"------------------------------------------------------------------",
|
||||
orte_max_global_restarts, orte_max_local_restarts);
|
||||
}
|
||||
orte_enable_recovery = true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -425,6 +425,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||
"Report events to a tool listening at the specified URI" },
|
||||
|
||||
{ "orte", "enable", "recovery", '\0', "enable-recovery", "enable-recovery", 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||
"Enable recovery from process failure [Default = disabled]" },
|
||||
|
||||
{ "orte", "max", "global_restarts", '\0', "max-global-restarts", "max-global-restarts", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Max number of times to relocate a failed process to a new node" },
|
||||
|
||||
{ "orte", "max", "local_restarts", '\0', "max-local-restarts", "max-local-restarts", 1,
|
||||
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||
"Max number of times to locally restart a failed process before relocating it to a new node" },
|
||||
|
||||
/* End of list */
|
||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||
@ -1135,11 +1147,11 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
orte_debugger_finalize();
|
||||
|
||||
/*
|
||||
* Turn off the errmgr recovery functionality, if it was enabled.
|
||||
* Turn off the process recovery functionality, if it was enabled.
|
||||
* This keeps the errmgr from trying to recover from the shutdown
|
||||
* procedure.
|
||||
*/
|
||||
orte_errmgr_base.enable_recovery = false;
|
||||
orte_enable_recovery = false;
|
||||
orte_errmgr_base.shutting_down = true;
|
||||
|
||||
/* terminate the orteds - they will automatically kill
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user