Restore process recovery for procs local to mpirun (first step towards restoring full capability). Define three new MCA params:
1. orte_enable_recovery - default recovery policy; can be overridden on a per-job basis
2. orte_max_local_restarts - default max number of local restarts; can be overridden
3. orte_max_global_restarts - default max number of relocations; can be overridden
Implement the restart_proc API for the ODLS framework, and reorganize the default functions a little to avoid duplicating code.
This commit was SVN r23057.
Этот коммит содержится в:
родитель
f064056a07
Коммит
319758e3e0
@ -173,17 +173,6 @@ int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc,
|
|||||||
int i, rc;
|
int i, rc;
|
||||||
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
|
orte_errmgr_stack_state_t stack_state = ORTE_ERRMGR_STACK_STATE_NONE;
|
||||||
|
|
||||||
/*
|
|
||||||
* If the user did not ask for recovery, then do not process recovery events
|
|
||||||
*/
|
|
||||||
if( !orte_errmgr_base.enable_recovery ) {
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
|
||||||
"errmgr:base:suggest_map_targets() %s) "
|
|
||||||
"------- Recovery currently disabled! Skipping...",
|
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) ));
|
|
||||||
return ORTE_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||||
"errmgr:base:suggest_map_targets() %s) "
|
"errmgr:base:suggest_map_targets() %s) "
|
||||||
"------- Notifying components... (%3d active components)",
|
"------- Notifying components... (%3d active components)",
|
||||||
|
@ -39,6 +39,7 @@
|
|||||||
#include "opal/util/trace.h"
|
#include "opal/util/trace.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
|
||||||
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/mca/errmgr/base/base.h"
|
#include "orte/mca/errmgr/base/base.h"
|
||||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||||
|
|
||||||
@ -47,10 +48,6 @@
|
|||||||
/*
|
/*
|
||||||
* Globals
|
* Globals
|
||||||
*/
|
*/
|
||||||
int orte_errmgr_base_output = -1;
|
|
||||||
bool orte_errmgr_base_enable_recovery = false;
|
|
||||||
bool orte_errmgr_base_shutting_down = false;
|
|
||||||
bool orte_errmgr_initialized = false;
|
|
||||||
opal_list_t orte_errmgr_base_components_available;
|
opal_list_t orte_errmgr_base_components_available;
|
||||||
|
|
||||||
orte_errmgr_base_t orte_errmgr_base;
|
orte_errmgr_base_t orte_errmgr_base;
|
||||||
@ -70,8 +67,6 @@ orte_errmgr_API_t orte_errmgr = {
|
|||||||
*/
|
*/
|
||||||
int orte_errmgr_base_open(void)
|
int orte_errmgr_base_open(void)
|
||||||
{
|
{
|
||||||
int value;
|
|
||||||
|
|
||||||
OPAL_TRACE(5);
|
OPAL_TRACE(5);
|
||||||
|
|
||||||
/* Only pass this way once */
|
/* Only pass this way once */
|
||||||
@ -84,40 +79,6 @@ int orte_errmgr_base_open(void)
|
|||||||
|
|
||||||
orte_errmgr_base.output = opal_output_open(NULL);
|
orte_errmgr_base.output = opal_output_open(NULL);
|
||||||
|
|
||||||
mca_base_param_reg_int_name("errmgr",
|
|
||||||
"base_enable_recovery",
|
|
||||||
"If the ErrMgr recovery components should be enabled."
|
|
||||||
" [Default = disabled]",
|
|
||||||
false, false,
|
|
||||||
0, &value);
|
|
||||||
orte_errmgr_base.enable_recovery = OPAL_INT_TO_BOOL(value);
|
|
||||||
|
|
||||||
mca_base_param_reg_int_name("errmgr",
|
|
||||||
"max_global_restarts",
|
|
||||||
"Max number of times to relocate a failed process to a new node",
|
|
||||||
false, false,
|
|
||||||
-1, &orte_errmgr_base.max_global_restarts);
|
|
||||||
|
|
||||||
mca_base_param_reg_int_name("errmgr",
|
|
||||||
"max_local_restarts",
|
|
||||||
"Max number of times to locally restart a failed process before relocating it to a new node",
|
|
||||||
false, false,
|
|
||||||
-1, &orte_errmgr_base.max_local_restarts);
|
|
||||||
|
|
||||||
if (orte_errmgr_base.enable_recovery) {
|
|
||||||
if (orte_errmgr_base.max_global_restarts < 0 ) {
|
|
||||||
orte_errmgr_base.max_global_restarts = 3;
|
|
||||||
}
|
|
||||||
if (orte_errmgr_base.max_local_restarts < 0) {
|
|
||||||
orte_errmgr_base.max_local_restarts = 3;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (orte_errmgr_base.max_local_restarts > 0 ||
|
|
||||||
orte_errmgr_base.max_global_restarts > 0) {
|
|
||||||
orte_errmgr_base.enable_recovery = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* A flag to indicate that orterun is shutting down, so skip the recovery
|
* A flag to indicate that orterun is shutting down, so skip the recovery
|
||||||
* logic.
|
* logic.
|
||||||
|
@ -43,11 +43,8 @@ BEGIN_C_DECLS
|
|||||||
typedef struct {
|
typedef struct {
|
||||||
int output;
|
int output;
|
||||||
bool shutting_down;
|
bool shutting_down;
|
||||||
bool enable_recovery;
|
|
||||||
opal_pointer_array_t modules;
|
opal_pointer_array_t modules;
|
||||||
bool initialized;
|
bool initialized;
|
||||||
int max_global_restarts;
|
|
||||||
int max_local_restarts;
|
|
||||||
} orte_errmgr_base_t;
|
} orte_errmgr_base_t;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
ORTE_DECLSPEC extern orte_errmgr_base_t orte_errmgr_base;
|
||||||
|
@ -113,6 +113,9 @@ static int update_state(orte_jobid_t job,
|
|||||||
{
|
{
|
||||||
orte_job_t *jdata;
|
orte_job_t *jdata;
|
||||||
orte_exit_code_t sts;
|
orte_exit_code_t sts;
|
||||||
|
orte_odls_child_t *child;
|
||||||
|
opal_list_item_t *item;
|
||||||
|
int rc;
|
||||||
|
|
||||||
/* indicate that this is the end of the line */
|
/* indicate that this is the end of the line */
|
||||||
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
*stack_state |= ORTE_ERRMGR_STACK_STATE_COMPLETE;
|
||||||
@ -251,7 +254,29 @@ static int update_state(orte_jobid_t job,
|
|||||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||||
case ORTE_PROC_STATE_COMM_FAILED:
|
case ORTE_PROC_STATE_COMM_FAILED:
|
||||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
if (jdata->enable_recovery) {
|
||||||
|
/* is this a local proc */
|
||||||
|
child = NULL;
|
||||||
|
for (item = opal_list_get_first(&orte_local_children);
|
||||||
|
item != opal_list_get_end(&orte_local_children);
|
||||||
|
item = opal_list_get_next(item)) {
|
||||||
|
child = (orte_odls_child_t*)item;
|
||||||
|
if (child->name->jobid == proc->jobid &&
|
||||||
|
child->name->vpid == proc->vpid) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (NULL != child) {
|
||||||
|
/* see if this child has reached its local restart limit */
|
||||||
|
if (child->restarts < jdata->max_local_restarts) {
|
||||||
|
child->restarts++;
|
||||||
|
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
/* let it fall thru to abort */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
update_proc(jdata, proc, state, exit_code);
|
update_proc(jdata, proc, state, exit_code);
|
||||||
check_job_complete(jdata); /* need to set the job state */
|
check_job_complete(jdata); /* need to set the job state */
|
||||||
/* the job object for this job will have been NULL'd
|
/* the job object for this job will have been NULL'd
|
||||||
@ -264,6 +289,7 @@ static int update_state(orte_jobid_t job,
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||||
|
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||||
update_proc(jdata, proc, state, exit_code);
|
update_proc(jdata, proc, state, exit_code);
|
||||||
check_job_complete(jdata);
|
check_job_complete(jdata);
|
||||||
/* the job object for this job will have been NULL'd
|
/* the job object for this job will have been NULL'd
|
||||||
@ -466,7 +492,8 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
|
static void update_proc(orte_job_t *jdata,
|
||||||
|
orte_process_name_t *proc,
|
||||||
orte_proc_state_t state,
|
orte_proc_state_t state,
|
||||||
orte_exit_code_t exit_code)
|
orte_exit_code_t exit_code)
|
||||||
{
|
{
|
||||||
@ -489,8 +516,11 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
|
|||||||
proct->state = state;
|
proct->state = state;
|
||||||
proct->exit_code = exit_code;
|
proct->exit_code = exit_code;
|
||||||
if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
if (ORTE_PROC_STATE_UNTERMINATED < state) {
|
||||||
opal_list_remove_item(&orte_local_children, &child->super);
|
if (!jdata->enable_recovery) {
|
||||||
OBJ_RELEASE(child);
|
opal_output(0, "JDATA NOT ENABLED FOR RECOVERY");
|
||||||
|
opal_list_remove_item(&orte_local_children, &child->super);
|
||||||
|
OBJ_RELEASE(child);
|
||||||
|
}
|
||||||
jdata->num_terminated++;
|
jdata->num_terminated++;
|
||||||
} else if (ORTE_PROC_STATE_RUNNING == state) {
|
} else if (ORTE_PROC_STATE_RUNNING == state) {
|
||||||
jdata->num_launched++;
|
jdata->num_launched++;
|
||||||
|
@ -1166,6 +1166,111 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
|
|||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env)
|
||||||
|
{
|
||||||
|
char *vpid_str, *param, *value;
|
||||||
|
orte_node_rank_t node_rank;
|
||||||
|
orte_local_rank_t local_rank;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
if (NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
opal_setenv(param, vpid_str, true, env);
|
||||||
|
free(param);
|
||||||
|
|
||||||
|
/* although the vpid IS the process' rank within the job, users
|
||||||
|
* would appreciate being given a public environmental variable
|
||||||
|
* that also represents this value - something MPI specific - so
|
||||||
|
* do that here.
|
||||||
|
*
|
||||||
|
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
||||||
|
* We know - just live with it
|
||||||
|
*/
|
||||||
|
opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, env);
|
||||||
|
free(vpid_str); /* done with this now */
|
||||||
|
|
||||||
|
/* users would appreciate being given a public environmental variable
|
||||||
|
* that also represents the local rank value - something MPI specific - so
|
||||||
|
* do that here.
|
||||||
|
*
|
||||||
|
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
||||||
|
* We know - just live with it
|
||||||
|
*/
|
||||||
|
if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||||
|
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
asprintf(&value, "%lu", (unsigned long) local_rank);
|
||||||
|
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, env);
|
||||||
|
free(value);
|
||||||
|
|
||||||
|
/* users would appreciate being given a public environmental variable
|
||||||
|
* that also represents the node rank value - something MPI specific - so
|
||||||
|
* do that here.
|
||||||
|
*
|
||||||
|
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
||||||
|
* We know - just live with it
|
||||||
|
*/
|
||||||
|
if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
||||||
|
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
asprintf(&value, "%lu", (unsigned long) node_rank);
|
||||||
|
opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, env);
|
||||||
|
/* set an mca param for it too */
|
||||||
|
if(NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
opal_setenv(param, value, true, env);
|
||||||
|
free(param);
|
||||||
|
free(value);
|
||||||
|
|
||||||
|
/* pass the number of restarts for this proc - will be zero for
|
||||||
|
* an initial start, but procs would like to know if they are being
|
||||||
|
* restarted so they can take appropriate action
|
||||||
|
*/
|
||||||
|
if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
asprintf(&value, "%d", child->restarts);
|
||||||
|
opal_setenv(param, value, true, env);
|
||||||
|
free(param);
|
||||||
|
free(value);
|
||||||
|
|
||||||
|
/* if the proc should not barrier in orte_init, tell it */
|
||||||
|
if (child->do_not_barrier || 0 < child->restarts) {
|
||||||
|
if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) {
|
||||||
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
opal_setenv(param, "1", true, env);
|
||||||
|
free(param);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* if the proc isn't going to forward IO, then we need to flag that
|
||||||
|
* it has "completed" iof termination as otherwise it will never fire
|
||||||
|
*/
|
||||||
|
if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
|
||||||
|
child->iof_complete = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/* define a timer release point so that we can wait for
|
/* define a timer release point so that we can wait for
|
||||||
* file descriptors to come available, if necessary
|
* file descriptors to come available, if necessary
|
||||||
*/
|
*/
|
||||||
@ -1186,7 +1291,7 @@ static void timer_cb(int fd, short event, void *cbdata)
|
|||||||
int orte_odls_base_default_launch_local(orte_jobid_t job,
|
int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||||
orte_odls_base_fork_local_proc_fn_t fork_local)
|
orte_odls_base_fork_local_proc_fn_t fork_local)
|
||||||
{
|
{
|
||||||
char *job_str, *vpid_str, *param, *value;
|
char *job_str, *param;
|
||||||
opal_list_item_t *item;
|
opal_list_item_t *item;
|
||||||
orte_app_context_t *app, **apps;
|
orte_app_context_t *app, **apps;
|
||||||
orte_app_idx_t i, num_apps;
|
orte_app_idx_t i, num_apps;
|
||||||
@ -1198,8 +1303,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
|||||||
opal_buffer_t alert;
|
opal_buffer_t alert;
|
||||||
orte_std_cntr_t proc_rank;
|
orte_std_cntr_t proc_rank;
|
||||||
orte_odls_job_t *jobdat;
|
orte_odls_job_t *jobdat;
|
||||||
orte_local_rank_t local_rank;
|
|
||||||
orte_node_rank_t node_rank;
|
|
||||||
char *pathenv = NULL, *mpiexec_pathenv = NULL;
|
char *pathenv = NULL, *mpiexec_pathenv = NULL;
|
||||||
char basedir[MAXPATHLEN];
|
char basedir[MAXPATHLEN];
|
||||||
char dir[MAXPATHLEN];
|
char dir[MAXPATHLEN];
|
||||||
@ -1639,10 +1742,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
|||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) {
|
|
||||||
ORTE_ERROR_LOG(rc);
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
|
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||||
@ -1652,97 +1751,11 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
|||||||
free(param);
|
free(param);
|
||||||
free(job_str);
|
free(job_str);
|
||||||
|
|
||||||
if (NULL == (param = mca_base_param_environ_variable("orte","ess","vpid"))) {
|
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
ORTE_ERROR_LOG(rc);
|
||||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
||||||
goto CLEANUP;
|
goto CLEANUP;
|
||||||
}
|
}
|
||||||
opal_setenv(param, vpid_str, true, &app->env);
|
|
||||||
free(param);
|
|
||||||
|
|
||||||
/* although the vpid IS the process' rank within the job, users
|
|
||||||
* would appreciate being given a public environmental variable
|
|
||||||
* that also represents this value - something MPI specific - so
|
|
||||||
* do that here.
|
|
||||||
*
|
|
||||||
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
|
||||||
* We know - just live with it
|
|
||||||
*/
|
|
||||||
opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, &app->env);
|
|
||||||
free(vpid_str); /* done with this now */
|
|
||||||
|
|
||||||
/* users would appreciate being given a public environmental variable
|
|
||||||
* that also represents the local rank value - something MPI specific - so
|
|
||||||
* do that here.
|
|
||||||
*
|
|
||||||
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
|
||||||
* We know - just live with it
|
|
||||||
*/
|
|
||||||
if (ORTE_LOCAL_RANK_INVALID == (local_rank = orte_ess.get_local_rank(child->name))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
|
||||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
asprintf(&value, "%lu", (unsigned long) local_rank);
|
|
||||||
opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
|
|
||||||
free(value);
|
|
||||||
|
|
||||||
/* users would appreciate being given a public environmental variable
|
|
||||||
* that also represents the node rank value - something MPI specific - so
|
|
||||||
* do that here.
|
|
||||||
*
|
|
||||||
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
|
||||||
* We know - just live with it
|
|
||||||
*/
|
|
||||||
if (ORTE_NODE_RANK_INVALID == (node_rank = orte_ess.get_node_rank(child->name))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
|
|
||||||
rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
asprintf(&value, "%lu", (unsigned long) node_rank);
|
|
||||||
opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
|
|
||||||
/* set an mca param for it too */
|
|
||||||
if(NULL == (param = mca_base_param_environ_variable("orte","ess","node_rank"))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
||||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
opal_setenv(param, value, true, &app->env);
|
|
||||||
free(param);
|
|
||||||
free(value);
|
|
||||||
|
|
||||||
/* pass the number of restarts for this proc - will be zero for
|
|
||||||
* an initial start, but procs would like to know if they are being
|
|
||||||
* restarted so they can take appropriate action
|
|
||||||
*/
|
|
||||||
if (NULL == (param = mca_base_param_environ_variable("orte","num","restarts"))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
||||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
asprintf(&value, "%d", child->restarts);
|
|
||||||
opal_setenv(param, value, true, &app->env);
|
|
||||||
free(param);
|
|
||||||
free(value);
|
|
||||||
|
|
||||||
/* if the proc should not barrier in orte_init, tell it */
|
|
||||||
if (child->do_not_barrier || 0 < child->restarts) {
|
|
||||||
if (NULL == (param = mca_base_param_environ_variable("orte","do_not","barrier"))) {
|
|
||||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
|
||||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
|
||||||
goto CLEANUP;
|
|
||||||
}
|
|
||||||
opal_setenv(param, "1", true, &app->env);
|
|
||||||
free(param);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if the proc isn't going to forward IO, then we need to flag that
|
|
||||||
* it has "completed" iof termination as otherwise it will never fire
|
|
||||||
*/
|
|
||||||
if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) {
|
|
||||||
child->iof_complete = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if we are timing things, record when we are going to launch this proc */
|
/* if we are timing things, record when we are going to launch this proc */
|
||||||
if (orte_timing) {
|
if (orte_timing) {
|
||||||
gettimeofday(&child->starttime, NULL);
|
gettimeofday(&child->starttime, NULL);
|
||||||
@ -2857,5 +2870,63 @@ int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
|
|||||||
int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||||
orte_odls_base_fork_local_proc_fn_t fork_local)
|
orte_odls_base_fork_local_proc_fn_t fork_local)
|
||||||
{
|
{
|
||||||
return ORTE_SUCCESS;
|
int rc;
|
||||||
|
orte_app_context_t *app;
|
||||||
|
opal_list_item_t *item;
|
||||||
|
orte_odls_job_t *jobdat;
|
||||||
|
|
||||||
|
/* protect operations involving the global list of children */
|
||||||
|
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||||
|
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:restart_proc for proc %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
|
||||||
|
/* find this child's jobdat */
|
||||||
|
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||||
|
item != opal_list_get_end(&orte_local_jobdata);
|
||||||
|
item = opal_list_get_next(item)) {
|
||||||
|
jobdat = (orte_odls_job_t*)item;
|
||||||
|
if (jobdat->jobid == child->name->jobid) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
child->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||||
|
child->exit_code = 0;
|
||||||
|
child->waitpid_recvd = false;
|
||||||
|
child->iof_complete = false;
|
||||||
|
child->coll_recvd = false;
|
||||||
|
child->pid = 0;
|
||||||
|
child->init_recvd = false;
|
||||||
|
child->fini_recvd = false;
|
||||||
|
if (NULL != child->rml_uri) {
|
||||||
|
free(child->rml_uri);
|
||||||
|
child->rml_uri = NULL;
|
||||||
|
}
|
||||||
|
app = jobdat->apps[child->app_idx];
|
||||||
|
/* reset envars to match this child */
|
||||||
|
|
||||||
|
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
|
||||||
|
ORTE_ERROR_LOG(rc);
|
||||||
|
goto CLEANUP;
|
||||||
|
}
|
||||||
|
rc = fork_local(app, child, app->env, jobdat);
|
||||||
|
if (ORTE_SUCCESS == rc) {
|
||||||
|
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||||
|
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
|
||||||
|
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
CLEANUP:
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:restart of proc %s %s",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name),
|
||||||
|
(ORTE_SUCCESS == rc) ? "succeeded" : "failed"));
|
||||||
|
|
||||||
|
opal_condition_signal(&orte_odls_globals.cond);
|
||||||
|
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||||
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -91,6 +91,23 @@ int orte_plm_base_setup_job(orte_job_t *jdata)
|
|||||||
/* store it on the global job data pool */
|
/* store it on the global job data pool */
|
||||||
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
ljob = ORTE_LOCAL_JOBID(jdata->jobid);
|
||||||
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
opal_pointer_array_set_item(orte_job_data, ljob, jdata);
|
||||||
|
|
||||||
|
/* if its restart limits have not been set, set them to the defaults */
|
||||||
|
if (jdata->max_global_restarts < 0) {
|
||||||
|
jdata->max_global_restarts = orte_max_global_restarts;
|
||||||
|
}
|
||||||
|
if (jdata->max_local_restarts < 0) {
|
||||||
|
jdata->max_local_restarts = orte_max_local_restarts;
|
||||||
|
}
|
||||||
|
/* consistency check */
|
||||||
|
if (jdata->max_global_restarts <= 0 &&
|
||||||
|
jdata->max_local_restarts <= 0) {
|
||||||
|
jdata->enable_recovery = false;
|
||||||
|
|
||||||
|
} else {
|
||||||
|
jdata->enable_recovery = true;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get the allocation */
|
/* get the allocation */
|
||||||
|
@ -174,6 +174,11 @@ bool orte_report_bindings = false;
|
|||||||
/* barrier control */
|
/* barrier control */
|
||||||
bool orte_do_not_barrier = false;
|
bool orte_do_not_barrier = false;
|
||||||
|
|
||||||
|
/* process recovery */
|
||||||
|
bool orte_enable_recovery;
|
||||||
|
int32_t orte_max_global_restarts;
|
||||||
|
int32_t orte_max_local_restarts;
|
||||||
|
|
||||||
/* comm fn for updating state */
|
/* comm fn for updating state */
|
||||||
orte_default_comm_fn_t orte_comm;
|
orte_default_comm_fn_t orte_comm;
|
||||||
|
|
||||||
@ -266,16 +271,6 @@ int orte_dt_init(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if !ORTE_DISABLE_FULL_SUPPORT
|
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||||
/* get a clean output channel too */
|
|
||||||
{
|
|
||||||
opal_output_stream_t lds;
|
|
||||||
OBJ_CONSTRUCT(&lds, opal_output_stream_t);
|
|
||||||
lds.lds_want_stdout = true;
|
|
||||||
orte_clean_output = opal_output_open(&lds);
|
|
||||||
OBJ_DESTRUCT(&lds);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp = ORTE_JOB;
|
tmp = ORTE_JOB;
|
||||||
if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_job,
|
if (ORTE_SUCCESS != (rc = opal_dss.register_type(orte_dt_pack_job,
|
||||||
orte_dt_unpack_job,
|
orte_dt_unpack_job,
|
||||||
@ -640,8 +635,9 @@ static void orte_job_construct(orte_job_t* job)
|
|||||||
OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
|
OBJ_CONSTRUCT(&job->reported_cond, opal_condition_t);
|
||||||
job->not_reported = true;
|
job->not_reported = true;
|
||||||
|
|
||||||
job->max_local_restarts = 0;
|
job->enable_recovery = false;
|
||||||
job->max_global_restarts = 0;
|
job->max_local_restarts = -1;
|
||||||
|
job->max_global_restarts = -1;
|
||||||
|
|
||||||
job->launch_msg_sent.tv_sec = 0;
|
job->launch_msg_sent.tv_sec = 0;
|
||||||
job->launch_msg_sent.tv_usec = 0;
|
job->launch_msg_sent.tv_usec = 0;
|
||||||
|
@ -390,6 +390,8 @@ typedef struct {
|
|||||||
bool abort;
|
bool abort;
|
||||||
/* proc that caused that to happen */
|
/* proc that caused that to happen */
|
||||||
struct orte_proc_t *aborted_proc;
|
struct orte_proc_t *aborted_proc;
|
||||||
|
/* enable recovery of these processes */
|
||||||
|
bool enable_recovery;
|
||||||
/* max number of times a process can be restarted locally */
|
/* max number of times a process can be restarted locally */
|
||||||
int32_t max_local_restarts;
|
int32_t max_local_restarts;
|
||||||
/* max number of times a process can be relocated to another node */
|
/* max number of times a process can be relocated to another node */
|
||||||
@ -665,6 +667,11 @@ ORTE_DECLSPEC extern bool orte_report_bindings;
|
|||||||
/* barrier control */
|
/* barrier control */
|
||||||
ORTE_DECLSPEC extern bool orte_do_not_barrier;
|
ORTE_DECLSPEC extern bool orte_do_not_barrier;
|
||||||
|
|
||||||
|
/* process recovery */
|
||||||
|
ORTE_DECLSPEC extern bool orte_enable_recovery;
|
||||||
|
ORTE_DECLSPEC extern int32_t orte_max_global_restarts;
|
||||||
|
ORTE_DECLSPEC extern int32_t orte_max_local_restarts;
|
||||||
|
|
||||||
/* comm interface */
|
/* comm interface */
|
||||||
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);
|
typedef void (*orte_default_cbfunc_t)(int fd, short event, void *data);
|
||||||
|
|
||||||
|
@ -54,6 +54,20 @@ int orte_register_params(void)
|
|||||||
}
|
}
|
||||||
passed_thru = true;
|
passed_thru = true;
|
||||||
|
|
||||||
|
#if !ORTE_DISABLE_FULL_SUPPORT
|
||||||
|
/* get a clean output channel too - need to do this here because
|
||||||
|
* we use it below, and orterun and some other tools call this
|
||||||
|
* function prior to calling orte_init
|
||||||
|
*/
|
||||||
|
{
|
||||||
|
opal_output_stream_t lds;
|
||||||
|
OBJ_CONSTRUCT(&lds, opal_output_stream_t);
|
||||||
|
lds.lds_want_stdout = true;
|
||||||
|
orte_clean_output = opal_output_open(&lds);
|
||||||
|
OBJ_DESTRUCT(&lds);
|
||||||
|
}
|
||||||
|
#endif /* !ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
mca_base_param_reg_int_name("orte", "base_help_aggregate",
|
mca_base_param_reg_int_name("orte", "base_help_aggregate",
|
||||||
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
|
"If orte_base_help_aggregate is true, duplicate help messages will be aggregated rather than displayed individually. This can be helpful for parallel jobs that experience multiple identical failures; rather than print out the same help/failure message N times, display it once with a count of how many processes sent the same message.",
|
||||||
false, false,
|
false, false,
|
||||||
@ -448,6 +462,58 @@ int orte_register_params(void)
|
|||||||
(int) false, &value);
|
(int) false, &value);
|
||||||
orte_do_not_barrier = OPAL_INT_TO_BOOL(value);
|
orte_do_not_barrier = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("orte", "enable_recovery",
|
||||||
|
"Enable recovery from process failure [Default = disabled]",
|
||||||
|
false, false,
|
||||||
|
(int)false, &value);
|
||||||
|
orte_enable_recovery = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("orte", "max_global_restarts",
|
||||||
|
"Max number of times to relocate a failed process to a new node",
|
||||||
|
false, false,
|
||||||
|
-1, &orte_max_global_restarts);
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("orte", "max_local_restarts",
|
||||||
|
"Max number of times to locally restart a failed process before relocating it to a new node",
|
||||||
|
false, false,
|
||||||
|
-1, &orte_max_local_restarts);
|
||||||
|
if (orte_enable_recovery) {
|
||||||
|
if (orte_max_global_restarts <= 0 &&
|
||||||
|
orte_max_local_restarts <= 0) {
|
||||||
|
if (ORTE_PROC_IS_HNP) {
|
||||||
|
opal_output(orte_clean_output,
|
||||||
|
"------------------------------------------------------------\n"
|
||||||
|
"Although the MCA param orte_enable_recovery was set to true,\n"
|
||||||
|
"values for the max number of restarts was not provided:\n\n"
|
||||||
|
"Max global restarts: %d\n"
|
||||||
|
"Max local restarts: %d\n\n"
|
||||||
|
"At least one of these must be a positive value. We are disabling\n"
|
||||||
|
"process recovery, but continuing execution.\n"
|
||||||
|
"------------------------------------------------------------",
|
||||||
|
orte_max_global_restarts, orte_max_local_restarts);
|
||||||
|
}
|
||||||
|
orte_enable_recovery = false;
|
||||||
|
}
|
||||||
|
} else if (orte_max_global_restarts > 0 ||
|
||||||
|
orte_max_local_restarts > 0) {
|
||||||
|
if (ORTE_PROC_IS_HNP) {
|
||||||
|
opal_output(orte_clean_output,
|
||||||
|
"------------------------------------------------------------------\n"
|
||||||
|
"The MCA param errmgr_base_enable_recovery was not set to true, but\n"
|
||||||
|
"positive value(s) were provided for the number of restarts:\n\n"
|
||||||
|
"Max global restarts: %d\n"
|
||||||
|
"Max local restarts: %d\n\n"
|
||||||
|
"We are enabling process recovery and continuing execution. To avoid\n"
|
||||||
|
"this warning in the future, please set the errmgr_base_enable_recovery\n"
|
||||||
|
"param to non-zero.\n"
|
||||||
|
"------------------------------------------------------------------",
|
||||||
|
orte_max_global_restarts, orte_max_local_restarts);
|
||||||
|
}
|
||||||
|
orte_enable_recovery = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
#endif /* ORTE_DISABLE_FULL_SUPPORT */
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
@ -425,6 +425,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
NULL, OPAL_CMD_LINE_TYPE_STRING,
|
||||||
"Report events to a tool listening at the specified URI" },
|
"Report events to a tool listening at the specified URI" },
|
||||||
|
|
||||||
|
{ "orte", "enable", "recovery", '\0', "enable-recovery", "enable-recovery", 0,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_BOOL,
|
||||||
|
"Enable recovery from process failure [Default = disabled]" },
|
||||||
|
|
||||||
|
{ "orte", "max", "global_restarts", '\0', "max-global-restarts", "max-global-restarts", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Max number of times to relocate a failed process to a new node" },
|
||||||
|
|
||||||
|
{ "orte", "max", "local_restarts", '\0', "max-local-restarts", "max-local-restarts", 1,
|
||||||
|
NULL, OPAL_CMD_LINE_TYPE_INT,
|
||||||
|
"Max number of times to locally restart a failed process before relocating it to a new node" },
|
||||||
|
|
||||||
/* End of list */
|
/* End of list */
|
||||||
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
{ NULL, NULL, NULL, '\0', NULL, NULL, 0,
|
||||||
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL }
|
||||||
@ -1135,11 +1147,11 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
|||||||
orte_debugger_finalize();
|
orte_debugger_finalize();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Turn off the errmgr recovery functionality, if it was enabled.
|
* Turn off the process recovery functionality, if it was enabled.
|
||||||
* This keeps the errmgr from trying to recover from the shutdown
|
* This keeps the errmgr from trying to recover from the shutdown
|
||||||
* procedure.
|
* procedure.
|
||||||
*/
|
*/
|
||||||
orte_errmgr_base.enable_recovery = false;
|
orte_enable_recovery = false;
|
||||||
orte_errmgr_base.shutting_down = true;
|
orte_errmgr_base.shutting_down = true;
|
||||||
|
|
||||||
/* terminate the orteds - they will automatically kill
|
/* terminate the orteds - they will automatically kill
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user