diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index 9c65391065..16a99cdbd9 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -443,12 +443,6 @@ static void proc_errors(int fd, short args, void *cbdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* record the first one to fail */ if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { - /* output an error message so the user knows what happened */ - orte_show_help("help-errmgr-base.txt", "node-died", true, - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_process_info.nodename, - ORTE_NAME_PRINT(proc), - pptr->node->name); /* mark the daemon job as failed */ jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ @@ -456,14 +450,25 @@ static void proc_errors(int fd, short args, void *cbdata) /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); - /* update our exit code */ - ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); - /* just in case the exit code hadn't been set, do it here - this - * won't override any reported exit code */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE); + if (!orte_enable_recovery) { + /* output an error message so the user knows what happened */ + orte_show_help("help-errmgr-base.txt", "node-died", true, + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_process_info.nodename, + ORTE_NAME_PRINT(proc), + pptr->node->name); + /* update our exit code */ + ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); + /* just in case the exit code hadn't been set, do it here - this + * won't override any reported exit code */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE); + } + } + /* if recovery is enabled, then we are done - otherwise, + * abort the system */ + if (!orte_enable_recovery) { + default_hnp_abort(jdata); } - /* abort the system */ - default_hnp_abort(jdata); goto cleanup; } @@ -498,7 +503,8 @@ static void proc_errors(int fd, short args, void *cbdata) keep_going: /* if this is a continuously operating job, then there is nothing more * to do - we let the job continue to run */ - if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) { + if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) || + ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) { /* always mark the waitpid as having fired */ ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED); /* if this is a remote proc, we won't hear anything more about it diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c index 2944a86f57..5ac4fed36a 100644 --- a/orte/mca/plm/slurm/plm_slurm_module.c +++ b/orte/mca/plm/slurm/plm_slurm_module.c @@ -267,8 +267,10 @@ static void launch_daemons(int fd, short args, void *cbdata) /* start one orted on each node */ opal_argv_append(&argc, &argv, "--ntasks-per-node=1"); - /* alert us if any orteds die during startup */ - opal_argv_append(&argc, &argv, "--kill-on-bad-exit"); + if (!orte_enable_recovery) { + /* kill the job if any orteds die */ + opal_argv_append(&argc, &argv, "--kill-on-bad-exit"); + } /* ensure the orteds are not bound to a single processor, * just in case the TaskAffinity option is set by default. diff --git a/orte/tools/orte-clean/orte-clean.c b/orte/tools/orte-clean/orte-clean.c index c69620ab6b..fbbc04b5ff 100644 --- a/orte/tools/orte-clean/orte-clean.c +++ b/orte/tools/orte-clean/orte-clean.c @@ -183,7 +183,7 @@ main(int argc, char *argv[]) free(legacy); /* and finally get rid of any lingering pmix-related artifacts */ - asprintf(&legacy, "rm -f %s/pmix*", orte_process_info.tmpdir_base); + asprintf(&legacy, "rm -rf %s/pmix*", orte_process_info.tmpdir_base); system(legacy); free(legacy);