Enable ORTE to continue running when a node fails - the user takes responsibility for any zombies. Minor cleanup to orte-clean.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-06-27 09:05:26 -07:00 · 2017-06-27 09:05:26 -07:00 · 8a4565874e
--- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
+++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
@ -443,12 +443,6 @@ static void proc_errors(int fd, short args, void *cbdata)
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
        /* record the first one to fail */
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
-            /* output an error message so the user knows what happened */
-            orte_show_help("help-errmgr-base.txt", "node-died", true,
-                           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                           orte_process_info.nodename,
-                           ORTE_NAME_PRINT(proc),
-                           pptr->node->name);
            /* mark the daemon job as failed */
            jdata->state = ORTE_JOB_STATE_COMM_FAILED;
            /* point to the lowest rank to cause the problem */
@ -456,14 +450,25 @@ static void proc_errors(int fd, short args, void *cbdata)
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
-            /* update our exit code */
-            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
-            /* just in case the exit code hadn't been set, do it here - this
-             * won't override any reported exit code */
-            ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
+            if (!orte_enable_recovery) {
+                /* output an error message so the user knows what happened */
+                orte_show_help("help-errmgr-base.txt", "node-died", true,
+                               ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                               orte_process_info.nodename,
+                               ORTE_NAME_PRINT(proc),
+                               pptr->node->name);
+                /* update our exit code */
+                ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
+                /* just in case the exit code hadn't been set, do it here - this
+                 * won't override any reported exit code */
+                ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
+            }
+        }
+        /* if recovery is enabled, then we are done - otherwise,
+         * abort the system */
+        if (!orte_enable_recovery) {
+            default_hnp_abort(jdata);
        }
-        /* abort the system */
-        default_hnp_abort(jdata);
        goto cleanup;
    }

@ -498,7 +503,8 @@ static void proc_errors(int fd, short args, void *cbdata)
  keep_going:
    /* if this is a continuously operating job, then there is nothing more
     * to do - we let the job continue to run */
-    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
+    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
+        ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
        /* always mark the waitpid as having fired */
        ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
        /* if this is a remote proc, we won't hear anything more about it
--- a/orte/mca/plm/slurm/plm_slurm_module.c
+++ b/orte/mca/plm/slurm/plm_slurm_module.c
@ -267,8 +267,10 @@ static void launch_daemons(int fd, short args, void *cbdata)
    /* start one orted on each node */
    opal_argv_append(&argc, &argv, "--ntasks-per-node=1");

-    /* alert us if any orteds die during startup */
-    opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
+    if (!orte_enable_recovery) {
+        /* kill the job if any orteds die */
+        opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
+    }

    /* ensure the orteds are not bound to a single processor,
     * just in case the TaskAffinity option is set by default.
--- a/orte/tools/orte-clean/orte-clean.c
+++ b/orte/tools/orte-clean/orte-clean.c
@ -183,7 +183,7 @@ main(int argc, char *argv[])
    free(legacy);

    /* and finally get rid of any lingering pmix-related artifacts */
-    asprintf(&legacy, "rm -f %s/pmix*", orte_process_info.tmpdir_base);
+    asprintf(&legacy, "rm -rf %s/pmix*", orte_process_info.tmpdir_base);
    system(legacy);
    free(legacy);