From bc7a7f3de557ee568d782288efce6b9cf2b7e46e Mon Sep 17 00:00:00 2001
From: Ralph Castain <rhc@open-mpi.org>
Date: Fri, 22 May 2015 17:29:06 -0700
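Subject: [PATCH] Fix abnormal shutdown when a node dies

Add a "node-died" help message and have the HNP errmgr display it,
naming the affected node, when it records the first daemon failure and
marks the daemon job as ORTE_JOB_STATE_COMM_FAILED.

In the orted errmgr, when another daemon exits while static ports are
in use and the HNP has not ordered termination, send the HNP an
ORTE_PLM_UPDATE_PROC_STATE alert carrying that daemon's state, since
the HNP may not see the termination itself. The "are my children and
routes gone" self-termination check now runs only once daemon
termination has been ordered.

Also remove the early return from mca_oob_tcp_recv_handler() that was
taken whenever orte_abnormal_term_ordered was set.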

---
 orte/mca/errmgr/base/help-errmgr-base.txt     | 11 +++
 .../errmgr/default_hnp/errmgr_default_hnp.c   |  4 +
 .../default_orted/errmgr_default_orted.c      | 96 ++++++++++++++-----
 orte/mca/oob/tcp/oob_tcp_sendrecv.c           |  4 -
 4 files changed, 88 insertions(+), 27 deletions(-)

diff --git a/orte/mca/errmgr/base/help-errmgr-base.txt b/orte/mca/errmgr/base/help-errmgr-base.txt
index bdf0c40b9d..cef3e78c07 100644
--- a/orte/mca/errmgr/base/help-errmgr-base.txt
+++ b/orte/mca/errmgr/base/help-errmgr-base.txt
@@ -59,3 +59,14 @@ of factors, including an inability to create a connection back
 to mpirun due to a lack of common network interfaces and/or no
 route found between them. Please check network connectivity
 (including firewalls and network routing requirements).
+#
+[node-died]
+ORTE has lost communication with its daemon located on node:
+
+  hostname:  %s
+
+This is usually due to either a failure of the TCP network
+connection to the node, or possibly an internal failure of
+the daemon itself. We cannot recover from this failure, and
+therefore will terminate the job.
+
diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
index a48fcbdaa5..9b2525eec7 100644
--- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
+++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
@@ -361,12 +361,16 @@ static void proc_errors(int fd, short args, void *cbdata)
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
         /* record the first one to fail */
         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
+            /* output an error message so the user knows what happened */
+            orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
+            /* mark the daemon job as failed */
             jdata->state = ORTE_JOB_STATE_COMM_FAILED;
             /* point to the lowest rank to cause the problem */
             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
             /* retain the object so it doesn't get free'd */
             OBJ_RETAIN(pptr);
             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
+            /* update our exit code */
             ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
         }
         /* abort the system */
diff --git a/orte/mca/errmgr/default_orted/errmgr_default_orted.c b/orte/mca/errmgr/default_orted/errmgr_default_orted.c
index 7495e80db5..1da802588c 100644
--- a/orte/mca/errmgr/default_orted/errmgr_default_orted.c
+++ b/orte/mca/errmgr/default_orted/errmgr_default_orted.c
@@ -300,31 +300,81 @@ static void proc_errors(int fd, short args, void *cbdata)
                              "%s errmgr:default:orted daemon %s exited",
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                              ORTE_NAME_PRINT(proc)));
-        /* are any of my children still alive */
-        for (i=0; i < orte_local_children->size; i++) {
-            if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
-                if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
-                    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
-                                         "%s errmgr:default:orted[%s(%d)] proc %s is alive",
-                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                         __FILE__, __LINE__,
-                                         ORTE_NAME_PRINT(&child->name)));
-                    goto cleanup;
+        /* if we are using static ports, then it is possible that the HNP
+         * will not see this termination. So if the HNP didn't order us
+         * to terminate, then we should ensure it knows */
+        if (orte_static_ports && !orte_orteds_term_ordered) {
+            /* send an alert to the HNP */
+            alert = OBJ_NEW(opal_buffer_t);
+            /* pack update state command */
+            cmd = ORTE_PLM_UPDATE_PROC_STATE;
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
+                ORTE_ERROR_LOG(rc);
+                return;
+            }
+            /* get the proc_t */
+            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
+                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
+                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+                goto cleanup;
+            }
+            /* pack only the data for this daemon - have to start with the jobid
+             * so the receiver can unpack it correctly
+             */
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
+                ORTE_ERROR_LOG(rc);
+                return;
+            }
+
+            /* now pack the daemon's info */
+            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
+                ORTE_ERROR_LOG(rc);
+                return;
+            }
+            /* send it */
+            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
+                                 "%s errmgr:default_orted reporting lost connection to daemon %s",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(proc)));
+            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
+                                                  ORTE_RML_TAG_PLM,
+                                                  orte_rml_send_callback, NULL))) {
+                ORTE_ERROR_LOG(rc);
+                OBJ_RELEASE(alert);
+            }
+            /* mark that we notified the HNP for this job so we don't do it again */
+            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
+            /* continue on */
+            goto cleanup;
+        }
+
+        if (orte_orteds_term_ordered) {
+            /* are any of my children still alive */
+            for (i=0; i < orte_local_children->size; i++) {
+                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
+                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
+                        OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
+                                             "%s errmgr:default:orted[%s(%d)] proc %s is alive",
+                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                             __FILE__, __LINE__,
+                                             ORTE_NAME_PRINT(&child->name)));
+                        goto cleanup;
+                    }
                 }
             }
-        }
-        /* if all my routes and children are gone, then terminate
-           ourselves nicely (i.e., this is a normal termination) */
-        if (0 == orte_routed.num_routes()) {
-            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
-                                 "%s errmgr:default:orted all routes gone - exiting",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
-        } else {
-            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
-                                 "%s errmgr:default:orted not exiting, num_routes() == %d",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                 (int)orte_routed.num_routes()));
+            /* if all my routes and children are gone, then terminate
+               ourselves nicely (i.e., this is a normal termination) */
+            if (0 == orte_routed.num_routes()) {
+                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
+                                     "%s errmgr:default:orted all routes gone - exiting",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
+            } else {
+                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
+                                     "%s errmgr:default:orted not exiting, num_routes() == %d",
+                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                     (int)orte_routed.num_routes()));
+            }
         }
         /* if not, then we can continue */
         goto cleanup;
diff --git a/orte/mca/oob/tcp/oob_tcp_sendrecv.c b/orte/mca/oob/tcp/oob_tcp_sendrecv.c
index f75827a7f3..35e72a702e 100644
--- a/orte/mca/oob/tcp/oob_tcp_sendrecv.c
+++ b/orte/mca/oob/tcp/oob_tcp_sendrecv.c
@@ -431,10 +431,6 @@ void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
     bool timing_same_as_hdr = false;
 #endif
 
-    if (orte_abnormal_term_ordered) {
-        return;
-    }
-
     opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                         "%s:tcp:recv:handler called for peer %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),