orte dvm: debug fix for DVM early quit

Ensure that job errors do not cause the DVM to fail unless the failed job is the DVM itself. Refs #2987, with improvements from Ralph Signed-off-by: Thomas Naughton <naughtont@ornl.gov> Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-02-22 15:33:45 -05:00 · 2017-02-22 15:33:45 -05:00 · beb5b250bf
--- a/orte/mca/errmgr/dvm/errmgr_dvm.c
+++ b/orte/mca/errmgr/dvm/errmgr_dvm.c
@ -2,14 +2,14 @@
 * Copyright (c) 2009-2011 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
- * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
+ * Copyright (c) 2010-2017 Oak Ridge National Labs.  All rights reserved.
 * Copyright (c) 2004-2011 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2011      Oracle and/or all its affiliates.  All rights reserved.
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
 *                         All rights reserved.
- * Copyright (c) 2014-2016 Intel, Inc.  All rights reserved.
+ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 * Copyright (c) 2017      IBM Corporation.  All rights reserved.
 * $COPYRIGHT$
 *
@ -142,8 +142,6 @@ static void job_errors(int fd, short args, void *cbdata)
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_job_state_t jobstate;
-    orte_exit_code_t sts;
-    orte_proc_t *aborted_proc;
    opal_buffer_t *answer;
    int32_t rc, ret;
    int room, *rmptr;
@ -175,110 +173,67 @@ static void job_errors(int fd, short args, void *cbdata)
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

-    if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
-        ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
-        ORTE_JOB_STATE_MAP_FAILED == jobstate ||
-        ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
-        /* disable routing as we may not have performed the daemon
-         * wireup - e.g., in a managed environment, all the daemons
-         * "phone home", but don't actually wireup into the routed
-         * network until they receive the launch message
-         */
-        orte_routing_is_enabled = false;
-        jdata->num_terminated = jdata->num_procs;
-        ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
-        /* if it was a dynamic spawn, then we better tell them this didn't work */
-        if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
-            rc = jobstate;
-            answer = OBJ_NEW(opal_buffer_t);
-            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
-                ORTE_ERROR_LOG(ret);
-                OBJ_RELEASE(caddy);
-                return;
-            }
-            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
-                ORTE_ERROR_LOG(ret);
-                OBJ_RELEASE(caddy);
-                return;
-            }
-            /* pack the room number */
-            rmptr = &room;
-            if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
-                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
-                    ORTE_ERROR_LOG(ret);
-                    OBJ_RELEASE(caddy);
-                    return;
-                }
-            }
-            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
-                                 "%s errmgr:dvm sending dyn error release of job %s to %s",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                 ORTE_JOBID_PRINT(jdata->jobid),
-                                 ORTE_NAME_PRINT(&jdata->originator)));
-            if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
-                                                           &jdata->originator, answer,
-                                                           ORTE_RML_TAG_LAUNCH_RESP,
-                                                           orte_rml_send_callback, NULL))) {
-                ORTE_ERROR_LOG(ret);
-                OBJ_RELEASE(answer);
-            }
+    if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
+        /* if the daemon job aborted and we haven't heard from everyone yet,
+         * then this could well have been caused by a daemon not finding
+         * a way back to us. In this case, output a message indicating a daemon
+         * died without reporting. Otherwise, say nothing as we
+         * likely already output an error message */
+        if (ORTE_JOB_STATE_ABORTED == jobstate &&
+            jdata->num_procs != jdata->num_reported) {
+            orte_routing_is_enabled = false;
+            orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
        }
+        /* there really isn't much else we can do since the problem
+         * is in the DVM itself, so best just to terminate */
+        jdata->num_terminated = jdata->num_procs;
+        /* activate the terminated state so we can exit */
+        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        OBJ_RELEASE(caddy);
        return;
    }

-    if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
-        ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
-        /* the job object for this job will have been NULL'd
-         * in the array if the job was solely local. If it isn't
-         * NULL, then we need to tell everyone else to die
-         */
-        aborted_proc = NULL;
-        if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
-            sts = aborted_proc->exit_code;
-            if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
-                if (WIFSIGNALED(sts)) { /* died on signal */
-#ifdef WCOREDUMP
-                    if (WCOREDUMP(sts)) {
-                        orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
-                                       WTERMSIG(sts));
-                        sts = WTERMSIG(sts);
-                    } else {
-                        orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
-                                       WTERMSIG(sts));
-                        sts = WTERMSIG(sts);
-                    }
-#else
-                    orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
-                                   WTERMSIG(sts));
-                    sts = WTERMSIG(sts);
-#endif /* WCOREDUMP */
-                } else {
-                    orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
-                                   WEXITSTATUS(sts));
-                    sts = WEXITSTATUS(sts);
-                }
-            }
-        }
-        /* if this is the daemon job, then we need to ensure we
-         * output an error message indicating we couldn't launch the
-         * daemons */
-        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
-            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
+    /* all other cases involve jobs submitted to the DVM - therefore,
+     * we only inform the submitter of the problem, but do NOT terminate
+     * the DVM itself */
+
+    rc = jobstate;
+    answer = OBJ_NEW(opal_buffer_t);
+    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
+        ORTE_ERROR_LOG(ret);
+        OBJ_RELEASE(caddy);
+        return;
+    }
+    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
+        ORTE_ERROR_LOG(ret);
+        OBJ_RELEASE(caddy);
+        return;
+    }
+    /* pack the room number */
+    rmptr = &room;
+    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
+        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
+            ORTE_ERROR_LOG(ret);
+            OBJ_RELEASE(caddy);
+            return;
        }
    }
-
-    /* if the daemon job aborted and we haven't heard from everyone yet,
-     * then this could well have been caused by a daemon not finding
-     * a way back to us. In this case, output a message indicating a daemon
-     * died without reporting. Otherwise, say nothing as we
-     * likely already output an error message */
-    if (ORTE_JOB_STATE_ABORTED == jobstate &&
-        jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
-        jdata->num_procs != jdata->num_reported) {
-        orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
+    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
+                         "%s errmgr:dvm sending notification of job %s failure to %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_JOBID_PRINT(jdata->jobid),
+                         ORTE_NAME_PRINT(&jdata->originator)));
+    if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
+                                                   &jdata->originator, answer,
+                                                   ORTE_RML_TAG_LAUNCH_RESP,
+                                                   orte_rml_send_callback, NULL))) {
+        ORTE_ERROR_LOG(ret);
+        OBJ_RELEASE(answer);
    }
+    /* ensure we terminate any processes left running in the DVM */
+    _terminate_job(jdata->jobid);

+    /* cleanup */
    OBJ_RELEASE(caddy);
 }