1
1

orte dvm: debug fix for DVM early quit

Ensure that job errors do not cause the DVM to fail unless the failed job is the DVM itself.

Refs #2987, with improvements from Ralph

Signed-off-by: Thomas Naughton <naughtont@ornl.gov>
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
Thomas Naughton 2017-02-22 15:33:45 -05:00
родитель 57f6646cbe
Коммит beb5b250bf

Просмотреть файл

@ -2,14 +2,14 @@
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
@ -142,8 +142,6 @@ static void job_errors(int fd, short args, void *cbdata)
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
orte_job_t *jdata;
orte_job_state_t jobstate;
orte_exit_code_t sts;
orte_proc_t *aborted_proc;
opal_buffer_t *answer;
int32_t rc, ret;
int room, *rmptr;
@ -175,110 +173,67 @@ static void job_errors(int fd, short args, void *cbdata)
ORTE_JOBID_PRINT(jdata->jobid),
orte_job_state_to_str(jobstate)));
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
/* disable routing as we may not have performed the daemon
* wireup - e.g., in a managed environment, all the daemons
* "phone home", but don't actually wireup into the routed
* network until they receive the launch message
*/
orte_routing_is_enabled = false;
jdata->num_terminated = jdata->num_procs;
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
/* if it was a dynamic spawn, then we better tell them this didn't work */
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
rc = jobstate;
answer = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(caddy);
return;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(caddy);
return;
}
/* pack the room number */
rmptr = &room;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(caddy);
return;
}
}
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm sending dyn error release of job %s to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&jdata->originator)));
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
}
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
/* if the daemon job aborted and we haven't heard from everyone yet,
* then this could well have been caused by a daemon not finding
* a way back to us. In this case, output a message indicating a daemon
* died without reporting. Otherwise, say nothing as we
* likely already output an error message */
if (ORTE_JOB_STATE_ABORTED == jobstate &&
jdata->num_procs != jdata->num_reported) {
orte_routing_is_enabled = false;
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
}
/* there really isn't much else we can do since the problem
* is in the DVM itself, so best just to terminate */
jdata->num_terminated = jdata->num_procs;
/* activate the terminated state so we can exit */
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
OBJ_RELEASE(caddy);
return;
}
if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die
*/
aborted_proc = NULL;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
sts = aborted_proc->exit_code;
if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
if (WIFSIGNALED(sts)) { /* died on signal */
#ifdef WCOREDUMP
if (WCOREDUMP(sts)) {
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
WTERMSIG(sts));
sts = WTERMSIG(sts);
} else {
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
WTERMSIG(sts));
sts = WTERMSIG(sts);
}
#else
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
WTERMSIG(sts));
sts = WTERMSIG(sts);
#endif /* WCOREDUMP */
} else {
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
WEXITSTATUS(sts));
sts = WEXITSTATUS(sts);
}
}
}
/* if this is the daemon job, then we need to ensure we
* output an error message indicating we couldn't launch the
* daemons */
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
/* all other cases involve jobs submitted to the DVM - therefore,
* we only inform the submitter of the problem, but do NOT terminate
* the DVM itself */
rc = jobstate;
answer = OBJ_NEW(opal_buffer_t);
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(caddy);
return;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(caddy);
return;
}
/* pack the room number */
rmptr = &room;
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(caddy);
return;
}
}
/* if the daemon job aborted and we haven't heard from everyone yet,
* then this could well have been caused by a daemon not finding
* a way back to us. In this case, output a message indicating a daemon
* died without reporting. Otherwise, say nothing as we
* likely already output an error message */
if (ORTE_JOB_STATE_ABORTED == jobstate &&
jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
jdata->num_procs != jdata->num_reported) {
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s errmgr:dvm sending notification of job %s failure to %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid),
ORTE_NAME_PRINT(&jdata->originator)));
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
&jdata->originator, answer,
ORTE_RML_TAG_LAUNCH_RESP,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
}
/* ensure we terminate any processes left running in the DVM */
_terminate_job(jdata->jobid);
/* cleanup */
OBJ_RELEASE(caddy);
}