orte dvm: debug fix for DVM early quit
Ensure that job errors do not cause the DVM to fail unless the failed job is the DVM itself. Refs #2987, with improvements from Ralph Signed-off-by: Thomas Naughton <naughtont@ornl.gov> Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
57f6646cbe
Коммит
beb5b250bf
@ -2,14 +2,14 @@
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2010-2017 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -142,8 +142,6 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
orte_job_t *jdata;
|
||||
orte_job_state_t jobstate;
|
||||
orte_exit_code_t sts;
|
||||
orte_proc_t *aborted_proc;
|
||||
opal_buffer_t *answer;
|
||||
int32_t rc, ret;
|
||||
int room, *rmptr;
|
||||
@ -175,110 +173,67 @@ static void job_errors(int fd, short args, void *cbdata)
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
orte_job_state_to_str(jobstate)));
|
||||
|
||||
if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate ||
|
||||
ORTE_JOB_STATE_ALLOC_FAILED == jobstate ||
|
||||
ORTE_JOB_STATE_MAP_FAILED == jobstate ||
|
||||
ORTE_JOB_STATE_CANNOT_LAUNCH == jobstate) {
|
||||
/* disable routing as we may not have performed the daemon
|
||||
* wireup - e.g., in a managed environment, all the daemons
|
||||
* "phone home", but don't actually wireup into the routed
|
||||
* network until they receive the launch message
|
||||
*/
|
||||
orte_routing_is_enabled = false;
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
/* if it was a dynamic spawn, then we better tell them this didn't work */
|
||||
if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
|
||||
rc = jobstate;
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
/* pack the room number */
|
||||
rmptr = &room;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm sending dyn error release of job %s to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
ORTE_NAME_PRINT(&jdata->originator)));
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||
&jdata->originator, answer,
|
||||
ORTE_RML_TAG_LAUNCH_RESP,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
}
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* if the daemon job aborted and we haven't heard from everyone yet,
|
||||
* then this could well have been caused by a daemon not finding
|
||||
* a way back to us. In this case, output a message indicating a daemon
|
||||
* died without reporting. Otherwise, say nothing as we
|
||||
* likely already output an error message */
|
||||
if (ORTE_JOB_STATE_ABORTED == jobstate &&
|
||||
jdata->num_procs != jdata->num_reported) {
|
||||
orte_routing_is_enabled = false;
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
|
||||
}
|
||||
/* there really isn't much else we can do since the problem
|
||||
* is in the DVM itself, so best just to terminate */
|
||||
jdata->num_terminated = jdata->num_procs;
|
||||
/* activate the terminated state so we can exit */
|
||||
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ORTE_JOB_STATE_FAILED_TO_START == jobstate ||
|
||||
ORTE_JOB_STATE_FAILED_TO_LAUNCH == jobstate) {
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
* NULL, then we need to tell everyone else to die
|
||||
*/
|
||||
aborted_proc = NULL;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, (void**)&aborted_proc, OPAL_PTR)) {
|
||||
sts = aborted_proc->exit_code;
|
||||
if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
|
||||
if (WIFSIGNALED(sts)) { /* died on signal */
|
||||
#ifdef WCOREDUMP
|
||||
if (WCOREDUMP(sts)) {
|
||||
orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true,
|
||||
WTERMSIG(sts));
|
||||
sts = WTERMSIG(sts);
|
||||
} else {
|
||||
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
|
||||
WTERMSIG(sts));
|
||||
sts = WTERMSIG(sts);
|
||||
}
|
||||
#else
|
||||
orte_show_help("help-plm-base.txt", "daemon-died-signal", true,
|
||||
WTERMSIG(sts));
|
||||
sts = WTERMSIG(sts);
|
||||
#endif /* WCOREDUMP */
|
||||
} else {
|
||||
orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true,
|
||||
WEXITSTATUS(sts));
|
||||
sts = WEXITSTATUS(sts);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* if this is the daemon job, then we need to ensure we
|
||||
* output an error message indicating we couldn't launch the
|
||||
* daemons */
|
||||
if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
|
||||
/* all other cases involve jobs submitted to the DVM - therefore,
|
||||
* we only inform the submitter of the problem, but do NOT terminate
|
||||
* the DVM itself */
|
||||
|
||||
rc = jobstate;
|
||||
answer = OBJ_NEW(opal_buffer_t);
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jdata->jobid, 1, ORTE_JOBID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
/* pack the room number */
|
||||
rmptr = &room;
|
||||
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, (void**)&rmptr, OPAL_INT)) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &room, 1, OPAL_INT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* if the daemon job aborted and we haven't heard from everyone yet,
|
||||
* then this could well have been caused by a daemon not finding
|
||||
* a way back to us. In this case, output a message indicating a daemon
|
||||
* died without reporting. Otherwise, say nothing as we
|
||||
* likely already output an error message */
|
||||
if (ORTE_JOB_STATE_ABORTED == jobstate &&
|
||||
jdata->jobid == ORTE_PROC_MY_NAME->jobid &&
|
||||
jdata->num_procs != jdata->num_reported) {
|
||||
orte_show_help("help-errmgr-base.txt", "failed-daemon", true);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:dvm sending notification of job %s failure to %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jdata->jobid),
|
||||
ORTE_NAME_PRINT(&jdata->originator)));
|
||||
if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
|
||||
&jdata->originator, answer,
|
||||
ORTE_RML_TAG_LAUNCH_RESP,
|
||||
orte_rml_send_callback, NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
OBJ_RELEASE(answer);
|
||||
}
|
||||
/* ensure we terminate any processes left running in the DVM */
|
||||
_terminate_job(jdata->jobid);
|
||||
|
||||
/* cleanup */
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user