Deal with the corner case where we encounter an error when attempting to launch a daemon. In this case, we will order abnormal termination before the daemons call back to us, and thus any attempt to send them a "die" message will fail. Ensure that mpirun at least exits cleanly in this scenario, thereby allowing the remote daemons that did get launched to commit suicide when comm fails.
cmr=v1.7.5:reviewer=jsquyres This commit was SVN r31068.
Этот коммит содержится в:
родитель
8e8154645b
Коммит
fbc5e3b773
@ -9,6 +9,7 @@
|
||||
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -294,6 +295,10 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
|
||||
proct->alive && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
|
||||
/* at least one is still alive */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: at least one proc (%s) still alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proct->name)));
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
@ -302,7 +307,12 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
"%s errmgr_hnp: all routes and children gone - ordering exit",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
"%s Comm failure: %d routes remain alive",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int)orte_routed.num_routes()));
|
||||
}
|
||||
goto cleanup;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
|
||||
@ -528,6 +538,13 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
"%s errmgr:hnp: unable to send message to proc %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
/* if this proc is one of my daemons, then we are truly
|
||||
* hosed - so just exit out
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
break;
|
||||
}
|
||||
/* kill all jobs */
|
||||
default_hnp_abort(jdata);
|
||||
break;
|
||||
|
@ -12,7 +12,7 @@
|
||||
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -114,6 +114,12 @@ static int rte_init(void)
|
||||
#endif
|
||||
|
||||
if (ORTE_PROC_IS_DAEMON) { /* I am a daemon, launched by mpirun */
|
||||
/* ensure that we always exit with a non-zero status
|
||||
* so that Slurm and other such RMs will terminate the
|
||||
* job if any daemon exits, whether normal termination or not
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
|
||||
/* we had to be given a jobid */
|
||||
if (NULL == orte_ess_base_jobid) {
|
||||
error = "missing jobid";
|
||||
|
@ -103,16 +103,21 @@ typedef void (*mca_oob_send_callback_fn_t)(int status,
|
||||
int count, void *cbdata);
|
||||
|
||||
ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
|
||||
#define ORTE_OOB_SEND(m) \
|
||||
do { \
|
||||
orte_oob_send_t *cd; \
|
||||
cd = OBJ_NEW(orte_oob_send_t); \
|
||||
cd->msg = (m); \
|
||||
opal_event_set(orte_event_base, &cd->ev, -1, \
|
||||
OPAL_EV_WRITE, \
|
||||
orte_oob_base_send_nb, cd); \
|
||||
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
|
||||
#define ORTE_OOB_SEND(m) \
|
||||
do { \
|
||||
orte_oob_send_t *cd; \
|
||||
opal_output_verbose(1, \
|
||||
orte_oob_base_framework.framework_output, \
|
||||
"%s OOB_SEND: %s:%d", \
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
|
||||
__FILE__, __LINE__); \
|
||||
cd = OBJ_NEW(orte_oob_send_t); \
|
||||
cd->msg = (m); \
|
||||
opal_event_set(orte_event_base, &cd->ev, -1, \
|
||||
OPAL_EV_WRITE, \
|
||||
orte_oob_base_send_nb, cd); \
|
||||
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
|
||||
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
|
||||
}while(0);
|
||||
|
||||
/* Our contact info is actually subject to change as transports
|
||||
|
@ -11,6 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -77,6 +78,11 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
|
||||
"%s plm:base:orted_cmd sending orted_exit commands",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
if (orte_orteds_term_ordered) {
|
||||
/* only do this once */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* flag that orteds are being terminated */
|
||||
orte_orteds_term_ordered = true;
|
||||
|
||||
|
@ -301,6 +301,10 @@ static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
|
||||
/* increment the #daemons terminated so we will exit properly */
|
||||
jdata->num_terminated++;
|
||||
/* remove it from the routing table to ensure num_routes
|
||||
* returns the correct value
|
||||
*/
|
||||
orte_routed.route_lost(&daemon->name);
|
||||
/* report that the daemon has failed so we can exit */
|
||||
ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
|
||||
}
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,6 +30,7 @@
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
@ -62,6 +64,10 @@ void orte_routed_base_xcast_routing(orte_grpcomm_collective_t *coll,
|
||||
opal_list_append(&coll->targets, &nm->super);
|
||||
}
|
||||
}
|
||||
/* if nobody is known alive, then we need to die */
|
||||
if (0 == opal_list_get_size(&coll->targets)) {
|
||||
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
|
||||
}
|
||||
} else {
|
||||
/* the xcast always goes to our children */
|
||||
for (item = opal_list_get_first(my_children);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user