1
1

Deal with the corner case where we encounter an error when attempting to launch a daemon. In this case, we order abnormal termination before the daemons have called back to us, and thus any attempt to send them a "die" message will fail. Ensure that mpirun at least exits cleanly in this scenario, thereby allowing the remote daemons that did get launched to commit suicide when communication fails.

cmr=v1.7.5:reviewer=jsquyres

This commit was SVN r31068.
Этот коммит содержится в:
Ralph Castain 2014-03-14 15:32:30 +00:00
родитель 8e8154645b
Коммит fbc5e3b773
6 изменённых файлов: 56 добавлений и 12 удалений

Просмотреть файл

@ -9,6 +9,7 @@
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -294,6 +295,10 @@ static void proc_errors(int fd, short args, void *cbdata)
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
proct->alive && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
/* at least one is still alive */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: at least one proc (%s) still alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proct->name)));
goto cleanup;
}
}
@ -302,7 +307,12 @@ static void proc_errors(int fd, short args, void *cbdata)
"%s errmgr_hnp: all routes and children gone - ordering exit",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
} else {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
"%s Comm failure: %d routes remain alive",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int)orte_routed.num_routes()));
}
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
@ -528,6 +538,13 @@ static void proc_errors(int fd, short args, void *cbdata)
"%s errmgr:hnp: unable to send message to proc %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
/* if this proc is one of my daemons, then we are truly
* hosed - so just exit out
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
break;
}
/* kill all jobs */
default_hnp_abort(jdata);
break;

Просмотреть файл

@ -12,7 +12,7 @@
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -114,6 +114,12 @@ static int rte_init(void)
#endif
if (ORTE_PROC_IS_DAEMON) { /* I am a daemon, launched by mpirun */
/* ensure that we always exit with a non-zero status
* so that Slurm and other such RMs will terminate the
* job if any daemon exits, whether normal termination or not
*/
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
/* we had to be given a jobid */
if (NULL == orte_ess_base_jobid) {
error = "missing jobid";

Просмотреть файл

@ -103,16 +103,21 @@ typedef void (*mca_oob_send_callback_fn_t)(int status,
int count, void *cbdata);
ORTE_DECLSPEC void orte_oob_base_send_nb(int fd, short args, void *cbdata);
#define ORTE_OOB_SEND(m) \
do { \
orte_oob_send_t *cd; \
cd = OBJ_NEW(orte_oob_send_t); \
cd->msg = (m); \
opal_event_set(orte_event_base, &cd->ev, -1, \
OPAL_EV_WRITE, \
orte_oob_base_send_nb, cd); \
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
/*
 * ORTE_OOB_SEND(m): hand message `m` to the OOB send path through the
 * event library.
 *
 * The expansion:
 *   1. emits a verbosity-1 trace on the OOB framework output stream that
 *      records the calling __FILE__ and __LINE__;
 *   2. creates an orte_oob_send_t caddy with OBJ_NEW and stores `m` in its
 *      `msg` field;
 *   3. sets up the caddy's event on orte_event_base with
 *      orte_oob_base_send_nb as the callback and the caddy as cbdata,
 *      assigns it priority ORTE_MSG_PRI, and activates it.
 *
 * `m` is evaluated exactly once.
 *
 * NOTE(review): the result of OBJ_NEW is dereferenced without a NULL
 * check - confirm whether OBJ_NEW can fail in this code base.
 * NOTE(review): the expansion ends in `while(0);`, so a call written as
 * `ORTE_OOB_SEND(x);` yields an extra empty statement; that breaks use as
 * the body of an unbraced `if` that has an `else`.
 */
#define ORTE_OOB_SEND(m) \
do { \
orte_oob_send_t *cd; \
opal_output_verbose(1, \
orte_oob_base_framework.framework_output, \
"%s OOB_SEND: %s:%d", \
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
__FILE__, __LINE__); \
cd = OBJ_NEW(orte_oob_send_t); \
cd->msg = (m); \
opal_event_set(orte_event_base, &cd->ev, -1, \
OPAL_EV_WRITE, \
orte_oob_base_send_nb, cd); \
opal_event_set_priority(&cd->ev, ORTE_MSG_PRI); \
opal_event_active(&cd->ev, OPAL_EV_WRITE, 1); \
}while(0);
/* Our contact info is actually subject to change as transports

Просмотреть файл

@ -11,6 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -77,6 +78,11 @@ int orte_plm_base_orted_exit(orte_daemon_cmd_flag_t command)
"%s plm:base:orted_cmd sending orted_exit commands",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
if (orte_orteds_term_ordered) {
/* only do this once */
return ORTE_SUCCESS;
}
/* flag that orteds are being terminated */
orte_orteds_term_ordered = true;

Просмотреть файл

@ -301,6 +301,10 @@ static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
/* increment the #daemons terminated so we will exit properly */
jdata->num_terminated++;
/* remove it from the routing table to ensure num_routes
* returns the correct value
*/
orte_routed.route_lost(&daemon->name);
/* report that the daemon has failed so we can exit */
ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
}

Просмотреть файл

@ -12,6 +12,7 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -29,6 +30,7 @@
#include "orte/mca/ess/ess.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/state/state.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
@ -62,6 +64,10 @@ void orte_routed_base_xcast_routing(orte_grpcomm_collective_t *coll,
opal_list_append(&coll->targets, &nm->super);
}
}
/* if nobody is known alive, then we need to die */
if (0 == opal_list_get_size(&coll->targets)) {
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
}
} else {
/* the xcast always goes to our children */
for (item = opal_list_get_first(my_children);