* Before this commit, if we called ompi_mpi_abort() before MPI_INIT
  completed successfully, Bad Things(tm) could happen.
* Now we explicitly check orte_initialized (a new global in ORTE indicating whether we are between orte_init() and orte_finalize() or not), and if so, react accordingly.
* If ORTE is initialized, use orte_system_info.nodename; otherwise, use gethostname().
* Add loop protection to ensure that ompi_mpi_abort() is not invoked multiple times recursively.

This commit was SVN r13354.
This commit is contained in:
parent
a45e8bea05
commit
e90b3e415b
@ -35,13 +35,16 @@
|
||||
#include "opal/mca/backtrace/backtrace.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/params.h"
|
||||
#include "orte/mca/ns/ns.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/sys_info.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "ompi/runtime/mpiruntime.h"
|
||||
#include "ompi/runtime/params.h"
|
||||
|
||||
static bool have_been_invoked = false;
|
||||
|
||||
int
|
||||
ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
@ -49,28 +52,37 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
bool kill_remote_of_intercomm)
|
||||
{
|
||||
int count = 0, i, ret = OMPI_SUCCESS;
|
||||
char hostname[MAXHOSTNAMELEN];
|
||||
char *host, hostname[MAXHOSTNAMELEN];
|
||||
pid_t pid = 0;
|
||||
orte_process_name_t *abort_procs;
|
||||
orte_std_cntr_t nabort_procs;
|
||||
|
||||
|
||||
|
||||
/* Protection for recursive invocation */
|
||||
if (have_been_invoked) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
have_been_invoked = true;
|
||||
|
||||
/* If ORTE is initialized, use its nodename. Otherwise, call
|
||||
gethostname. */
|
||||
|
||||
if (orte_initialized) {
|
||||
host = orte_system_info.nodename;
|
||||
} else {
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
host = hostname;
|
||||
}
|
||||
pid = getpid();
|
||||
|
||||
/* Corner case: if we're being called as a result of the
|
||||
OMPI_ERR_INIT_FINALIZE macro (meaning that this is before
|
||||
MPI_INIT or after MPI_FINALIZE), then just abort -- nothing MPI or
|
||||
ORTE has been setup yet. */
|
||||
|
||||
if (!ompi_mpi_initialized || ompi_mpi_finalized) {
|
||||
orte_errmgr.error_detected(errcode, NULL);
|
||||
}
|
||||
|
||||
/* If we're going to print anything, get the hostname and PID of
|
||||
this process */
|
||||
|
||||
if (ompi_mpi_abort_print_stack ||
|
||||
0 != ompi_mpi_abort_delay) {
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
pid = getpid();
|
||||
if (orte_initialized) {
|
||||
orte_errmgr.error_detected(errcode, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Should we print a stack trace? */
|
||||
@ -81,7 +93,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
|
||||
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||
for (i = 0; i < len; ++i) {
|
||||
fprintf(stderr, "[%s:%d] [%d] func:%s\n", hostname, (int) pid,
|
||||
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
|
||||
i, messages[i]);
|
||||
fflush(stderr);
|
||||
}
|
||||
@ -99,20 +111,29 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
if (0 != ompi_mpi_abort_delay) {
|
||||
if (ompi_mpi_abort_delay < 0) {
|
||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
|
||||
hostname, (int) pid);
|
||||
host, (int) pid);
|
||||
fflush(stderr);
|
||||
while (1) {
|
||||
sleep(5);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||
hostname, (int) pid, ompi_mpi_abort_delay);
|
||||
host, (int) pid, ompi_mpi_abort_delay);
|
||||
do {
|
||||
sleep(1);
|
||||
} while (--ompi_mpi_abort_delay > 0);
|
||||
}
|
||||
}
|
||||
|
||||
/* If ORTE isn't setup yet, then don't even try killing everyone.
|
||||
Sorry, Charlie... */
|
||||
|
||||
if (!orte_initialized) {
|
||||
fprintf(stderr, "[%s:%d] Abort before MPI_INIT completed successfully; not able to guarantee that all other processes were killed!\n",
|
||||
host, (int) pid);
|
||||
exit(errcode);
|
||||
}
|
||||
|
||||
/* abort local procs in the communicator. If the communicator is
|
||||
an intercommunicator AND the abort has explicitly requested
|
||||
that we abort the remote procs, then do that as well. */
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -21,6 +22,7 @@
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/runtime/params.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
@ -34,6 +36,9 @@
|
||||
*/
|
||||
int orte_finalize(void)
|
||||
{
|
||||
if (!orte_initialized) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* We have now entered the finalization stage */
|
||||
orte_universe_info.state = ORTE_UNIVERSE_STATE_FINALIZE;
|
||||
@ -43,7 +48,8 @@ int orte_finalize(void)
|
||||
|
||||
/* finalize the opal utilities */
|
||||
opal_finalize();
|
||||
|
||||
|
||||
orte_initialized = false;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -22,6 +23,7 @@
|
||||
|
||||
#include "orte/orte_constants.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/runtime/params.h"
|
||||
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
@ -37,6 +39,10 @@ int orte_init(bool infrastructure)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (orte_initialized) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = opal_init())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
@ -49,6 +55,7 @@ int orte_init(bool infrastructure)
|
||||
|
||||
/* Since we are now finished with init, change the state to running */
|
||||
orte_universe_info.state = ORTE_UNIVERSE_STATE_RUNNING;
|
||||
orte_initialized = true;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -32,6 +32,10 @@
|
||||
/* globals used by RTE */
|
||||
int orte_debug_flag;
|
||||
struct timeval orte_abort_timeout;
|
||||
/*
|
||||
* Whether we have completed orte_init or not
|
||||
*/
|
||||
bool orte_initialized = false;
|
||||
|
||||
int orte_register_params(bool infrastructure)
|
||||
{
|
||||
|
@ -10,6 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -42,6 +43,11 @@ ORTE_DECLSPEC extern int orte_debug_flag;
|
||||
|
||||
ORTE_DECLSPEC extern struct timeval orte_abort_timeout;
|
||||
|
||||
/**
|
||||
* Whether ORTE is initialized or not
|
||||
*/
|
||||
ORTE_DECLSPEC extern bool orte_initialized;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user