1
1

 * Before this commit, if we called ompi_mpi_abort() before MPI_INIT
   completed successfully, Bad Things(tm) could happen.
 * Now we explicitly check orte_initialized (a new global in ORTE
   indicating whether we are between orte_init() and orte_finalize()
   or not), and if so, react accordingly.
 * If ORTE is initialized, use orte_system_info.nodename; otherwise,
   use gethostname().
 * Add loop protection to ensure that ompi_mpi_abort() is not invoked
   multiple times recursively.

This commit was SVN r13354.
This commit is contained in:
Jeff Squyres 2007-01-29 22:01:28 +00:00
parent a45e8bea05
commit e90b3e415b
5 changed files with 61 additions and 17 deletions

View File

@ -35,13 +35,16 @@
#include "opal/mca/backtrace/backtrace.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"
#include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/sys_info.h"
#include "ompi/communicator/communicator.h"
#include "ompi/proc/proc.h"
#include "ompi/runtime/mpiruntime.h"
#include "ompi/runtime/params.h"
static bool have_been_invoked = false;
int
ompi_mpi_abort(struct ompi_communicator_t* comm,
@ -49,28 +52,37 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
bool kill_remote_of_intercomm)
{
int count = 0, i, ret = OMPI_SUCCESS;
char hostname[MAXHOSTNAMELEN];
char *host, hostname[MAXHOSTNAMELEN];
pid_t pid = 0;
orte_process_name_t *abort_procs;
orte_std_cntr_t nabort_procs;
/* Protection for recursive invocation */
if (have_been_invoked) {
return OMPI_SUCCESS;
}
have_been_invoked = true;
/* If ORTE is initialized, use its nodename. Otherwise, call
gethostname. */
if (orte_initialized) {
host = orte_system_info.nodename;
} else {
gethostname(hostname, sizeof(hostname));
host = hostname;
}
pid = getpid();
/* Corner case: if we're being called as a result of the
OMPI_ERR_INIT_FINALIZE macro (meaning that this is before
MPI_INIT or after MPI_FINALIZE), then just abort nothing MPI or
ORTE has been setup yet. */
if (!ompi_mpi_initialized || ompi_mpi_finalized) {
orte_errmgr.error_detected(errcode, NULL);
}
/* If we're going to print anything, get the hostname and PID of
this process */
if (ompi_mpi_abort_print_stack ||
0 != ompi_mpi_abort_delay) {
gethostname(hostname, sizeof(hostname));
pid = getpid();
if (orte_initialized) {
orte_errmgr.error_detected(errcode, NULL);
}
}
/* Should we print a stack trace? */
@ -81,7 +93,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
for (i = 0; i < len; ++i) {
fprintf(stderr, "[%s:%d] [%d] func:%s\n", hostname, (int) pid,
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
i, messages[i]);
fflush(stderr);
}
@ -99,20 +111,29 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
if (0 != ompi_mpi_abort_delay) {
if (ompi_mpi_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
hostname, (int) pid);
host, (int) pid);
fflush(stderr);
while (1) {
sleep(5);
}
} else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
hostname, (int) pid, ompi_mpi_abort_delay);
host, (int) pid, ompi_mpi_abort_delay);
do {
sleep(1);
} while (--ompi_mpi_abort_delay > 0);
}
}
/* If ORTE isn't setup yet, then don't even try killing everyone.
Sorry, Charlie... */
if (!orte_initialized) {
fprintf(stderr, "[%s:%d] Abort before MPI_INIT completed successfully; not able to guarantee that all other processes were killed!\n",
host, (int) pid);
exit(errcode);
}
/* abort local procs in the communicator. If the communicator is
an intercommunicator AND the abort has explicitly requested
that we abort the remote procs, then do that as well. */

View File

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -21,6 +22,7 @@
#include "orte_config.h"
#include "orte/orte_constants.h"
#include "orte/runtime/params.h"
#include "opal/runtime/opal.h"
#include "orte/runtime/runtime.h"
@ -34,6 +36,9 @@
*/
int orte_finalize(void)
{
if (!orte_initialized) {
return ORTE_SUCCESS;
}
/* We have now entered the finalization stage */
orte_universe_info.state = ORTE_UNIVERSE_STATE_FINALIZE;
@ -43,7 +48,8 @@ int orte_finalize(void)
/* finalize the opal utilities */
opal_finalize();
orte_initialized = false;
return ORTE_SUCCESS;
}

View File

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -22,6 +23,7 @@
#include "orte/orte_constants.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/params.h"
#include "opal/runtime/opal.h"
#include "orte/runtime/runtime.h"
@ -37,6 +39,10 @@ int orte_init(bool infrastructure)
{
int rc;
if (orte_initialized) {
return ORTE_SUCCESS;
}
if (ORTE_SUCCESS != (rc = opal_init())) {
ORTE_ERROR_LOG(rc);
return rc;
@ -49,6 +55,7 @@ int orte_init(bool infrastructure)
/* Since we are now finished with init, change the state to running */
orte_universe_info.state = ORTE_UNIVERSE_STATE_RUNNING;
orte_initialized = true;
return ORTE_SUCCESS;
}

View File

@ -32,6 +32,10 @@
/* globals used by RTE */
int orte_debug_flag;
struct timeval orte_abort_timeout;
/*
* Whether we have completed orte_init or not
*/
bool orte_initialized = false;
int orte_register_params(bool infrastructure)
{

View File

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -42,6 +43,11 @@ ORTE_DECLSPEC extern int orte_debug_flag;
ORTE_DECLSPEC extern struct timeval orte_abort_timeout;
/**
* Whether ORTE is initialized or not
*/
ORTE_DECLSPEC extern bool orte_initialized;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif