1
1

 * Before this commit, if we called ompi_mpi_abort() before MPI_INIT
   completed successfully, Bad Things(tm) could happen.
 * Now we explicitly check orte_initialized (a new global in ORTE
   indicating whether we are between orte_init() and orte_finalize()
   or not), and if so, react accordingly.
 * If ORTE is initialized, use orte_system_info.nodename; otherwise,
   use gethostname().
 * Add loop protection to ensure that ompi_mpi_abort() is not invoked
   multiple times recursively.

This commit was SVN r13354.
Этот коммит содержится в:
Jeff Squyres 2007-01-29 22:01:28 +00:00
родитель a45e8bea05
Коммит e90b3e415b
5 изменённых файлов: 61 добавлений и 17 удалений

Просмотреть файл

@ -35,13 +35,16 @@
#include "opal/mca/backtrace/backtrace.h" #include "opal/mca/backtrace/backtrace.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/params.h"
#include "orte/mca/ns/ns.h" #include "orte/mca/ns/ns.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/util/sys_info.h"
#include "ompi/communicator/communicator.h" #include "ompi/communicator/communicator.h"
#include "ompi/proc/proc.h" #include "ompi/proc/proc.h"
#include "ompi/runtime/mpiruntime.h" #include "ompi/runtime/mpiruntime.h"
#include "ompi/runtime/params.h" #include "ompi/runtime/params.h"
static bool have_been_invoked = false;
int int
ompi_mpi_abort(struct ompi_communicator_t* comm, ompi_mpi_abort(struct ompi_communicator_t* comm,
@ -49,28 +52,37 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
bool kill_remote_of_intercomm) bool kill_remote_of_intercomm)
{ {
int count = 0, i, ret = OMPI_SUCCESS; int count = 0, i, ret = OMPI_SUCCESS;
char hostname[MAXHOSTNAMELEN]; char *host, hostname[MAXHOSTNAMELEN];
pid_t pid = 0; pid_t pid = 0;
orte_process_name_t *abort_procs; orte_process_name_t *abort_procs;
orte_std_cntr_t nabort_procs; orte_std_cntr_t nabort_procs;
/* Protection for recursive invocation */
if (have_been_invoked) {
return OMPI_SUCCESS;
}
have_been_invoked = true;
/* If ORTE is initialized, use its nodename. Otherwise, call
gethostname. */
if (orte_initialized) {
host = orte_system_info.nodename;
} else {
gethostname(hostname, sizeof(hostname));
host = hostname;
}
pid = getpid();
/* Corner case: if we're being called as a result of the /* Corner case: if we're being called as a result of the
OMPI_ERR_INIT_FINALIZE macro (meaning that this is before OMPI_ERR_INIT_FINALIZE macro (meaning that this is before
MPI_INIT or after MPI_FINALIZE), then just abort nothing MPI or MPI_INIT or after MPI_FINALIZE), then just abort nothing MPI or
ORTE has been setup yet. */ ORTE has been setup yet. */
if (!ompi_mpi_initialized || ompi_mpi_finalized) { if (!ompi_mpi_initialized || ompi_mpi_finalized) {
orte_errmgr.error_detected(errcode, NULL); if (orte_initialized) {
} orte_errmgr.error_detected(errcode, NULL);
}
/* If we're going to print anything, get the hostname and PID of
this process */
if (ompi_mpi_abort_print_stack ||
0 != ompi_mpi_abort_delay) {
gethostname(hostname, sizeof(hostname));
pid = getpid();
} }
/* Should we print a stack trace? */ /* Should we print a stack trace? */
@ -81,7 +93,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) { if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
for (i = 0; i < len; ++i) { for (i = 0; i < len; ++i) {
fprintf(stderr, "[%s:%d] [%d] func:%s\n", hostname, (int) pid, fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
i, messages[i]); i, messages[i]);
fflush(stderr); fflush(stderr);
} }
@ -99,20 +111,29 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
if (0 != ompi_mpi_abort_delay) { if (0 != ompi_mpi_abort_delay) {
if (ompi_mpi_abort_delay < 0) { if (ompi_mpi_abort_delay < 0) {
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n", fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
hostname, (int) pid); host, (int) pid);
fflush(stderr); fflush(stderr);
while (1) { while (1) {
sleep(5); sleep(5);
} }
} else { } else {
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n", fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
hostname, (int) pid, ompi_mpi_abort_delay); host, (int) pid, ompi_mpi_abort_delay);
do { do {
sleep(1); sleep(1);
} while (--ompi_mpi_abort_delay > 0); } while (--ompi_mpi_abort_delay > 0);
} }
} }
/* If ORTE isn't setup yet, then don't even try killing everyone.
Sorry, Charlie... */
if (!orte_initialized) {
fprintf(stderr, "[%s:%d] Abort before MPI_INIT completed successfully; not able to guarantee that all other processes were killed!\n",
host, (int) pid);
exit(errcode);
}
/* abort local procs in the communicator. If the communicator is /* abort local procs in the communicator. If the communicator is
an intercommunicator AND the abort has explicitly requested an intercommunicator AND the abort has explicitly requested
that we abort the remote procs, then do that as well. */ that we abort the remote procs, then do that as well. */

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -21,6 +22,7 @@
#include "orte_config.h" #include "orte_config.h"
#include "orte/orte_constants.h" #include "orte/orte_constants.h"
#include "orte/runtime/params.h"
#include "opal/runtime/opal.h" #include "opal/runtime/opal.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
@ -34,6 +36,9 @@
*/ */
int orte_finalize(void) int orte_finalize(void)
{ {
if (!orte_initialized) {
return ORTE_SUCCESS;
}
/* We have now entered the finalization stage */ /* We have now entered the finalization stage */
orte_universe_info.state = ORTE_UNIVERSE_STATE_FINALIZE; orte_universe_info.state = ORTE_UNIVERSE_STATE_FINALIZE;
@ -43,7 +48,8 @@ int orte_finalize(void)
/* finalize the opal utilities */ /* finalize the opal utilities */
opal_finalize(); opal_finalize();
orte_initialized = false;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -22,6 +23,7 @@
#include "orte/orte_constants.h" #include "orte/orte_constants.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/params.h"
#include "opal/runtime/opal.h" #include "opal/runtime/opal.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
@ -37,6 +39,10 @@ int orte_init(bool infrastructure)
{ {
int rc; int rc;
if (orte_initialized) {
return ORTE_SUCCESS;
}
if (ORTE_SUCCESS != (rc = opal_init())) { if (ORTE_SUCCESS != (rc = opal_init())) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
@ -49,6 +55,7 @@ int orte_init(bool infrastructure)
/* Since we are now finished with init, change the state to running */ /* Since we are now finished with init, change the state to running */
orte_universe_info.state = ORTE_UNIVERSE_STATE_RUNNING; orte_universe_info.state = ORTE_UNIVERSE_STATE_RUNNING;
orte_initialized = true;
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -32,6 +32,10 @@
/* globals used by RTE */ /* globals used by RTE */
int orte_debug_flag; int orte_debug_flag;
struct timeval orte_abort_timeout; struct timeval orte_abort_timeout;
/*
* Whether we have completed orte_init or not
*/
bool orte_initialized = false;
int orte_register_params(bool infrastructure) int orte_register_params(bool infrastructure)
{ {

Просмотреть файл

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -42,6 +43,11 @@ ORTE_DECLSPEC extern int orte_debug_flag;
ORTE_DECLSPEC extern struct timeval orte_abort_timeout; ORTE_DECLSPEC extern struct timeval orte_abort_timeout;
/**
* Whether ORTE is initialized or not
*/
ORTE_DECLSPEC extern bool orte_initialized;
#if defined(c_plusplus) || defined(__cplusplus) #if defined(c_plusplus) || defined(__cplusplus)
} }
#endif #endif