* Before this commit, if we called ompi_mpi_abort() before MPI_INIT
  completed successfully, Bad Things(tm) could happen.
* Now we explicitly check orte_initialized (a new global in ORTE indicating
  whether we are between orte_init() and orte_finalize() or not) and react
  accordingly.
* If ORTE is initialized, use orte_system_info.nodename; otherwise, use
  gethostname().
* Add loop protection to ensure that ompi_mpi_abort() is not invoked
  multiple times recursively.

This commit was SVN r13354.
Этот коммит содержится в:
родитель
a45e8bea05
Коммит
e90b3e415b
@ -35,13 +35,16 @@
|
|||||||
#include "opal/mca/backtrace/backtrace.h"
|
#include "opal/mca/backtrace/backtrace.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
|
#include "orte/runtime/params.h"
|
||||||
#include "orte/mca/ns/ns.h"
|
#include "orte/mca/ns/ns.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
#include "orte/util/sys_info.h"
|
||||||
#include "ompi/communicator/communicator.h"
|
#include "ompi/communicator/communicator.h"
|
||||||
#include "ompi/proc/proc.h"
|
#include "ompi/proc/proc.h"
|
||||||
#include "ompi/runtime/mpiruntime.h"
|
#include "ompi/runtime/mpiruntime.h"
|
||||||
#include "ompi/runtime/params.h"
|
#include "ompi/runtime/params.h"
|
||||||
|
|
||||||
|
static bool have_been_invoked = false;
|
||||||
|
|
||||||
int
|
int
|
||||||
ompi_mpi_abort(struct ompi_communicator_t* comm,
|
ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||||
@ -49,11 +52,27 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|||||||
bool kill_remote_of_intercomm)
|
bool kill_remote_of_intercomm)
|
||||||
{
|
{
|
||||||
int count = 0, i, ret = OMPI_SUCCESS;
|
int count = 0, i, ret = OMPI_SUCCESS;
|
||||||
char hostname[MAXHOSTNAMELEN];
|
char *host, hostname[MAXHOSTNAMELEN];
|
||||||
pid_t pid = 0;
|
pid_t pid = 0;
|
||||||
orte_process_name_t *abort_procs;
|
orte_process_name_t *abort_procs;
|
||||||
orte_std_cntr_t nabort_procs;
|
orte_std_cntr_t nabort_procs;
|
||||||
|
|
||||||
|
/* Protection for recursive invocation */
|
||||||
|
if (have_been_invoked) {
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
have_been_invoked = true;
|
||||||
|
|
||||||
|
/* If ORTE is initialized, use its nodename. Otherwise, call
|
||||||
|
gethostname. */
|
||||||
|
|
||||||
|
if (orte_initialized) {
|
||||||
|
host = orte_system_info.nodename;
|
||||||
|
} else {
|
||||||
|
gethostname(hostname, sizeof(hostname));
|
||||||
|
host = hostname;
|
||||||
|
}
|
||||||
|
pid = getpid();
|
||||||
|
|
||||||
/* Corner case: if we're being called as a result of the
|
/* Corner case: if we're being called as a result of the
|
||||||
OMPI_ERR_INIT_FINALIZE macro (meaning that this is before
|
OMPI_ERR_INIT_FINALIZE macro (meaning that this is before
|
||||||
@ -61,16 +80,9 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|||||||
ORTE has been setup yet. */
|
ORTE has been setup yet. */
|
||||||
|
|
||||||
if (!ompi_mpi_initialized || ompi_mpi_finalized) {
|
if (!ompi_mpi_initialized || ompi_mpi_finalized) {
|
||||||
orte_errmgr.error_detected(errcode, NULL);
|
if (orte_initialized) {
|
||||||
}
|
orte_errmgr.error_detected(errcode, NULL);
|
||||||
|
}
|
||||||
/* If we're going to print anything, get the hostname and PID of
|
|
||||||
this process */
|
|
||||||
|
|
||||||
if (ompi_mpi_abort_print_stack ||
|
|
||||||
0 != ompi_mpi_abort_delay) {
|
|
||||||
gethostname(hostname, sizeof(hostname));
|
|
||||||
pid = getpid();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Should we print a stack trace? */
|
/* Should we print a stack trace? */
|
||||||
@ -81,7 +93,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|||||||
|
|
||||||
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
|
||||||
for (i = 0; i < len; ++i) {
|
for (i = 0; i < len; ++i) {
|
||||||
fprintf(stderr, "[%s:%d] [%d] func:%s\n", hostname, (int) pid,
|
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
|
||||||
i, messages[i]);
|
i, messages[i]);
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
}
|
}
|
||||||
@ -99,20 +111,29 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
|||||||
if (0 != ompi_mpi_abort_delay) {
|
if (0 != ompi_mpi_abort_delay) {
|
||||||
if (ompi_mpi_abort_delay < 0) {
|
if (ompi_mpi_abort_delay < 0) {
|
||||||
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
|
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
|
||||||
hostname, (int) pid);
|
host, (int) pid);
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
while (1) {
|
while (1) {
|
||||||
sleep(5);
|
sleep(5);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
|
||||||
hostname, (int) pid, ompi_mpi_abort_delay);
|
host, (int) pid, ompi_mpi_abort_delay);
|
||||||
do {
|
do {
|
||||||
sleep(1);
|
sleep(1);
|
||||||
} while (--ompi_mpi_abort_delay > 0);
|
} while (--ompi_mpi_abort_delay > 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If ORTE isn't setup yet, then don't even try killing everyone.
|
||||||
|
Sorry, Charlie... */
|
||||||
|
|
||||||
|
if (!orte_initialized) {
|
||||||
|
fprintf(stderr, "[%s:%d] Abort before MPI_INIT completed successfully; not able to guarantee that all other processes were killed!\n",
|
||||||
|
host, (int) pid);
|
||||||
|
exit(errcode);
|
||||||
|
}
|
||||||
|
|
||||||
/* abort local procs in the communicator. If the communicator is
|
/* abort local procs in the communicator. If the communicator is
|
||||||
an intercommunicator AND the abort has explicitly requested
|
an intercommunicator AND the abort has explicitly requested
|
||||||
that we abort the remote procs, then do that as well. */
|
that we abort the remote procs, then do that as well. */
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -21,6 +22,7 @@
|
|||||||
#include "orte_config.h"
|
#include "orte_config.h"
|
||||||
|
|
||||||
#include "orte/orte_constants.h"
|
#include "orte/orte_constants.h"
|
||||||
|
#include "orte/runtime/params.h"
|
||||||
#include "opal/runtime/opal.h"
|
#include "opal/runtime/opal.h"
|
||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
|
|
||||||
@ -34,6 +36,9 @@
|
|||||||
*/
|
*/
|
||||||
int orte_finalize(void)
|
int orte_finalize(void)
|
||||||
{
|
{
|
||||||
|
if (!orte_initialized) {
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/* We have now entered the finalization stage */
|
/* We have now entered the finalization stage */
|
||||||
orte_universe_info.state = ORTE_UNIVERSE_STATE_FINALIZE;
|
orte_universe_info.state = ORTE_UNIVERSE_STATE_FINALIZE;
|
||||||
@ -44,6 +49,7 @@ int orte_finalize(void)
|
|||||||
/* finalize the opal utilities */
|
/* finalize the opal utilities */
|
||||||
opal_finalize();
|
opal_finalize();
|
||||||
|
|
||||||
|
orte_initialized = false;
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -22,6 +23,7 @@
|
|||||||
|
|
||||||
#include "orte/orte_constants.h"
|
#include "orte/orte_constants.h"
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
|
#include "orte/runtime/params.h"
|
||||||
|
|
||||||
#include "opal/runtime/opal.h"
|
#include "opal/runtime/opal.h"
|
||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
@ -37,6 +39,10 @@ int orte_init(bool infrastructure)
|
|||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
|
if (orte_initialized) {
|
||||||
|
return ORTE_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (rc = opal_init())) {
|
if (ORTE_SUCCESS != (rc = opal_init())) {
|
||||||
ORTE_ERROR_LOG(rc);
|
ORTE_ERROR_LOG(rc);
|
||||||
return rc;
|
return rc;
|
||||||
@ -49,6 +55,7 @@ int orte_init(bool infrastructure)
|
|||||||
|
|
||||||
/* Since we are now finished with init, change the state to running */
|
/* Since we are now finished with init, change the state to running */
|
||||||
orte_universe_info.state = ORTE_UNIVERSE_STATE_RUNNING;
|
orte_universe_info.state = ORTE_UNIVERSE_STATE_RUNNING;
|
||||||
|
orte_initialized = true;
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -32,6 +32,10 @@
|
|||||||
/* globals used by RTE */
|
/* globals used by RTE */
|
||||||
int orte_debug_flag;
|
int orte_debug_flag;
|
||||||
struct timeval orte_abort_timeout;
|
struct timeval orte_abort_timeout;
|
||||||
|
/*
|
||||||
|
* Whether we have completed orte_init or not
|
||||||
|
*/
|
||||||
|
bool orte_initialized = false;
|
||||||
|
|
||||||
int orte_register_params(bool infrastructure)
|
int orte_register_params(bool infrastructure)
|
||||||
{
|
{
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -42,6 +43,11 @@ ORTE_DECLSPEC extern int orte_debug_flag;
|
|||||||
|
|
||||||
ORTE_DECLSPEC extern struct timeval orte_abort_timeout;
|
ORTE_DECLSPEC extern struct timeval orte_abort_timeout;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether ORTE is initialized or not
|
||||||
|
*/
|
||||||
|
ORTE_DECLSPEC extern bool orte_initialized;
|
||||||
|
|
||||||
#if defined(c_plusplus) || defined(__cplusplus)
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user