From e90b3e415b0bf5a0247dfcce7233e990c0df64ff Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 29 Jan 2007 22:01:28 +0000 Subject: [PATCH] * Before this commit, if we called ompi_mpi_abort() before MPI_INIT completed successfully, Bad Things(tm) could happen. * Now we explicitly check orte_initialized (a new global in ORTE indicating whether we are between orte_init() and orte_finalize() or not), and if so, react accordingly. * If ORTE is initialized, use orte_system_info.nodename; otherwise, use gethostname(). * Add loop protection to ensure that ompi_mpi_abort() is not invoked multiple times recursively. This commit was SVN r13354. --- ompi/runtime/ompi_mpi_abort.c | 53 ++++++++++++++++++++++++----------- orte/runtime/orte_finalize.c | 8 +++++- orte/runtime/orte_init.c | 7 +++++ orte/runtime/orte_params.c | 4 +++ orte/runtime/params.h | 6 ++++ 5 files changed, 61 insertions(+), 17 deletions(-) diff --git a/ompi/runtime/ompi_mpi_abort.c b/ompi/runtime/ompi_mpi_abort.c index 1d6dd88724..0c2abee30b 100644 --- a/ompi/runtime/ompi_mpi_abort.c +++ b/ompi/runtime/ompi_mpi_abort.c @@ -35,13 +35,16 @@ #include "opal/mca/backtrace/backtrace.h" #include "orte/util/proc_info.h" #include "orte/runtime/runtime.h" +#include "orte/runtime/params.h" #include "orte/mca/ns/ns.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/util/sys_info.h" #include "ompi/communicator/communicator.h" #include "ompi/proc/proc.h" #include "ompi/runtime/mpiruntime.h" #include "ompi/runtime/params.h" +static bool have_been_invoked = false; int ompi_mpi_abort(struct ompi_communicator_t* comm, @@ -49,28 +52,37 @@ ompi_mpi_abort(struct ompi_communicator_t* comm, bool kill_remote_of_intercomm) { int count = 0, i, ret = OMPI_SUCCESS; - char hostname[MAXHOSTNAMELEN]; + char *host, hostname[MAXHOSTNAMELEN]; pid_t pid = 0; orte_process_name_t *abort_procs; orte_std_cntr_t nabort_procs; - - + + /* Protection for recursive invocation */ + if (have_been_invoked) { + return OMPI_SUCCESS; + } + have_been_invoked = true; + + /* If ORTE is initialized, use its nodename. Otherwise, call + gethostname. */ + + if (orte_initialized) { + host = orte_system_info.nodename; + } else { + gethostname(hostname, sizeof(hostname)); + host = hostname; + } + pid = getpid(); + /* Corner case: if we're being called as a result of the OMPI_ERR_INIT_FINALIZE macro (meaning that this is before MPI_INIT or after MPI_FINALIZE), then just abort nothing MPI or ORTE has been setup yet. */ if (!ompi_mpi_initialized || ompi_mpi_finalized) { - orte_errmgr.error_detected(errcode, NULL); - } - - /* If we're going to print anything, get the hostname and PID of - this process */ - - if (ompi_mpi_abort_print_stack || - 0 != ompi_mpi_abort_delay) { - gethostname(hostname, sizeof(hostname)); - pid = getpid(); + if (orte_initialized) { + orte_errmgr.error_detected(errcode, NULL); + } } /* Should we print a stack trace? */ @@ -81,7 +93,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm, if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) { for (i = 0; i < len; ++i) { - fprintf(stderr, "[%s:%d] [%d] func:%s\n", hostname, (int) pid, + fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid, i, messages[i]); fflush(stderr); } @@ -99,20 +111,29 @@ ompi_mpi_abort(struct ompi_communicator_t* comm, if (0 != ompi_mpi_abort_delay) { if (ompi_mpi_abort_delay < 0) { fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n", - hostname, (int) pid); + host, (int) pid); fflush(stderr); while (1) { sleep(5); } } else { fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n", - hostname, (int) pid, ompi_mpi_abort_delay); + host, (int) pid, ompi_mpi_abort_delay); do { sleep(1); } while (--ompi_mpi_abort_delay > 0); } } + /* If ORTE isn't setup yet, then don't even try killing everyone. + Sorry, Charlie... */ + + if (!orte_initialized) { + fprintf(stderr, "[%s:%d] Abort before MPI_INIT completed successfully; not able to guarantee that all other processes were killed!\n", + host, (int) pid); + exit(errcode); + } + /* abort local procs in the communicator. If the communicator is an intercommunicator AND the abort has explicitly requested that we abort the remote procs, then do that as well. */ diff --git a/orte/runtime/orte_finalize.c b/orte/runtime/orte_finalize.c index cdb4fbbc05..9a0f01a2b5 100644 --- a/orte/runtime/orte_finalize.c +++ b/orte/runtime/orte_finalize.c @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -21,6 +22,7 @@ #include "orte_config.h" #include "orte/orte_constants.h" +#include "orte/runtime/params.h" #include "opal/runtime/opal.h" #include "orte/runtime/runtime.h" @@ -34,6 +36,9 @@ */ int orte_finalize(void) { + if (!orte_initialized) { + return ORTE_SUCCESS; + } /* We have now entered the finalization stage */ orte_universe_info.state = ORTE_UNIVERSE_STATE_FINALIZE; @@ -43,7 +48,8 @@ int orte_finalize(void) /* finalize the opal utilities */ opal_finalize(); - + + orte_initialized = false; return ORTE_SUCCESS; } diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index 4b13226288..32c3147d47 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -9,6 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -22,6 +23,7 @@ #include "orte/orte_constants.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/runtime/params.h" #include "opal/runtime/opal.h" #include "orte/runtime/runtime.h" @@ -37,6 +39,10 @@ int orte_init(bool infrastructure) { int rc; + if (orte_initialized) { + return ORTE_SUCCESS; + } + if (ORTE_SUCCESS != (rc = opal_init())) { ORTE_ERROR_LOG(rc); return rc; @@ -49,6 +55,7 @@ int orte_init(bool infrastructure) /* Since we are now finished with init, change the state to running */ orte_universe_info.state = ORTE_UNIVERSE_STATE_RUNNING; + orte_initialized = true; return ORTE_SUCCESS; } diff --git a/orte/runtime/orte_params.c b/orte/runtime/orte_params.c index e2673fdeaa..5dffe15eb1 100644 --- a/orte/runtime/orte_params.c +++ b/orte/runtime/orte_params.c @@ -32,6 +32,10 @@ /* globals used by RTE */ int orte_debug_flag; struct timeval orte_abort_timeout; +/* + * Whether we have completed orte_init or not + */ +bool orte_initialized = false; int orte_register_params(bool infrastructure) { diff --git a/orte/runtime/params.h b/orte/runtime/params.h index 610e06452f..357066a2a1 100644 --- a/orte/runtime/params.h +++ b/orte/runtime/params.h @@ -10,6 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -42,6 +43,11 @@ ORTE_DECLSPEC extern int orte_debug_flag; ORTE_DECLSPEC extern struct timeval orte_abort_timeout; +/** + * Whether ORTE is initialized or not + */ +ORTE_DECLSPEC extern bool orte_initialized; + #if defined(c_plusplus) || defined(__cplusplus) } #endif