diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index dadaaccd97..b8e6ae3621 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -188,25 +188,16 @@ int ompi_comm_finalize(void) int max, i; ompi_communicator_t *comm; - /* MPI-2 section 4.8: call the attribute - delete functions attached to MPI_COMM_SELF - and destroy comm_self before any other communicator */ - comm = &ompi_mpi_comm_self; - if (NULL != comm->c_keyhash) { - ompi_attr_delete_all(COMM_ATTR, comm, comm->c_keyhash); - /* ignoring that the attribute delete functions might - return an errorcode != MPI_SUCCESS. - Hey, we are in finalize, can finalize fail ??? */ - OBJ_RELEASE(comm->c_keyhash); - } + /* Shut down MPI_COMM_SELF */ OBJ_DESTRUCT( &ompi_mpi_comm_self ); /* disconnect all dynamic communicators */ ompi_comm_dyn_finalize(); - /* Destroy all predefined communicators */ + /* Shut down MPI_COMM_WORLD */ OBJ_DESTRUCT( &ompi_mpi_comm_world ); + /* Shut down the parent communicator, if it exists */ if( ompi_mpi_comm_parent != &ompi_mpi_comm_null ) { /* Note that we pass ompi_mpi_comm_parent here (vs. &ompi_mpi_comm_parent) because it is of type @@ -217,6 +208,7 @@ int ompi_comm_finalize(void) OBJ_DESTRUCT (ompi_mpi_comm_parent); } + /* Shut down MPI_COMM_NULL */ OBJ_DESTRUCT( &ompi_mpi_comm_null ); /* Check whether we have some communicators left */ diff --git a/ompi/runtime/help-mpi-runtime.txt b/ompi/runtime/help-mpi-runtime.txt index 4c72788d89..7a1e47c6a4 100644 --- a/ompi/runtime/help-mpi-runtime.txt +++ b/ompi/runtime/help-mpi-runtime.txt @@ -42,3 +42,10 @@ Typical causes for this problem include: which case Open MPI will not bind any processes on that node - A startup mechanism was used which did not tell Open MPI which processors to bind processes to +[mpi_finalize:invoked_multiple_times] +The function MPI_FINALIZE was invoked multiple times in a single +process on host %s, PID %d. + +This indicates an erroneous MPI program; MPI_FINALIZE is only allowed +to be invoked exactly once in a process. + diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index 30778b556a..ab0f3ca8e5 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -19,10 +19,25 @@ #include "ompi_config.h" +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#ifdef HAVE_NETDB_H +#include +#endif + #include "opal/event/event.h" #include "opal/runtime/opal_progress.h" #include "opal/mca/maffinity/base/base.h" #include "opal/mca/base/base.h" +#include "opal/util/show_help.h" +#include "opal/sys/atomic.h" #include "orte/util/proc_info.h" #include "orte/mca/schema/schema.h" @@ -64,10 +79,30 @@ int ompi_mpi_finalize(void) { int ret; + static int32_t finalize_has_already_started = 0; - /* Delete attributes on MPI_COMM_SELF per MPI-2:4.8. Must be done - before anything else in FINALIZE, and ensure that MPI_FINALIZED - still returns false. */ + /* Be a bit social if an erroneous program calls MPI_FINALIZE in + two different threads, otherwise we may deadlock in + ompi_comm_free() (or run into other nasty lions, tigers, or + bears) */ + + if (! opal_atomic_cmpset_32(&finalize_has_already_started, 0, 1)) { + /* Note that if we're already finalized, we cannot raise an + MPI exception. The best that we can do is write something + to stderr. */ + char hostname[MAXHOSTNAMELEN]; + pid_t pid = getpid(); + gethostname(hostname, sizeof(hostname)); + + opal_show_help("help-mpi-runtime.txt", + "mpi_finalize:invoked_multiple_times", + true, hostname, pid); + return MPI_ERR_OTHER; + } + + /* Per MPI-2:4.8, we have to free MPI_COMM_SELF before doing + anything else in MPI_FINALIZE (to include setting up such that + MPI_FINALIZED will return true). */ if (NULL != ompi_mpi_comm_self.c_keyhash) { ompi_attr_delete_all(COMM_ATTR, &ompi_mpi_comm_self, @@ -76,6 +111,8 @@ int ompi_mpi_finalize(void) ompi_mpi_comm_self.c_keyhash = NULL; } + /* Proceed with MPI_FINALIZE */ + ompi_mpi_finalized = true; #if OMPI_ENABLE_PROGRESS_THREADS == 0 opal_progress_events(OPAL_EVLOOP_NONBLOCK);