ompi_mpi_init: fix race condition
There was a race condition in 35438ae9b5: if multiple threads invoked ompi_mpi_init() simultaneously (which could happen from both MPI and OSHMEM), the code did not catch this condition -- Bad Things would happen. Now use an atomic compare-and-exchange to ensure that only one thread is able to advance ompi_mpi_state from NOT_INITIALIZED to INIT_STARTED. Additionally, change the prototype of ompi_mpi_init() so that oshmem_init() can safely invoke ompi_mpi_init() multiple times (as long as MPI_FINALIZE has not started) without displaying an error. If multiple threads invoke oshmem_init() simultaneously, one of them will actually do the initialization, and the rest will loop waiting for it to complete. Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
Этот коммит содержится в:
родитель
64a5baaa28
Коммит
67ba8da76f
@ -9,7 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
|
||||||
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
|
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
@ -63,9 +63,9 @@ int MPI_Init(int *argc, char ***argv)
|
|||||||
don't lose anything) */
|
don't lose anything) */
|
||||||
|
|
||||||
if (NULL != argc && NULL != argv) {
|
if (NULL != argc && NULL != argv) {
|
||||||
err = ompi_mpi_init(*argc, *argv, required, &provided);
|
err = ompi_mpi_init(*argc, *argv, required, &provided, false);
|
||||||
} else {
|
} else {
|
||||||
err = ompi_mpi_init(0, NULL, required, &provided);
|
err = ompi_mpi_init(0, NULL, required, &provided, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Since we don't have a communicator to invoke an errorhandler on
|
/* Since we don't have a communicator to invoke an errorhandler on
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2010 Oak Ridge National Labs. All rights reserved.
|
* Copyright (c) 2010 Oak Ridge National Labs. All rights reserved.
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
* and Technology (RIST). All rights reserved.
|
* and Technology (RIST). All rights reserved.
|
||||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
|
||||||
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -63,9 +63,9 @@ int MPI_Init_thread(int *argc, char ***argv, int required,
|
|||||||
don't lose anything) */
|
don't lose anything) */
|
||||||
|
|
||||||
if (NULL != argc && NULL != argv) {
|
if (NULL != argc && NULL != argv) {
|
||||||
err = ompi_mpi_init(*argc, *argv, required, provided);
|
err = ompi_mpi_init(*argc, *argv, required, provided, false);
|
||||||
} else {
|
} else {
|
||||||
err = ompi_mpi_init(0, NULL, required, provided);
|
err = ompi_mpi_init(0, NULL, required, provided, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Since we don't have a communicator to invoke an errorhandler on
|
/* Since we don't have a communicator to invoke an errorhandler on
|
||||||
|
@ -175,6 +175,8 @@ void ompi_mpi_thread_level(int requested, int *provided);
|
|||||||
* @param argv argv, typically from main() (IN)
|
* @param argv argv, typically from main() (IN)
|
||||||
* @param requested Thread support that is requested (IN)
|
* @param requested Thread support that is requested (IN)
|
||||||
* @param provided Thread support that is provided (OUT)
|
* @param provided Thread support that is provided (OUT)
|
||||||
|
* @param reinit_ok Return successfully (with no error) if someone has
|
||||||
|
* already called ompi_mpi_init().
|
||||||
*
|
*
|
||||||
* @returns MPI_SUCCESS if successful
|
* @returns MPI_SUCCESS if successful
|
||||||
* @returns Error code if unsuccessful
|
* @returns Error code if unsuccessful
|
||||||
@ -186,7 +188,8 @@ void ompi_mpi_thread_level(int requested, int *provided);
|
|||||||
*
|
*
|
||||||
* It is permissible to pass in (0, NULL) for (argc, argv).
|
* It is permissible to pass in (0, NULL) for (argc, argv).
|
||||||
*/
|
*/
|
||||||
int ompi_mpi_init(int argc, char **argv, int requested, int *provided);
|
int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
||||||
|
bool reinit_ok);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finalize the Open MPI MPI environment
|
* Finalize the Open MPI MPI environment
|
||||||
|
@ -368,7 +368,8 @@ static void fence_release(int status, void *cbdata)
|
|||||||
OPAL_POST_OBJECT(active);
|
OPAL_POST_OBJECT(active);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
||||||
|
bool reinit_ok)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
ompi_proc_t** procs;
|
ompi_proc_t** procs;
|
||||||
@ -384,28 +385,32 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
|
|
||||||
ompi_hook_base_mpi_init_top(argc, argv, requested, provided);
|
ompi_hook_base_mpi_init_top(argc, argv, requested, provided);
|
||||||
|
|
||||||
/* Ensure that we were not already initialized or finalized.
|
/* Ensure that we were not already initialized or finalized. */
|
||||||
|
int32_t expected = OMPI_MPI_STATE_NOT_INITIALIZED;
|
||||||
This lock is held for the duration of ompi_mpi_init() and
|
int32_t desired = OMPI_MPI_STATE_INIT_STARTED;
|
||||||
ompi_mpi_finalize(). Hence, if we get it, then no other thread
|
|
||||||
is inside the critical section (and we don't have to check the
|
|
||||||
*_started bool variables). */
|
|
||||||
opal_atomic_rmb();
|
|
||||||
int32_t state = ompi_mpi_state;
|
|
||||||
|
|
||||||
if (state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
|
||||||
opal_show_help("help-mpi-runtime.txt",
|
|
||||||
"mpi_init: already finalized", true);
|
|
||||||
return MPI_ERR_OTHER;
|
|
||||||
} else if (state >= OMPI_MPI_STATE_INIT_STARTED) {
|
|
||||||
opal_show_help("help-mpi-runtime.txt",
|
|
||||||
"mpi_init: invoked multiple times", true);
|
|
||||||
return MPI_ERR_OTHER;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Indicate that we have *started* MPI_INIT* */
|
|
||||||
opal_atomic_wmb();
|
opal_atomic_wmb();
|
||||||
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_STARTED);
|
if (!opal_atomic_compare_exchange_strong_32(&ompi_mpi_state, &expected,
|
||||||
|
desired)) {
|
||||||
|
// If we failed to atomically transition ompi_mpi_state from
|
||||||
|
// NOT_INITIALIZED to INIT_STARTED, then someone else already
|
||||||
|
// did that, and we should return.
|
||||||
|
if (expected >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||||
|
opal_show_help("help-mpi-runtime.txt",
|
||||||
|
"mpi_init: already finalized", true);
|
||||||
|
return MPI_ERR_OTHER;
|
||||||
|
} else if (expected >= OMPI_MPI_STATE_INIT_STARTED) {
|
||||||
|
// In some cases (e.g., oshmem_shmem_init()), we may call
|
||||||
|
// ompi_mpi_init() multiple times. In such cases, just
|
||||||
|
// silently return successfully.
|
||||||
|
if (reinit_ok) {
|
||||||
|
return MPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
opal_show_help("help-mpi-runtime.txt",
|
||||||
|
"mpi_init: invoked multiple times", true);
|
||||||
|
return MPI_ERR_OTHER;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Figure out the final MPI thread levels. If we were not
|
/* Figure out the final MPI thread levels. If we were not
|
||||||
compiled for support for MPI threads, then don't allow
|
compiled for support for MPI threads, then don't allow
|
||||||
|
@ -147,8 +147,13 @@ int oshmem_shmem_init(int argc, char **argv, int requested, int *provided)
|
|||||||
OMPI_TIMING_INIT(32);
|
OMPI_TIMING_INIT(32);
|
||||||
|
|
||||||
if (!oshmem_shmem_initialized) {
|
if (!oshmem_shmem_initialized) {
|
||||||
if (ompi_mpi_state < OMPI_MPI_STATE_INIT_COMPLETED) {
|
ret = ompi_mpi_init(argc, argv, requested, provided, true);
|
||||||
ret = ompi_mpi_init(argc, argv, requested, provided);
|
|
||||||
|
// It's possible that another thread is initializing MPI and
|
||||||
|
// has not completed yet. Keep checking until it is
|
||||||
|
// completed.
|
||||||
|
while (ompi_mpi_state < OMPI_MPI_STATE_INIT_COMPLETED) {
|
||||||
|
usleep(1);
|
||||||
}
|
}
|
||||||
OMPI_TIMING_NEXT("ompi_mpi_init");
|
OMPI_TIMING_NEXT("ompi_mpi_init");
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user