1
1

Merge pull request #5234 from jsquyres/pr/oshmem-init-race

ompi_mpi_init: fix race condition
Этот коммит содержится в:
bosilca 2018-06-06 12:14:00 -04:00 коммит произвёл GitHub
родитель 356947fead 9b9cb5fef0
Коммит fa1386768f
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 42 добавлений и 32 удалений

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science * Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
@ -63,9 +63,9 @@ int MPI_Init(int *argc, char ***argv)
don't lose anything) */ don't lose anything) */
if (NULL != argc && NULL != argv) { if (NULL != argc && NULL != argv) {
err = ompi_mpi_init(*argc, *argv, required, &provided); err = ompi_mpi_init(*argc, *argv, required, &provided, false);
} else { } else {
err = ompi_mpi_init(0, NULL, required, &provided); err = ompi_mpi_init(0, NULL, required, &provided, false);
} }
/* Since we don't have a communicator to invoke an errorhandler on /* Since we don't have a communicator to invoke an errorhandler on

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2010 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2010 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science * Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights * Copyright (c) 2016 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -63,9 +63,9 @@ int MPI_Init_thread(int *argc, char ***argv, int required,
don't lose anything) */ don't lose anything) */
if (NULL != argc && NULL != argv) { if (NULL != argc && NULL != argv) {
err = ompi_mpi_init(*argc, *argv, required, provided); err = ompi_mpi_init(*argc, *argv, required, provided, false);
} else { } else {
err = ompi_mpi_init(0, NULL, required, provided); err = ompi_mpi_init(0, NULL, required, provided, false);
} }
/* Since we don't have a communicator to invoke an errorhandler on /* Since we don't have a communicator to invoke an errorhandler on

Просмотреть файл

@ -175,6 +175,8 @@ void ompi_mpi_thread_level(int requested, int *provided);
* @param argv argv, typically from main() (IN) * @param argv argv, typically from main() (IN)
* @param requested Thread support that is requested (IN) * @param requested Thread support that is requested (IN)
* @param provided Thread support that is provided (OUT) * @param provided Thread support that is provided (OUT)
* @param reinit_ok Return successfully (with no error) if someone has
* already called ompi_mpi_init().
* *
* @returns MPI_SUCCESS if successful * @returns MPI_SUCCESS if successful
* @returns Error code if unsuccessful * @returns Error code if unsuccessful
@ -186,7 +188,8 @@ void ompi_mpi_thread_level(int requested, int *provided);
* *
* It is permissible to pass in (0, NULL) for (argc, argv). * It is permissible to pass in (0, NULL) for (argc, argv).
*/ */
int ompi_mpi_init(int argc, char **argv, int requested, int *provided); int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
bool reinit_ok);
/** /**
* Finalize the Open MPI MPI environment * Finalize the Open MPI MPI environment

Просмотреть файл

@ -368,7 +368,8 @@ static void fence_release(int status, void *cbdata)
OPAL_POST_OBJECT(active); OPAL_POST_OBJECT(active);
} }
int ompi_mpi_init(int argc, char **argv, int requested, int *provided) int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
bool reinit_ok)
{ {
int ret; int ret;
ompi_proc_t** procs; ompi_proc_t** procs;
@ -384,28 +385,36 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
ompi_hook_base_mpi_init_top(argc, argv, requested, provided); ompi_hook_base_mpi_init_top(argc, argv, requested, provided);
/* Ensure that we were not already initialized or finalized. /* Ensure that we were not already initialized or finalized. */
int32_t expected = OMPI_MPI_STATE_NOT_INITIALIZED;
This lock is held for the duration of ompi_mpi_init() and int32_t desired = OMPI_MPI_STATE_INIT_STARTED;
ompi_mpi_finalize(). Hence, if we get it, then no other thread
is inside the critical section (and we don't have to check the
*_started bool variables). */
opal_atomic_rmb();
int32_t state = ompi_mpi_state;
if (state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
opal_show_help("help-mpi-runtime.txt",
"mpi_init: already finalized", true);
return MPI_ERR_OTHER;
} else if (state >= OMPI_MPI_STATE_INIT_STARTED) {
opal_show_help("help-mpi-runtime.txt",
"mpi_init: invoked multiple times", true);
return MPI_ERR_OTHER;
}
/* Indicate that we have *started* MPI_INIT* */
opal_atomic_wmb(); opal_atomic_wmb();
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_STARTED); if (!opal_atomic_compare_exchange_strong_32(&ompi_mpi_state, &expected,
desired)) {
// If we failed to atomically transition ompi_mpi_state from
// NOT_INITIALIZED to INIT_STARTED, then someone else already
// did that, and we should return.
if (expected >= OMPI_MPI_STATE_FINALIZE_STARTED) {
opal_show_help("help-mpi-runtime.txt",
"mpi_init: already finalized", true);
return MPI_ERR_OTHER;
} else if (expected >= OMPI_MPI_STATE_INIT_STARTED) {
// In some cases (e.g., oshmem_shmem_init()), we may call
// ompi_mpi_init() multiple times. In such cases, just
// silently return successfully once the initializing
// thread has completed.
if (reinit_ok) {
while (ompi_mpi_state < OMPI_MPI_STATE_INIT_COMPLETED) {
usleep(1);
}
return MPI_SUCCESS;
}
opal_show_help("help-mpi-runtime.txt",
"mpi_init: invoked multiple times", true);
return MPI_ERR_OTHER;
}
}
/* Figure out the final MPI thread levels. If we were not /* Figure out the final MPI thread levels. If we were not
compiled for support for MPI threads, then don't allow compiled for support for MPI threads, then don't allow

Просмотреть файл

@ -147,9 +147,7 @@ int oshmem_shmem_init(int argc, char **argv, int requested, int *provided)
OMPI_TIMING_INIT(32); OMPI_TIMING_INIT(32);
if (!oshmem_shmem_initialized) { if (!oshmem_shmem_initialized) {
if (ompi_mpi_state < OMPI_MPI_STATE_INIT_COMPLETED) { ret = ompi_mpi_init(argc, argv, requested, provided, true);
ret = ompi_mpi_init(argc, argv, requested, provided);
}
OMPI_TIMING_NEXT("ompi_mpi_init"); OMPI_TIMING_NEXT("ompi_mpi_init");
if (OSHMEM_SUCCESS != ret) { if (OSHMEM_SUCCESS != ret) {