mpi/finalized: revamp INITIALIZED/FINALIZED
Per MPI-3.1:8.7.1 p361:11-13, it's valid for MPI_FINALIZED to be invoked during an attribute destruction callback (e.g., during the destruction of keyvals on MPI_COMM_SELF during the very beginning of MPI_FINALIZE). In such cases, MPI_FINALIZED must return "false".

Prior to this commit, we hung in FINALIZED if it were invoked during a COMM_SELF attribute destruction callback in FINALIZE. See https://github.com/open-mpi/ompi/issues/5084.

This commit converts the MPI_INITIALIZED / MPI_FINALIZED infrastructure to use a single enum (ompi_mpi_state, set atomically) to represent the state of MPI:

- not initialized
- init started
- init completed
- finalize started
- finalize past COMM_SELF destruction
- finalize completed

The "finalize past COMM_SELF destruction" state is what allows us to return "false" from MPI_FINALIZED before COMM_SELF has been fully destroyed / all attribute callbacks have been invoked.

Since this state is checked at nearly every MPI API call (to see if we're outside of the INIT/FINALIZE epoch), care was taken to use atomics to *set* the ompi_mpi_state value in ompi_mpi_init() and ompi_mpi_finalize(), but performance-critical code paths can simply read the variable without needing to use a slow call to an opal_atomic_*() function.

Thanks to @AndrewGaspar for reporting the issue.

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
Этот коммит содержится в:
родитель
0d66e02179
Коммит
35438ae9b5
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
|
||||
@ -193,11 +193,22 @@ struct ompi_request_t;
|
||||
* This macro directly invokes the ompi_mpi_errors_are_fatal_handler()
|
||||
* when an error occurs because MPI_COMM_WORLD does not exist (because
|
||||
* we're before MPI_Init() or after MPI_Finalize()).
|
||||
*
|
||||
* NOTE: The ompi_mpi_state variable is a volatile that is set
|
||||
* atomically in ompi_mpi_init() and ompi_mpi_finalize(). The
|
||||
* appropriate memory barriers are done in those 2 functions such that
|
||||
* we do not need to do a read memory barrier here (in
|
||||
* potentially-performance-critical code paths) before reading the
|
||||
* variable.
|
||||
*/
|
||||
#define OMPI_ERR_INIT_FINALIZE(name) \
|
||||
if( OPAL_UNLIKELY(!ompi_mpi_initialized || ompi_mpi_finalized) ) { \
|
||||
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
|
||||
}
|
||||
#define OMPI_ERR_INIT_FINALIZE(name) \
|
||||
{ \
|
||||
int32_t state = ompi_mpi_state; \
|
||||
if (OPAL_UNLIKELY(state < OMPI_MPI_STATE_INIT_COMPLETED || \
|
||||
state > OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT)) { \
|
||||
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* This is the macro to invoke to directly invoke an MPI error
|
||||
|
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2008-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
||||
@ -149,7 +149,8 @@ void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
|
||||
|
||||
static void out(char *str, char *arg)
|
||||
{
|
||||
if (ompi_rte_initialized && !ompi_mpi_finalized) {
|
||||
if (ompi_rte_initialized &&
|
||||
ompi_mpi_state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
|
||||
if (NULL != arg) {
|
||||
opal_output(0, str, arg);
|
||||
} else {
|
||||
@ -280,7 +281,9 @@ static void backend_fatal_no_aggregate(char *type,
|
||||
{
|
||||
char *arg;
|
||||
|
||||
assert(!ompi_mpi_initialized || ompi_mpi_finalized);
|
||||
int32_t state = ompi_mpi_state;
|
||||
assert(state < OMPI_MPI_STATE_INIT_COMPLETED ||
|
||||
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
|
||||
|
||||
fflush(stdout);
|
||||
fflush(stderr);
|
||||
@ -289,7 +292,7 @@ static void backend_fatal_no_aggregate(char *type,
|
||||
|
||||
/* Per #2152, print out in plain english if something was invoked
|
||||
before MPI_INIT* or after MPI_FINALIZE */
|
||||
if (!ompi_mpi_init_started && !ompi_mpi_initialized) {
|
||||
if (state < OMPI_MPI_STATE_INIT_STARTED) {
|
||||
if (NULL != arg) {
|
||||
out("*** The %s() function was called before MPI_INIT was invoked.\n"
|
||||
"*** This is disallowed by the MPI standard.\n", arg);
|
||||
@ -300,7 +303,7 @@ static void backend_fatal_no_aggregate(char *type,
|
||||
"*** function was invoked, sorry. :-(\n", NULL);
|
||||
}
|
||||
out("*** Your MPI job will now abort.\n", NULL);
|
||||
} else if (ompi_mpi_finalized) {
|
||||
} else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
|
||||
if (NULL != arg) {
|
||||
out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
|
||||
"*** This is disallowed by the MPI standard.\n", arg);
|
||||
|
@ -2,6 +2,7 @@
|
||||
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -159,7 +160,7 @@ int mca_coll_fca_barrier(struct ompi_communicator_t *comm,
|
||||
int ret;
|
||||
|
||||
FCA_VERBOSE(5,"Using FCA Barrier");
|
||||
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
|
||||
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
|
||||
FCA_VERBOSE(5, "In finalize, reverting to previous barrier");
|
||||
goto orig_barrier;
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
* Copyright (c) 2017 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -241,7 +242,7 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,
|
||||
|
||||
int mca_coll_hcoll_progress(void)
|
||||
{
|
||||
if (ompi_mpi_finalized){
|
||||
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||
hcoll_rte_p2p_disabled_notify();
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
|
||||
Copyright (c) 2015 Research Organization for Information Science
|
||||
and Technology (RIST). All rights reserved.
|
||||
Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||
$COPYRIGHT$
|
||||
|
||||
Additional copyrights may follow
|
||||
@ -21,7 +22,7 @@ int mca_coll_hcoll_barrier(struct ompi_communicator_t *comm,
|
||||
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
|
||||
HCOL_VERBOSE(20,"RUNNING HCOL BARRIER");
|
||||
|
||||
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
|
||||
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
|
||||
HCOL_VERBOSE(5, "In finalize, reverting to previous barrier");
|
||||
goto orig_barrier;
|
||||
}
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -69,7 +70,7 @@ mca_io_romio314_file_close (ompi_file_t *fh)
|
||||
which we obviously can't do if we've started to MPI_Finalize).
|
||||
The user didn't close the file, so they should expect
|
||||
unexpected behavior. */
|
||||
if (ompi_mpi_finalized) {
|
||||
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
* Copyright (C) 2001-2011 Mellanox Technologies Ltd. ALL RIGHTS RESERVED.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -265,7 +266,7 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
if (ompi_mpi_finalized) {
|
||||
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
|
||||
mxm_ep_powerdown(ompi_pml_yalla.mxm_ep);
|
||||
}
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -44,13 +44,7 @@ int MPI_Finalized(int *flag)
|
||||
|
||||
ompi_hook_base_mpi_finalized_top(flag);
|
||||
|
||||
/* We must obtain the lock to guarantee consistent values of
|
||||
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
|
||||
this lock is held for the bulk of the duration of
|
||||
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
|
||||
lock, we are guaranteed that some other thread is not part way
|
||||
through initialization or finalization. */
|
||||
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
|
||||
int32_t state = ompi_mpi_state;
|
||||
|
||||
if (MPI_PARAM_CHECK) {
|
||||
if (NULL == flag) {
|
||||
@ -59,12 +53,11 @@ int MPI_Finalized(int *flag)
|
||||
whether we're currently (after MPI_Init and before
|
||||
MPI_Finalize) or not */
|
||||
|
||||
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
|
||||
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
|
||||
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
|
||||
FUNC_NAME);
|
||||
} else {
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
/* We have no MPI object here so call ompi_errhandler_invoke
|
||||
* directly */
|
||||
return ompi_errhandler_invoke(NULL, NULL, -1,
|
||||
@ -74,8 +67,7 @@ int MPI_Finalized(int *flag)
|
||||
}
|
||||
}
|
||||
|
||||
*flag = ompi_mpi_finalized;
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
*flag = (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
|
||||
|
||||
ompi_hook_base_mpi_finalized_bottom(flag);
|
||||
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
@ -58,7 +58,9 @@ int MPI_Get_library_version(char *version, int *resultlen)
|
||||
(i.e., use a NULL communicator, which will end up at the
|
||||
default errhandler, which is abort). */
|
||||
|
||||
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
|
||||
int32_t state = ompi_mpi_state;
|
||||
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
|
||||
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
|
||||
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
|
||||
FUNC_NAME);
|
||||
} else {
|
||||
|
@ -12,6 +12,7 @@
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -54,7 +55,9 @@ int MPI_Get_version(int *version, int *subversion)
|
||||
(i.e., use a NULL communicator, which will end up at the
|
||||
default errhandler, which is abort). */
|
||||
|
||||
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
|
||||
int32_t state = ompi_mpi_state;
|
||||
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
|
||||
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
|
||||
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
|
||||
FUNC_NAME);
|
||||
} else {
|
||||
|
@ -11,7 +11,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -44,13 +44,7 @@ int MPI_Initialized(int *flag)
|
||||
|
||||
ompi_hook_base_mpi_initialized_top(flag);
|
||||
|
||||
/* We must obtain the lock to guarantee consistent values of
|
||||
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
|
||||
this lock is held for the bulk of the duration of
|
||||
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
|
||||
lock, we are guaranteed that some other thread is not part way
|
||||
through initialization or finalization. */
|
||||
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
|
||||
int32_t state = ompi_mpi_state;
|
||||
|
||||
if (MPI_PARAM_CHECK) {
|
||||
if (NULL == flag) {
|
||||
@ -59,12 +53,11 @@ int MPI_Initialized(int *flag)
|
||||
whether we're currently (after MPI_Init and before
|
||||
MPI_Finalize) or not */
|
||||
|
||||
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
|
||||
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
|
||||
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
|
||||
FUNC_NAME);
|
||||
} else {
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
/* We have no MPI object here so call ompi_errhandler_invoke
|
||||
* directly */
|
||||
return ompi_errhandler_invoke(NULL, NULL, -1,
|
||||
@ -74,8 +67,7 @@ int MPI_Initialized(int *flag)
|
||||
}
|
||||
}
|
||||
|
||||
*flag = ompi_mpi_initialized;
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
*flag = (state >= OMPI_MPI_STATE_INIT_COMPLETED);
|
||||
|
||||
ompi_hook_base_mpi_initialized_bottom(flag);
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2017 IBM Corporation. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -39,7 +39,9 @@ int MPI_T_finalize (void)
|
||||
if (0 == --ompi_mpit_init_count) {
|
||||
(void) ompi_info_close_components ();
|
||||
|
||||
if ((!ompi_mpi_initialized || ompi_mpi_finalized) &&
|
||||
int32_t state = ompi_mpi_state;
|
||||
if ((state < OMPI_MPI_STATE_INIT_COMPLETED ||
|
||||
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) &&
|
||||
(NULL != ompi_mpi_main_thread)) {
|
||||
/* we are not between MPI_Init and MPI_Finalize so we
|
||||
* have to free the ompi_mpi_main_thread */
|
||||
|
@ -4,6 +4,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -64,8 +65,11 @@ const int PERUSE_num_events = (sizeof(PERUSE_events) / sizeof(peruse_event_assoc
|
||||
int PERUSE_Init (void)
|
||||
{
|
||||
if (MPI_PARAM_CHECK) {
|
||||
if (!ompi_mpi_initialized || ompi_mpi_finalized)
|
||||
int32_t state = ompi_mpi_state;
|
||||
if (state < OMPI_MPI_STATE_INIT_COMPLETED ||
|
||||
state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||
return PERUSE_ERR_INIT;
|
||||
}
|
||||
}
|
||||
ompi_peruse_init ();
|
||||
return PERUSE_SUCCESS;
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
@ -51,15 +51,9 @@ struct ompi_predefined_datatype_t;
|
||||
/** Mutex to protect all the _init and _finalize variables */
|
||||
OMPI_DECLSPEC extern opal_mutex_t ompi_mpi_bootstrap_mutex;
|
||||
/** Did MPI start to initialize? */
|
||||
OMPI_DECLSPEC extern volatile bool ompi_mpi_init_started;
|
||||
OMPI_DECLSPEC extern volatile int32_t ompi_mpi_state;
|
||||
/** Has the RTE been initialized? */
|
||||
OMPI_DECLSPEC extern volatile bool ompi_rte_initialized;
|
||||
/** Is MPI fully initialized? */
|
||||
OMPI_DECLSPEC extern volatile bool ompi_mpi_initialized;
|
||||
/** Did MPI start to finalize? */
|
||||
OMPI_DECLSPEC extern volatile bool ompi_mpi_finalize_started;
|
||||
/** Has MPI been fully finalized? */
|
||||
OMPI_DECLSPEC extern volatile bool ompi_mpi_finalized;
|
||||
|
||||
/** Do we have multiple threads? */
|
||||
OMPI_DECLSPEC extern bool ompi_mpi_thread_multiple;
|
||||
@ -70,6 +64,29 @@ OMPI_DECLSPEC extern int ompi_mpi_thread_provided;
|
||||
/** Identifier of the main thread */
|
||||
OMPI_DECLSPEC extern struct opal_thread_t *ompi_mpi_main_thread;
|
||||
|
||||
/*
|
||||
* State of the MPI runtime.
|
||||
*
|
||||
* Stored in the ompi_mpi_state global variable: set atomically in
|
||||
* ompi_mpi_init() / ompi_mpi_finalize(), and read directly (without a read barrier) by functions such as MPI_INITIALIZED and MPI_FINALIZED.
|
||||
*/
|
||||
typedef enum {
|
||||
OMPI_MPI_STATE_NOT_INITIALIZED = 0,
|
||||
|
||||
OMPI_MPI_STATE_INIT_STARTED,
|
||||
OMPI_MPI_STATE_INIT_COMPLETED,
|
||||
|
||||
/* The PAST_COMM_SELF_DESTRUCT state is needed because attribute
|
||||
callbacks that are invoked during the very beginning of
|
||||
MPI_FINALIZE are supposed to return FALSE if they call
|
||||
MPI_FINALIZED. Hence, we need to distinguish between "We've
|
||||
started MPI_FINALIZE" and "We're far enough in MPI_FINALIZE
|
||||
that we now need to return TRUE from MPI_FINALIZED." */
|
||||
OMPI_MPI_STATE_FINALIZE_STARTED,
|
||||
OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT,
|
||||
OMPI_MPI_STATE_FINALIZE_COMPLETED
|
||||
} ompi_mpi_state_t;
|
||||
|
||||
/*
|
||||
* These variables are for the MPI F03 bindings (F03 must bind Fortran
|
||||
* variables to symbols; it cannot bind Fortran variables to the
|
||||
|
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2014 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -166,16 +166,20 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
|
||||
/* If the RTE isn't setup yet/any more, then don't even try
|
||||
killing everyone. Sorry, Charlie... */
|
||||
int32_t state = ompi_mpi_state;
|
||||
if (!ompi_rte_initialized) {
|
||||
fprintf(stderr, "[%s:%05d] Local abort %s completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
||||
host, (int) pid, ompi_mpi_finalized ?
|
||||
host, (int) pid,
|
||||
state >= OMPI_MPI_STATE_FINALIZE_STARTED ?
|
||||
"after MPI_FINALIZE started" : "before MPI_INIT completed");
|
||||
_exit(errcode == 0 ? 1 : errcode);
|
||||
}
|
||||
|
||||
/* If OMPI is initialized and we have a non-NULL communicator,
|
||||
then try to kill just that set of processes */
|
||||
if (ompi_mpi_initialized && !ompi_mpi_finalized && NULL != comm) {
|
||||
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
|
||||
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT &&
|
||||
NULL != comm) {
|
||||
try_kill_peers(comm, errcode);
|
||||
}
|
||||
|
||||
|
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006 University of Houston. All rights reserved.
|
||||
@ -115,17 +115,9 @@ int ompi_mpi_finalize(void)
|
||||
|
||||
ompi_hook_base_mpi_finalize_top();
|
||||
|
||||
/* Be a bit social if an erroneous program calls MPI_FINALIZE in
|
||||
two different threads, otherwise we may deadlock in
|
||||
ompi_comm_free() (or run into other nasty lions, tigers, or
|
||||
bears).
|
||||
|
||||
This lock is held for the duration of ompi_mpi_init() and
|
||||
ompi_mpi_finalize(). Hence, if we get it, then no other thread
|
||||
is inside the critical section (and we don't have to check the
|
||||
*_started bool variables). */
|
||||
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
|
||||
if (!ompi_mpi_initialized || ompi_mpi_finalized) {
|
||||
int32_t state = ompi_mpi_state;
|
||||
if (state < OMPI_MPI_STATE_INIT_COMPLETED ||
|
||||
state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||
/* Note that if we're not initialized or already finalized, we
|
||||
cannot raise an MPI exception. The best that we can do is
|
||||
write something to stderr. */
|
||||
@ -133,19 +125,19 @@ int ompi_mpi_finalize(void)
|
||||
pid_t pid = getpid();
|
||||
gethostname(hostname, sizeof(hostname));
|
||||
|
||||
if (ompi_mpi_initialized) {
|
||||
if (state < OMPI_MPI_STATE_INIT_COMPLETED) {
|
||||
opal_show_help("help-mpi-runtime.txt",
|
||||
"mpi_finalize: not initialized",
|
||||
true, hostname, pid);
|
||||
} else if (ompi_mpi_finalized) {
|
||||
} else if (state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||
opal_show_help("help-mpi-runtime.txt",
|
||||
"mpi_finalize:invoked_multiple_times",
|
||||
true, hostname, pid);
|
||||
}
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
return MPI_ERR_OTHER;
|
||||
}
|
||||
ompi_mpi_finalize_started = true;
|
||||
opal_atomic_wmb();
|
||||
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_STARTED);
|
||||
|
||||
ompi_mpiext_fini();
|
||||
|
||||
@ -160,9 +152,14 @@ int ompi_mpi_finalize(void)
|
||||
ompi_mpi_comm_self.comm.c_keyhash = NULL;
|
||||
}
|
||||
|
||||
/* Proceed with MPI_FINALIZE */
|
||||
|
||||
ompi_mpi_finalized = true;
|
||||
/* Mark that we are past COMM_SELF destruction so that
|
||||
MPI_FINALIZED can return an accurate value (per MPI-3.1,
|
||||
MPI_FINALIZED needs to return FALSE until after
|
||||
COMM_SELF is destroyed / all the attribute callbacks have been
|
||||
invoked) */
|
||||
opal_atomic_wmb();
|
||||
opal_atomic_swap_32(&ompi_mpi_state,
|
||||
OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
|
||||
|
||||
/* As finalize is the last legal MPI call, we are allowed to force the release
|
||||
* of the user buffer used for bsend, before going anywhere further.
|
||||
@ -513,8 +510,10 @@ int ompi_mpi_finalize(void)
|
||||
|
||||
/* All done */
|
||||
|
||||
done:
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
done:
|
||||
opal_atomic_wmb();
|
||||
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_COMPLETED);
|
||||
|
||||
ompi_hook_base_mpi_finalize_bottom();
|
||||
|
||||
return ret;
|
||||
|
@ -10,7 +10,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2018 Cisco Systems, Inc. All rights reserved
|
||||
* Copyright (c) 2006-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2009 University of Houston. All rights reserved.
|
||||
@ -130,11 +130,7 @@ const char ompi_version_string[] = OMPI_IDENT_STRING;
|
||||
* Global variables and symbols for the MPI layer
|
||||
*/
|
||||
|
||||
opal_mutex_t ompi_mpi_bootstrap_mutex = OPAL_MUTEX_STATIC_INIT;
|
||||
volatile bool ompi_mpi_init_started = false;
|
||||
volatile bool ompi_mpi_initialized = false;
|
||||
volatile bool ompi_mpi_finalize_started = false;
|
||||
volatile bool ompi_mpi_finalized = false;
|
||||
volatile int32_t ompi_mpi_state = OMPI_MPI_STATE_NOT_INITIALIZED;
|
||||
volatile bool ompi_rte_initialized = false;
|
||||
|
||||
bool ompi_mpi_thread_multiple = false;
|
||||
@ -394,21 +390,22 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
ompi_mpi_finalize(). Hence, if we get it, then no other thread
|
||||
is inside the critical section (and we don't have to check the
|
||||
*_started bool variables). */
|
||||
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
|
||||
if (ompi_mpi_finalized) {
|
||||
opal_atomic_rmb();
|
||||
int32_t state = ompi_mpi_state;
|
||||
|
||||
if (state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||
opal_show_help("help-mpi-runtime.txt",
|
||||
"mpi_init: already finalized", true);
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
return MPI_ERR_OTHER;
|
||||
} else if (ompi_mpi_initialized) {
|
||||
} else if (state >= OMPI_MPI_STATE_INIT_STARTED) {
|
||||
opal_show_help("help-mpi-runtime.txt",
|
||||
"mpi_init: invoked multiple times", true);
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
return MPI_ERR_OTHER;
|
||||
}
|
||||
|
||||
/* Indicate that we have *started* MPI_INIT* */
|
||||
ompi_mpi_init_started = true;
|
||||
opal_atomic_wmb();
|
||||
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_STARTED);
|
||||
|
||||
/* Figure out the final MPI thread levels. If we were not
|
||||
compiled for support for MPI threads, then don't allow
|
||||
@ -988,7 +985,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
"mpi_init:startup:internal-failure", true,
|
||||
"MPI_INIT", "MPI_INIT", error, err_msg, ret);
|
||||
}
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
ompi_hook_base_mpi_init_error(argc, argv, requested, provided);
|
||||
OMPI_TIMING_FINALIZE;
|
||||
return ret;
|
||||
@ -1010,8 +1006,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);
|
||||
|
||||
/* All done. Wasn't that simple? */
|
||||
|
||||
ompi_mpi_initialized = true;
|
||||
opal_atomic_wmb();
|
||||
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED);
|
||||
|
||||
/* Finish last measurement, output results
|
||||
* and clear timing structure */
|
||||
@ -1019,8 +1015,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
OMPI_TIMING_OUT;
|
||||
OMPI_TIMING_FINALIZE;
|
||||
|
||||
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
|
||||
|
||||
ompi_hook_base_mpi_init_bottom(argc, argv, requested, provided);
|
||||
|
||||
return MPI_SUCCESS;
|
||||
|
@ -3,6 +3,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -75,8 +76,14 @@ int oshmem_shmem_finalize(void)
|
||||
}
|
||||
}
|
||||
|
||||
if ((OSHMEM_SUCCESS == ret) && ompi_mpi_initialized
|
||||
&& !ompi_mpi_finalized && oshmem_shmem_globalexit_status == 0) {
|
||||
/* Note: ompi_mpi_state is set atomically in ompi_mpi_init() and
|
||||
ompi_mpi_finalize(). Those 2 functions have the appropriate
|
||||
memory barriers such that we don't need one here. */
|
||||
int32_t state = ompi_mpi_state;
|
||||
if ((OSHMEM_SUCCESS == ret) &&
|
||||
(state >= OMPI_MPI_STATE_INIT_COMPLETED &&
|
||||
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) &&
|
||||
oshmem_shmem_globalexit_status == 0) {
|
||||
PMPI_Comm_free(&oshmem_comm_world);
|
||||
ret = ompi_mpi_finalize();
|
||||
}
|
||||
|
@ -3,7 +3,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015-2016 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -147,7 +147,7 @@ int oshmem_shmem_init(int argc, char **argv, int requested, int *provided)
|
||||
OMPI_TIMING_INIT(32);
|
||||
|
||||
if (!oshmem_shmem_initialized) {
|
||||
if (!ompi_mpi_initialized && !ompi_mpi_finalized) {
|
||||
if (ompi_mpi_state < OMPI_MPI_STATE_INIT_COMPLETED) {
|
||||
ret = ompi_mpi_init(argc, argv, requested, provided);
|
||||
}
|
||||
OMPI_TIMING_NEXT("ompi_mpi_init");
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user