Fix for a bug Galen noticed yesterday -- make the shared memory only
be allocated the first time a sm coll is selected for a communicator, not before. This commit was SVN r7647.
Этот коммит содержится в:
родитель
1fe18814da
Коммит
b22fab2826
@ -37,7 +37,7 @@
|
||||
#define D(foo)
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#if OMPI_ENABLE_DEBUG
|
||||
#include <sched.h>
|
||||
#define SPIN sched_yield()
|
||||
#else
|
||||
@ -190,6 +190,21 @@ extern "C" {
|
||||
the division once and then just use the value without
|
||||
having to re-calculate. */
|
||||
int sm_segs_per_inuse_flag;
|
||||
|
||||
/** Whether the component's shared memory has been [lazily]
|
||||
initialized or not */
|
||||
bool sm_component_setup;
|
||||
|
||||
/** Once the component has been lazily initialized, keep the
|
||||
state of it around */
|
||||
bool sm_component_setup_success;
|
||||
|
||||
/** A lock protecting the lazy initialzation of the component
|
||||
(SINCE THERE IS NO STATIC INITIALIZER FOR
|
||||
opal_atomic_lock_t, THIS *MUST* BE THE LAST MEMBER OF THE
|
||||
STRUCT!) */
|
||||
opal_atomic_lock_t sm_component_setup_lock;
|
||||
|
||||
};
|
||||
/**
|
||||
* Convenience typedef
|
||||
|
@ -133,7 +133,11 @@ mca_coll_sm_component_t mca_coll_sm_component = {
|
||||
0, /* mpool data size -- filled in below */
|
||||
NULL, /* data mpool pointer */
|
||||
false, /* whether this process created the data mpool */
|
||||
NULL /* pointer to meta data about bootstrap area */
|
||||
NULL, /* pointer to meta data about bootstrap area */
|
||||
false, /* whether the component sm has been [lazily] inited or not */
|
||||
false /* whether lazy init was successful or not */
|
||||
/* the lock for lazy initialization is not initialized here --
|
||||
there is no static initializer for opal_atomic_lock_t */
|
||||
};
|
||||
|
||||
|
||||
|
@ -110,7 +110,6 @@ static const mca_coll_base_module_1_0_0_t module = {
|
||||
int mca_coll_sm_init_query(bool enable_progress_threads,
|
||||
bool enable_mpi_threads)
|
||||
{
|
||||
int ret;
|
||||
#if 0
|
||||
/* JMS: Arrgh. Unfortunately, we don't have this information by
|
||||
the time this is invoked -- the GPR compound command doesn't
|
||||
@ -132,29 +131,12 @@ int mca_coll_sm_init_query(bool enable_progress_threads,
|
||||
free(procs);
|
||||
#endif
|
||||
|
||||
/* Ok, we have local peers. So setup the bootstrap file */
|
||||
/* Don't do much here because we don't really want to allocate any
|
||||
shared memory until this component is selected to be used. */
|
||||
|
||||
if (OMPI_SUCCESS != (ret = bootstrap_init())) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Can we get an mpool allocation? See if there was one created
|
||||
already. If not, try to make one. */
|
||||
|
||||
mca_coll_sm_component.sm_data_mpool =
|
||||
mca_mpool_base_module_lookup(mca_coll_sm_component.sm_mpool_name);
|
||||
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
||||
mca_coll_sm_component.sm_data_mpool =
|
||||
mca_mpool_base_module_create(mca_coll_sm_component.sm_mpool_name,
|
||||
NULL, NULL);
|
||||
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
||||
mca_coll_sm_bootstrap_finalize();
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
mca_coll_sm_component.sm_data_mpool_created = true;
|
||||
} else {
|
||||
mca_coll_sm_component.sm_data_mpool_created = false;
|
||||
}
|
||||
mca_coll_sm_component.sm_data_mpool_created = false;
|
||||
mca_coll_sm_component.sm_component_setup = false;
|
||||
opal_atomic_init(&mca_coll_sm_component.sm_component_setup_lock, 0);
|
||||
|
||||
/* Alles gut */
|
||||
|
||||
@ -171,6 +153,13 @@ const mca_coll_base_module_1_0_0_t *
|
||||
mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority,
|
||||
struct mca_coll_base_comm_t **data)
|
||||
{
|
||||
/* See if someone has previously lazily initialized and failed */
|
||||
|
||||
if (mca_coll_sm_component.sm_component_setup &&
|
||||
!mca_coll_sm_component.sm_component_setup_success) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* If we're intercomm, or if there's only one process in the
|
||||
communicator, or if not all the processes in the communicator
|
||||
are not on this node, then we don't want to run */
|
||||
@ -227,6 +216,51 @@ sm_module_init(struct ompi_communicator_t *comm)
|
||||
char *base;
|
||||
const int num_barrier_buffers = 2;
|
||||
|
||||
/* Once-per-component setup. This may happen at any time --
|
||||
during MPI_INIT or later. So we must protect this with locks
|
||||
to ensure that only one thread in the process actually does
|
||||
this setup. */
|
||||
|
||||
opal_atomic_lock(&mca_coll_sm_component.sm_component_setup_lock);
|
||||
if (!mca_coll_sm_component.sm_component_setup) {
|
||||
mca_coll_sm_component.sm_component_setup = true;
|
||||
|
||||
if (OMPI_SUCCESS != bootstrap_init()) {
|
||||
mca_coll_sm_component.sm_component_setup_success = false;
|
||||
opal_atomic_unlock(&mca_coll_sm_component.sm_component_setup_lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Can we get an mpool allocation? See if there was one created
|
||||
already. If not, try to make one. */
|
||||
|
||||
mca_coll_sm_component.sm_data_mpool =
|
||||
mca_mpool_base_module_lookup(mca_coll_sm_component.sm_mpool_name);
|
||||
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
||||
mca_coll_sm_component.sm_data_mpool =
|
||||
mca_mpool_base_module_create(mca_coll_sm_component.sm_mpool_name,
|
||||
NULL, NULL);
|
||||
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
||||
mca_coll_sm_bootstrap_finalize();
|
||||
mca_coll_sm_component.sm_component_setup_success = false;
|
||||
opal_atomic_unlock(&mca_coll_sm_component.sm_component_setup_lock);
|
||||
return NULL;
|
||||
}
|
||||
mca_coll_sm_component.sm_data_mpool_created = true;
|
||||
} else {
|
||||
mca_coll_sm_component.sm_data_mpool_created = false;
|
||||
}
|
||||
mca_coll_sm_component.sm_component_setup_success = true;
|
||||
}
|
||||
opal_atomic_unlock(&mca_coll_sm_component.sm_component_setup_lock);
|
||||
|
||||
/* Double check to see if some interleaved lazy init failed before
|
||||
we got in here */
|
||||
|
||||
if (!mca_coll_sm_component.sm_component_setup_success) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Get some space to setup memory affinity (just easier to try to
|
||||
alloc here to handle the error case) */
|
||||
|
||||
@ -727,8 +761,9 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
||||
|
||||
/*
|
||||
* This function is not static and has a prefix-rule-enabled name
|
||||
* because it gets called from the component. This is only called
|
||||
* once -- no need for reference counting or thread protection.
|
||||
* because it gets called from the component (but may also be called
|
||||
* from above). This is only called once -- no need for reference
|
||||
* counting or thread protection.
|
||||
*/
|
||||
int mca_coll_sm_bootstrap_finalize(void)
|
||||
{
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user