Fix for a bug Galen noticed yesterday -- allocate the shared memory only
the first time an sm coll is selected for a communicator, not before. This commit was SVN r7647.
Этот коммит содержится в:
родитель
1fe18814da
Коммит
b22fab2826
@ -37,7 +37,7 @@
|
|||||||
#define D(foo)
|
#define D(foo)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if 0
|
#if OMPI_ENABLE_DEBUG
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#define SPIN sched_yield()
|
#define SPIN sched_yield()
|
||||||
#else
|
#else
|
||||||
@ -190,6 +190,21 @@ extern "C" {
|
|||||||
the division once and then just use the value without
|
the division once and then just use the value without
|
||||||
having to re-calculate. */
|
having to re-calculate. */
|
||||||
int sm_segs_per_inuse_flag;
|
int sm_segs_per_inuse_flag;
|
||||||
|
|
||||||
|
/** Whether the component's shared memory has been [lazily]
|
||||||
|
initialized or not */
|
||||||
|
bool sm_component_setup;
|
||||||
|
|
||||||
|
/** Once the component has been lazily initialized, keep the
|
||||||
|
state of it around */
|
||||||
|
bool sm_component_setup_success;
|
||||||
|
|
||||||
|
/** A lock protecting the lazy initialization of the component
|
||||||
|
(SINCE THERE IS NO STATIC INITIALIZER FOR
|
||||||
|
opal_atomic_lock_t, THIS *MUST* BE THE LAST MEMBER OF THE
|
||||||
|
STRUCT!) */
|
||||||
|
opal_atomic_lock_t sm_component_setup_lock;
|
||||||
|
|
||||||
};
|
};
|
||||||
/**
|
/**
|
||||||
* Convenience typedef
|
* Convenience typedef
|
||||||
|
@ -133,7 +133,11 @@ mca_coll_sm_component_t mca_coll_sm_component = {
|
|||||||
0, /* mpool data size -- filled in below */
|
0, /* mpool data size -- filled in below */
|
||||||
NULL, /* data mpool pointer */
|
NULL, /* data mpool pointer */
|
||||||
false, /* whether this process created the data mpool */
|
false, /* whether this process created the data mpool */
|
||||||
NULL /* pointer to meta data about bootstrap area */
|
NULL, /* pointer to meta data about bootstrap area */
|
||||||
|
false, /* whether the component sm has been [lazily] inited or not */
|
||||||
|
false /* whether lazy init was successful or not */
|
||||||
|
/* the lock for lazy initialization is not initialized here --
|
||||||
|
there is no static initializer for opal_atomic_lock_t */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -110,7 +110,6 @@ static const mca_coll_base_module_1_0_0_t module = {
|
|||||||
int mca_coll_sm_init_query(bool enable_progress_threads,
|
int mca_coll_sm_init_query(bool enable_progress_threads,
|
||||||
bool enable_mpi_threads)
|
bool enable_mpi_threads)
|
||||||
{
|
{
|
||||||
int ret;
|
|
||||||
#if 0
|
#if 0
|
||||||
/* JMS: Arrgh. Unfortunately, we don't have this information by
|
/* JMS: Arrgh. Unfortunately, we don't have this information by
|
||||||
the time this is invoked -- the GPR compound command doesn't
|
the time this is invoked -- the GPR compound command doesn't
|
||||||
@ -132,29 +131,12 @@ int mca_coll_sm_init_query(bool enable_progress_threads,
|
|||||||
free(procs);
|
free(procs);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Ok, we have local peers. So setup the bootstrap file */
|
/* Don't do much here because we don't really want to allocate any
|
||||||
|
shared memory until this component is selected to be used. */
|
||||||
|
|
||||||
if (OMPI_SUCCESS != (ret = bootstrap_init())) {
|
mca_coll_sm_component.sm_data_mpool_created = false;
|
||||||
return ret;
|
mca_coll_sm_component.sm_component_setup = false;
|
||||||
}
|
opal_atomic_init(&mca_coll_sm_component.sm_component_setup_lock, 0);
|
||||||
|
|
||||||
/* Can we get an mpool allocation? See if there was one created
|
|
||||||
already. If not, try to make one. */
|
|
||||||
|
|
||||||
mca_coll_sm_component.sm_data_mpool =
|
|
||||||
mca_mpool_base_module_lookup(mca_coll_sm_component.sm_mpool_name);
|
|
||||||
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
|
||||||
mca_coll_sm_component.sm_data_mpool =
|
|
||||||
mca_mpool_base_module_create(mca_coll_sm_component.sm_mpool_name,
|
|
||||||
NULL, NULL);
|
|
||||||
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
|
||||||
mca_coll_sm_bootstrap_finalize();
|
|
||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
||||||
}
|
|
||||||
mca_coll_sm_component.sm_data_mpool_created = true;
|
|
||||||
} else {
|
|
||||||
mca_coll_sm_component.sm_data_mpool_created = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Alles gut */
|
/* Alles gut */
|
||||||
|
|
||||||
@ -171,6 +153,13 @@ const mca_coll_base_module_1_0_0_t *
|
|||||||
mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority,
|
mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority,
|
||||||
struct mca_coll_base_comm_t **data)
|
struct mca_coll_base_comm_t **data)
|
||||||
{
|
{
|
||||||
|
/* See if someone has previously lazily initialized and failed */
|
||||||
|
|
||||||
|
if (mca_coll_sm_component.sm_component_setup &&
|
||||||
|
!mca_coll_sm_component.sm_component_setup_success) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* If we're intercomm, or if there's only one process in the
|
/* If we're intercomm, or if there's only one process in the
|
||||||
communicator, or if not all the processes in the communicator
|
communicator, or if not all the processes in the communicator
|
||||||
are on this node, then we don't want to run */
|
are on this node, then we don't want to run */
|
||||||
@ -227,6 +216,51 @@ sm_module_init(struct ompi_communicator_t *comm)
|
|||||||
char *base;
|
char *base;
|
||||||
const int num_barrier_buffers = 2;
|
const int num_barrier_buffers = 2;
|
||||||
|
|
||||||
|
/* Once-per-component setup. This may happen at any time --
|
||||||
|
during MPI_INIT or later. So we must protect this with locks
|
||||||
|
to ensure that only one thread in the process actually does
|
||||||
|
this setup. */
|
||||||
|
|
||||||
|
opal_atomic_lock(&mca_coll_sm_component.sm_component_setup_lock);
|
||||||
|
if (!mca_coll_sm_component.sm_component_setup) {
|
||||||
|
mca_coll_sm_component.sm_component_setup = true;
|
||||||
|
|
||||||
|
if (OMPI_SUCCESS != bootstrap_init()) {
|
||||||
|
mca_coll_sm_component.sm_component_setup_success = false;
|
||||||
|
opal_atomic_unlock(&mca_coll_sm_component.sm_component_setup_lock);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Can we get an mpool allocation? See if there was one created
|
||||||
|
already. If not, try to make one. */
|
||||||
|
|
||||||
|
mca_coll_sm_component.sm_data_mpool =
|
||||||
|
mca_mpool_base_module_lookup(mca_coll_sm_component.sm_mpool_name);
|
||||||
|
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
||||||
|
mca_coll_sm_component.sm_data_mpool =
|
||||||
|
mca_mpool_base_module_create(mca_coll_sm_component.sm_mpool_name,
|
||||||
|
NULL, NULL);
|
||||||
|
if (NULL == mca_coll_sm_component.sm_data_mpool) {
|
||||||
|
mca_coll_sm_bootstrap_finalize();
|
||||||
|
mca_coll_sm_component.sm_component_setup_success = false;
|
||||||
|
opal_atomic_unlock(&mca_coll_sm_component.sm_component_setup_lock);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
mca_coll_sm_component.sm_data_mpool_created = true;
|
||||||
|
} else {
|
||||||
|
mca_coll_sm_component.sm_data_mpool_created = false;
|
||||||
|
}
|
||||||
|
mca_coll_sm_component.sm_component_setup_success = true;
|
||||||
|
}
|
||||||
|
opal_atomic_unlock(&mca_coll_sm_component.sm_component_setup_lock);
|
||||||
|
|
||||||
|
/* Double check to see if some interleaved lazy init failed before
|
||||||
|
we got in here */
|
||||||
|
|
||||||
|
if (!mca_coll_sm_component.sm_component_setup_success) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* Get some space to setup memory affinity (just easier to try to
|
/* Get some space to setup memory affinity (just easier to try to
|
||||||
alloc here to handle the error case) */
|
alloc here to handle the error case) */
|
||||||
|
|
||||||
@ -727,8 +761,9 @@ static int bootstrap_comm(ompi_communicator_t *comm)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* This function is not static and has a prefix-rule-enabled name
|
* This function is not static and has a prefix-rule-enabled name
|
||||||
* because it gets called from the component. This is only called
|
* because it gets called from the component (but may also be called
|
||||||
* once -- no need for reference counting or thread protection.
|
* from above). This is only called once -- no need for reference
|
||||||
|
* counting or thread protection.
|
||||||
*/
|
*/
|
||||||
int mca_coll_sm_bootstrap_finalize(void)
|
int mca_coll_sm_bootstrap_finalize(void)
|
||||||
{
|
{
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user