Update MPI init to properly skip barriers
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
3b71be4db4
Коммит
0353be9704
@ -59,7 +59,7 @@
|
||||
#include "opal/mca/rcache/rcache.h"
|
||||
#include "opal/mca/mpool/base/base.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
#include "opal/util/timings.h"
|
||||
#include "opal/util/opal_environ.h"
|
||||
|
||||
@ -366,8 +366,8 @@ static int ompi_register_mca_variables(void)
|
||||
|
||||
static void fence_release(int status, void *cbdata)
|
||||
{
|
||||
volatile bool *active = (volatile bool*)cbdata;
|
||||
*active = false;
|
||||
opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata;
|
||||
OPAL_PMIX_WAKEUP_THREAD(lock);
|
||||
}
|
||||
|
||||
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
@ -377,9 +377,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
size_t nprocs;
|
||||
char *error = NULL;
|
||||
ompi_errhandler_errtrk_t errtrk;
|
||||
volatile bool active;
|
||||
opal_list_t info;
|
||||
opal_value_t *kv;
|
||||
opal_pmix_lock_t lock;
|
||||
bool background_fence = false;
|
||||
|
||||
OMPI_TIMING_INIT(32);
|
||||
|
||||
@ -682,24 +683,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) {
|
||||
/* execute the fence_nb in the background to collect
|
||||
* the data */
|
||||
if (!ompi_async_mpi_init) {
|
||||
/* we are going to execute a barrier at the
|
||||
* end of MPI_Init. We can only have ONE fence
|
||||
* operation with the identical involved procs
|
||||
* at a time, so we will need to wait when we
|
||||
* get there */
|
||||
active = true;
|
||||
opal_pmix.fence_nb(NULL, true, fence_release, (void*)&active);
|
||||
} else {
|
||||
opal_pmix.fence_nb(NULL, true, NULL, NULL);
|
||||
}
|
||||
background_fence = true;
|
||||
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
|
||||
opal_pmix.fence_nb(NULL, true, fence_release, (void*)&lock);
|
||||
} else if (!opal_pmix_base_async_modex) {
|
||||
active = true;
|
||||
/* we want to do the modex */
|
||||
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
|
||||
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
|
||||
fence_release, (void*)&active);
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||
fence_release, (void*)&lock);
|
||||
/* cannot just wait on thread as we need to call opal_progress */
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(lock.active);
|
||||
OPAL_PMIX_DESTRUCT_LOCK(&lock);
|
||||
}
|
||||
} else {
|
||||
/* otherwise, we don't want to do the modex, so fall thru */
|
||||
} else if (!opal_pmix_base_async_modex || opal_pmix_collect_all_data) {
|
||||
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
|
||||
}
|
||||
|
||||
@ -866,24 +863,24 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
/* Next timing measurement */
|
||||
OMPI_TIMING_NEXT("modex-barrier");
|
||||
|
||||
/* wait for everyone to reach this point - this is a hard
|
||||
* barrier requirement at this time, though we hope to relax
|
||||
* it at a later point */
|
||||
if (!ompi_async_mpi_init) {
|
||||
/* if we executed the above fence in the background, then
|
||||
* we have to wait here for it to complete. However, there
|
||||
* is no reason to do two barriers! */
|
||||
if (opal_pmix_base_async_modex && opal_pmix_collect_all_data) {
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||
/* if we executed the above fence in the background, then
|
||||
* we have to wait here for it to complete. However, there
|
||||
* is no reason to do two barriers! */
|
||||
if (background_fence) {
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(lock.active);
|
||||
OPAL_PMIX_DESTRUCT_LOCK(&lock);
|
||||
} else if (!ompi_async_mpi_init) {
|
||||
/* wait for everyone to reach this point - this is a hard
|
||||
* barrier requirement at this time, though we hope to relax
|
||||
* it at a later point */
|
||||
if (NULL != opal_pmix.fence_nb) {
|
||||
OPAL_PMIX_CONSTRUCT_LOCK(&lock);
|
||||
opal_pmix.fence_nb(NULL, false,
|
||||
fence_release, (void*)&lock);
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(lock.active);
|
||||
OPAL_PMIX_DESTRUCT_LOCK(&lock);
|
||||
} else {
|
||||
active = true;
|
||||
if (NULL != opal_pmix.fence_nb) {
|
||||
opal_pmix.fence_nb(NULL, false,
|
||||
fence_release, (void*)&active);
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||
} else {
|
||||
opal_pmix.fence(NULL, false);
|
||||
}
|
||||
opal_pmix.fence(NULL, false);
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user