When directly launching applications, we must allow the MPI layer to progress during RTE-level barriers. Neither SLURM nor Cray provides a non-blocking fence function, so push those calls into a separate event thread (use the OPAL async thread for this purpose so we don't create another one) and let the MPI thread spin in wait_for_completion. This also restores the "lazy" completion during MPI_Finalize to minimize CPU utilization.
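As an illustration of the thread-shift pattern described above, here is a minimal, self-contained sketch. It uses libevent directly rather than the OPAL async thread, and a simulated blocking barrier in place of the PMI/PMI2 calls; all names (fence_op_t, fence_complete, progress of the flag) are illustrative, not Open MPI APIs. The blocking fence is queued as an event, a helper thread services the event base, and the caller waits on a completion flag that the callback releases.

/* Hedged sketch: thread-shift a blocking fence onto a libevent base serviced
 * by a helper thread, while the caller lazily waits on a completion flag.
 * The sleep() below stands in for PMI_Barrier()/PMI2_KVS_Fence(). */
#include <event2/event.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

typedef void (*op_cbfunc_t)(int status, void *cbdata);

typedef struct {
    struct event ev;        /* event used to hand the op to the event thread */
    op_cbfunc_t  cbfunc;    /* callback fired when the fence completes */
    void        *cbdata;
} fence_op_t;

static volatile bool active = true;   /* caller-side completion flag */

/* Mirrors the role of fence_release()/fence_cbfunc in the diff below. */
static void fence_complete(int status, void *cbdata)
{
    (void)status;
    *(volatile bool *)cbdata = false;
}

/* Runs in the event thread: perform the blocking barrier, then call back. */
static void fencenb(evutil_socket_t sd, short args, void *cbdata)
{
    fence_op_t *op = (fence_op_t *)cbdata;
    (void)sd; (void)args;
    sleep(1);                         /* stand-in for the blocking PMI fence */
    if (NULL != op->cbfunc) {
        op->cbfunc(0, op->cbdata);
    }
}

static void *event_thread(void *arg)
{
    /* Runs the queued fence op, then returns once the base is empty. */
    event_base_dispatch((struct event_base *)arg);
    return NULL;
}

int main(void)
{
    struct event_base *base = event_base_new();
    pthread_t tid;
    fence_op_t op = { .cbfunc = fence_complete, .cbdata = (void *)&active };

    /* Thread-shift: queue the op on the base, then let a helper thread run it. */
    event_assign(&op.ev, base, -1, EV_WRITE, fencenb, &op);
    event_active(&op.ev, EV_WRITE, 1);
    pthread_create(&tid, NULL, event_thread, base);

    /* Lazy wait: nap between checks instead of spinning at 100% CPU. */
    while (active) {
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
        nanosleep(&ts, NULL);
    }
    printf("fence released\n");

    pthread_join(tid, NULL);
    event_base_free(base);
    return 0;
}

The design point is that the blocking barrier never runs on the MPI thread; that thread stays free to drive its own progress engine, or simply to sleep, until the callback flips the flag.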
Update the external component as well. Revise the change: we still need the MPI_Barrier in MPI_Finalize when we use a blocking fence, but do use the "lazy" wait for completion. Replace the direct logic in MPI_Init with a cleaner macro.
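For the "lazy" wait mentioned above, a rough sketch of the difference between the two wait styles — a busy loop that spins on the progress engine versus one that naps between checks — might look like the following. The macro bodies and progress_stub() are assumptions for illustration only, not the actual OMPI_WAIT_FOR_COMPLETION / OMPI_LAZY_WAIT_FOR_COMPLETION definitions.

/* Hedged sketch of busy vs. lazy completion waits. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Busy variant: hammer the progress engine until the flag drops. */
#define WAIT_FOR_COMPLETION(flg)        \
    do {                                \
        while ((flg)) {                 \
            progress_stub();            \
        }                               \
    } while (0)

/* Lazy variant: still drive progress, but sleep ~1ms between checks. */
#define LAZY_WAIT_FOR_COMPLETION(flg)                                    \
    do {                                                                 \
        while ((flg)) {                                                  \
            progress_stub();                                             \
            struct timespec _ts = { .tv_sec = 0, .tv_nsec = 1000000 };   \
            nanosleep(&_ts, NULL);                                       \
        }                                                                \
    } while (0)

static volatile bool active = true;

/* Stand-in for opal_progress(): pretend the fence completes after 50 calls. */
static void progress_stub(void)
{
    static int calls = 0;
    if (++calls == 50) {
        active = false;   /* what the fence callback would do */
    }
}

int main(void)
{
    LAZY_WAIT_FOR_COMPLETION(active);
    printf("completed after lazy wait\n");
    return 0;
}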
parent 7f65c2b18e
commit 01ba861f2a
@@ -16,7 +16,7 @@
  * Copyright (c) 2006 University of Houston. All rights reserved.
  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2011 Sandia National Laboratories. All rights reserved.
- * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
  *
  * $COPYRIGHT$
  *
@@ -248,19 +248,20 @@ int ompi_mpi_finalize(void)
        more details). */
     if (NULL != opal_pmix.fence_nb) {
         active = true;
-        /* Note that the non-blocking PMIx fence will cycle calling
-           opal_progress(), which will allow any other pending
-           communications/actions to complete. See
-           https://github.com/open-mpi/ompi/issues/1576 for the
-           original bug report. */
+        /* Note that use of the non-blocking PMIx fence will
+         * allow us to lazily cycle calling
+         * opal_progress(), which will allow any other pending
+         * communications/actions to complete. See
+         * https://github.com/open-mpi/ompi/issues/1576 for the
+         * original bug report. */
         opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
-        OMPI_WAIT_FOR_COMPLETION(active);
+        OMPI_LAZY_WAIT_FOR_COMPLETION(active);
     } else {
         /* However, we cannot guarantee that the provided PMIx has
-           fence_nb. If it doesn't, then do the best we can: an MPI
-           barrier on COMM_WORLD (which isn't the best because of the
-           reasons cited above), followed by a blocking PMIx fence
-           (which may not necessarily call opal_progress()). */
+         * fence_nb. If it doesn't, then do the best we can: an MPI
+         * barrier on COMM_WORLD (which isn't the best because of the
+         * reasons cited above), followed by a blocking PMIx fence
+         * (which does not call opal_progress()). */
         ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
         comm->c_coll.coll_barrier(comm, comm->c_coll.coll_barrier_module);
 
@@ -362,6 +362,12 @@ static int ompi_register_mca_variables(void)
     return OMPI_SUCCESS;
 }
 
+static void fence_release(int status, void *cbdata)
+{
+    volatile bool *active = (volatile bool*)cbdata;
+    *active = false;
+}
+
 int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
 {
     int ret;
@@ -370,6 +376,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     char *error = NULL;
     char *cmd=NULL, *av=NULL;
     ompi_errhandler_errtrk_t errtrk;
+    volatile bool active;
     OPAL_TIMING_DECLARE(tm);
     OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);
 
@@ -634,7 +641,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
      * if data exchange is required. The modex occurs solely across procs
      * in our job. If a barrier is required, the "modex" function will
      * perform it internally */
-    OPAL_MODEX();
+    active = true;
+    opal_pmix.commit();
+    if (!opal_pmix_base_async_modex) {
+        if (NULL != opal_pmix.fence_nb) {
+            opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
+                               fence_release, (void*)&active);
+            OMPI_WAIT_FOR_COMPLETION(active);
+        } else {
+            opal_pmix.fence(NULL, opal_pmix_collect_all_data);
+        }
+    }
 
     OPAL_TIMING_MNEXT((&tm,"time from modex to first barrier"));
 
@@ -802,7 +819,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     /* wait for everyone to reach this point - this is a hard
      * barrier requirement at this time, though we hope to relax
      * it at a later point */
-    opal_pmix.fence(NULL, 0);
+    active = true;
+    opal_pmix.commit();
+    if (NULL != opal_pmix.fence_nb) {
+        opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
+                           fence_release, (void*)&active);
+        OMPI_WAIT_FOR_COMPLETION(active);
+    } else {
+        opal_pmix.fence(NULL, opal_pmix_collect_all_data);
+    }
 
     /* check for timing request - get stop time and report elapsed
        time if so, then start the clock again */
@@ -839,8 +864,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
        e.g. hierarch, might create subcommunicators. The threadlevel
        requested by all processes is required in order to know
        which cid allocation algorithm can be used. */
-    if ( OMPI_SUCCESS !=
-         ( ret = ompi_comm_cid_init ())) {
+    if (OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) {
        error = "ompi_mpi_init: ompi_comm_cid_init failed";
        goto error;
    }
 
@@ -57,7 +57,8 @@ static int cray_resolve_peers(const char *nodename,
                               opal_list_t *procs);
 static int cray_resolve_nodes(opal_jobid_t jobid, char **nodelist);
 static int cray_put(opal_pmix_scope_t scope, opal_value_t *kv);
-static int cray_fence(opal_list_t *procs, int collect_data);
+static int cray_fencenb(opal_list_t *procs, int collect_data,
+                        opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
 static int cray_commit(void);
 static int cray_get(const opal_process_name_t *id,
                     const char *key, opal_list_t *info,
@@ -90,8 +91,8 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
     .initialized = cray_initialized,
     .abort = cray_abort,
     .commit = cray_commit,
-    .fence = cray_fence,
-    .fence_nb = NULL,
+    .fence = NULL,
+    .fence_nb = cray_fencenb,
     .put = cray_put,
     .get = cray_get,
     .get_nb = cray_get_nb,
@@ -119,6 +120,17 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
 // usage accounting
 static int pmix_init_count = 0;
 
+// local object
+typedef struct {
+    opal_object_t super;
+    opal_event_t ev;
+    opal_pmix_op_cbfunc_t opcbfunc;
+    void *cbdata;
+} pmi_opcaddy_t;
+OBJ_CLASS_INSTANCE(pmi_opcaddy_t,
+                   opal_object_t,
+                   NULL, NULL);
+
 // PMI constant values:
 static int pmix_kvslen_max = 0;
 static int pmix_keylen_max = 0;
@@ -512,8 +524,9 @@ static int cray_commit(void)
     return OPAL_SUCCESS;
 }
 
-static int cray_fence(opal_list_t *procs, int collect_data)
+static void fencenb(int sd, short args, void *cbdata)
 {
+    pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
     int rc, cnt;
     int32_t i;
     int *all_lens = NULL;
@@ -550,7 +563,8 @@ static int cray_fence(opal_list_t *procs, int collect_data)
 
     send_buffer = OBJ_NEW(opal_buffer_t);
     if (NULL == send_buffer) {
-        return OPAL_ERR_OUT_OF_RESOURCE;
+        rc = OPAL_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
     }
 
     opal_dss.copy_payload(send_buffer, mca_pmix_cray_component.cache_global);
@@ -732,7 +746,27 @@ fn_exit:
     if (r_bytes_and_ranks != NULL) {
         free(r_bytes_and_ranks);
     }
-    return rc;
+    if (NULL != op->opcbfunc) {
+        op->opcbfunc(rc, op->cbdata);
+    }
+    OBJ_RELEASE(op);
+    return;
+}
+
+static int cray_fencenb(opal_list_t *procs, int collect_data,
+                        opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
+{
+    pmi_opcaddy_t *op;
+
+    /* thread-shift this so we don't block in Cray's barrier */
+    op = OBJ_NEW(pmi_opcaddy_t);
+    op->opcbfunc = cbfunc;
+    op->cbdata = cbdata;
+    event_assign(&op->ev, opal_pmix_base.evbase, -1,
+                 EV_WRITE, fencenb, op);
+    event_active(&op->ev, EV_WRITE, 1);
+
+    return OPAL_SUCCESS;
 }
 
 static int cray_get(const opal_process_name_t *id, const char *key, opal_list_t *info, opal_value_t **kv)
|
2
opal/mca/pmix/external/pmix_ext_client.c
поставляемый
2
opal/mca/pmix/external/pmix_ext_client.c
поставляемый
@@ -369,6 +369,8 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
     if (collect_data) {
         PMIX_INFO_CONSTRUCT(&info);
         (void)strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
+        info.value.type = PMIX_BOOL;
+        info.value.data.flag = true;
         iptr = &info;
         n = 1;
     } else {
@@ -250,21 +250,6 @@ extern int opal_pmix_base_exchange(opal_value_t *info,
         }                                                   \
     } while(0);
 
-
-/**
- * Provide a simplified macro for calling the fence function
- * that takes into account directives and availability of
- * non-blocking operations
- */
-#define OPAL_MODEX()                                        \
-    do {                                                    \
-        opal_pmix.commit();                                 \
-        if (!opal_pmix_base_async_modex) {                  \
-            opal_pmix.fence(NULL,                           \
-                            opal_pmix_collect_all_data);    \
-        }                                                   \
-    } while(0);
-
 /**
  * Provide a macro for accessing a base function that exchanges
  * data values between two procs using the PMIx Publish/Lookup
@@ -364,6 +364,8 @@ int pmix1_fencenb(opal_list_t *procs, int collect_data,
     if (collect_data) {
         PMIX_INFO_CONSTRUCT(&info);
         (void)strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
+        info.value.type = PMIX_BOOL;
+        info.value.data.flag = true;
         iptr = &info;
         n = 1;
     } else {
@@ -36,7 +36,8 @@ static int s1_initialized(void);
 static int s1_abort(int flag, const char msg[],
                     opal_list_t *procs);
 static int s1_commit(void);
-static int s1_fence(opal_list_t *procs, int collect_data);
+static int s1_fencenb(opal_list_t *procs, int collect_data,
+                      opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
 static int s1_put(opal_pmix_scope_t scope,
                   opal_value_t *kv);
 static int s1_get(const opal_process_name_t *id,
@@ -59,7 +60,7 @@ const opal_pmix_base_module_t opal_pmix_s1_module = {
     .initialized = s1_initialized,
     .abort = s1_abort,
     .commit = s1_commit,
-    .fence = s1_fence,
+    .fence_nb = s1_fencenb,
     .put = s1_put,
     .get = s1_get,
     .publish = s1_publish,
@@ -78,6 +79,17 @@ const opal_pmix_base_module_t opal_pmix_s1_module = {
 // usage accounting
 static int pmix_init_count = 0;
 
+// local object
+typedef struct {
+    opal_object_t super;
+    opal_event_t ev;
+    opal_pmix_op_cbfunc_t opcbfunc;
+    void *cbdata;
+} pmi_opcaddy_t;
+OBJ_CLASS_INSTANCE(pmi_opcaddy_t,
+                   opal_object_t,
+                   NULL, NULL);
+
 // PMI constant values:
 static int pmix_kvslen_max = 0;
 static int pmix_keylen_max = 0;
@@ -512,8 +524,9 @@ static int s1_commit(void)
     return OPAL_SUCCESS;
 }
 
-static int s1_fence(opal_list_t *procs, int collect_data)
+static void fencenb(int sd, short args, void *cbdata)
 {
+    pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
     int rc;
     int32_t i;
     opal_value_t *kp, kvn;
@@ -527,7 +540,8 @@ static int s1_fence(opal_list_t *procs, int collect_data)
     /* use the PMI barrier function */
     if (PMI_SUCCESS != (rc = PMI_Barrier())) {
         OPAL_PMI_ERROR(rc, "PMI_Barrier");
-        return OPAL_ERROR;
+        rc = OPAL_ERROR;
+        goto cleanup;
     }
 
     opal_output_verbose(2, opal_pmix_base_framework.framework_output,
@@ -548,7 +562,7 @@ static int s1_fence(opal_list_t *procs, int collect_data)
                                &kp, pmix_kvs_name, pmix_vallen_max, kvs_get);
         if (OPAL_SUCCESS != rc) {
             OPAL_ERROR_LOG(rc);
-            return rc;
+            goto cleanup;
         }
         if (NULL == kp || NULL == kp->data.string) {
             /* if we share a node, but we don't know anything more, then
@@ -579,6 +593,27 @@ static int s1_fence(opal_list_t *procs, int collect_data)
         }
     }
 
+cleanup:
+    if (NULL != op->opcbfunc) {
+        op->opcbfunc(rc, op->cbdata);
+    }
+    OBJ_RELEASE(op);
+    return;
+}
+
+static int s1_fencenb(opal_list_t *procs, int collect_data,
+                      opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
+{
+    pmi_opcaddy_t *op;
+
+    /* thread-shift this so we don't block in SLURM's barrier */
+    op = OBJ_NEW(pmi_opcaddy_t);
+    op->opcbfunc = cbfunc;
+    op->cbdata = cbdata;
+    event_assign(&op->ev, opal_pmix_base.evbase, -1,
+                 EV_WRITE, fencenb, op);
+    event_active(&op->ev, EV_WRITE, 1);
+
+    return OPAL_SUCCESS;
+}
+
 
@@ -43,7 +43,8 @@ static int s2_initialized(void);
 static int s2_abort(int flag, const char msg[],
                     opal_list_t *procs);
 static int s2_commit(void);
-static int s2_fence(opal_list_t *procs, int collect_data);
+static int s2_fencenb(opal_list_t *procs, int collect_data,
+                      opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
 static int s2_put(opal_pmix_scope_t scope,
                   opal_value_t *kv);
 static int s2_get(const opal_process_name_t *id,
@@ -66,7 +67,7 @@ const opal_pmix_base_module_t opal_pmix_s2_module = {
     .initialized = s2_initialized,
     .abort = s2_abort,
     .commit = s2_commit,
-    .fence = s2_fence,
+    .fence_nb = s2_fencenb,
     .put = s2_put,
     .get = s2_get,
     .publish = s2_publish,
@@ -85,6 +86,17 @@ const opal_pmix_base_module_t opal_pmix_s2_module = {
 // usage accounting
 static int pmix_init_count = 0;
 
+// local object
+typedef struct {
+    opal_object_t super;
+    opal_event_t ev;
+    opal_pmix_op_cbfunc_t opcbfunc;
+    void *cbdata;
+} pmi_opcaddy_t;
+OBJ_CLASS_INSTANCE(pmi_opcaddy_t,
+                   opal_object_t,
+                   NULL, NULL);
+
 // PMI constant values:
 static int pmix_kvslen_max = 0;
 static int pmix_keylen_max = 0;
@@ -530,8 +542,9 @@ static int s2_commit(void)
     return OPAL_SUCCESS;
 }
 
-static int s2_fence(opal_list_t *procs, int collect_data)
+static void fencenb(int sd, short args, void *cbdata)
 {
+    pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
     int rc;
     int32_t i;
     opal_value_t *kp, kvn;
@@ -549,7 +562,8 @@ static int s2_fence(opal_list_t *procs, int collect_data)
 
     /* now call fence */
     if (PMI2_SUCCESS != PMI2_KVS_Fence()) {
-        return OPAL_ERROR;
+        rc = OPAL_ERROR;
+        goto cleanup;
     }
 
     /* get the modex data from each local process and set the
@@ -566,7 +580,7 @@ static int s2_fence(opal_list_t *procs, int collect_data)
                                &kp, pmix_kvs_name, pmix_vallen_max, kvs_get);
         if (OPAL_SUCCESS != rc) {
            OPAL_ERROR_LOG(rc);
-            return rc;
+            goto cleanup;
         }
         if (NULL == kp || NULL == kp->data.string) {
             /* if we share a node, but we don't know anything more, then
@@ -597,6 +611,27 @@ static int s2_fence(opal_list_t *procs, int collect_data)
         }
     }
 
+cleanup:
+    if (NULL != op->opcbfunc) {
+        op->opcbfunc(rc, op->cbdata);
+    }
+    OBJ_RELEASE(op);
+    return;
+}
+
+static int s2_fencenb(opal_list_t *procs, int collect_data,
+                      opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
+{
+    pmi_opcaddy_t *op;
+
+    /* thread-shift this so we don't block in SLURM's barrier */
+    op = OBJ_NEW(pmi_opcaddy_t);
+    op->opcbfunc = cbfunc;
+    op->cbdata = cbdata;
+    event_assign(&op->ev, opal_pmix_base.evbase, -1,
+                 EV_WRITE, fencenb, op);
+    event_active(&op->ev, EV_WRITE, 1);
+
+    return OPAL_SUCCESS;
+}
+
 