1
1
openmpi/opal/mca/pmix/base/pmix_base_frame.c
Ralph Castain 9fc3079ac2 Implement a background fence that collects all data during modex operation
The direct modex operation is slow, especially at scale for even modestly-connected applications. Likewise, blocking in MPI_Init while we wait for a full modex to complete takes too long. However, as George pointed out, there is a middle ground here. We could kick off the modex operation in the background, and then trap any modex_recv calls until the modex completes and the data is delivered. For most non-benchmark apps, this may prove to be the best of the available options as they are likely to perform other (non-communicating) setup operations after MPI_Init, and so there is a reasonable chance that the modex will actually be done before the first modex_recv gets called.

Once we get instant-on-enabled hardware, this won't be necessary. Clearly, zero time will always outperform the time spent doing a modex. However, this provides a decent compromise in the interim.

This PR changes the default settings of a few relevant params to make "background modex" the default behavior:

* pmix_base_async_modex -> defaults to true

* pmix_base_collect_data -> continues to default to true (no change)

* async_mpi_init -> defaults to true. Note that the prior code attempted to base the default setting of this value on the setting of pmix_base_async_modex. Unfortunately, the pmix value isn't set prior to setting async_mpi_init, and so that attempt failed to accomplish anything.

The logic in MPI_Init is:

* if async_modex AND collect_data are set, AND we have a non-blocking fence available, then we execute the background modex operation

* if async_modex is set, but collect_data is false, then we simply skip the modex entirely - no fence is performed

* if async_modex is not set, then we block until the fence completes (regardless of collecting data or not)

* if we do NOT have a non-blocking fence (e.g., we are not using PMIx), then we always perform the full blocking modex operation.

* if we do perform the background modex, and the user requested the barrier be performed at the end of MPI_Init, then we check to see if the modex has completed when we reach that point. If it has, then we execute the barrier. However, if the modex has NOT completed, then we block until the modex does complete and skip the extra barrier. So we never perform two barriers in that case.

HTH
Ralph

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-04-21 10:29:23 -07:00

158 строки
4.4 KiB
C

/*
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/mca/mca.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/mca/base/base.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/pmix/base/base.h"
/*
* The following file was created by configure. It contains extern
* components and the definition of an array of pointers to each
* module's public mca_base_module_t struct.
*/
#include "opal/mca/pmix/base/static-components.h"
/* Global PMIx module table. The frame open and close callbacks in this file
 * memset it to zero so its function pointers are NULL.
 *
 * Note that this initializer is important -- do not remove it! See
https://github.com/open-mpi/ompi/issues/375 for details. */
opal_pmix_base_module_t opal_pmix = { 0 };
/* Backing storage for the "collect_data" MCA variable
 * ("Collect all data during modex"); set to true again at registration. */
bool opal_pmix_collect_all_data = true;
/* Stays -1 until the frame open callback copies
 * opal_pmix_base_framework.framework_output into it. */
int opal_pmix_verbose_output = -1;
/* Backing storage for the "async_modex" MCA variable
 * ("Use asynchronous modex mode").
 * NOTE(review): the static initializer is false, but the register callback
 * assigns true just before registering the variable, so the value read here
 * differs before and after framework registration -- confirm no caller
 * depends on the pre-registration value. */
bool opal_pmix_base_async_modex = false;
/* Base framework state, zero-initialized. */
opal_pmix_base_t opal_pmix_base = {0};
static int opal_pmix_base_frame_register(mca_base_register_flag_t flags)
{
opal_pmix_base_async_modex = true;
(void) mca_base_var_register("opal", "pmix", "base", "async_modex", "Use asynchronous modex mode",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_base_async_modex);
opal_pmix_collect_all_data = true;
(void) mca_base_var_register("opal", "pmix", "base", "collect_data", "Collect all data during modex",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &opal_pmix_collect_all_data);
return OPAL_SUCCESS;
}
/*
 * Close the pmix framework: close its components, then zero the global
 * opal_pmix module table so all of its function pointers read as NULL.
 *
 * @return the status reported by mca_base_framework_components_close()
 */
static int opal_pmix_base_frame_close(void)
{
    const int status =
        mca_base_framework_components_close(&opal_pmix_base_framework, NULL);

    /* wipe the module table regardless of the close status */
    memset(&opal_pmix, 0, sizeof opal_pmix);

    return status;
}
/*
 * Open the pmix framework: open every available component, publish the
 * framework's output stream id through opal_pmix_verbose_output, and zero
 * the global opal_pmix module table so its function pointers start as NULL.
 *
 * @param flags  open flags forwarded to mca_base_framework_components_open()
 * @return the status reported by mca_base_framework_components_open()
 */
static int opal_pmix_base_frame_open(mca_base_open_flag_t flags)
{
    const int status =
        mca_base_framework_components_open(&opal_pmix_base_framework, flags);

    /* expose the framework verbosity stream to the rest of the code */
    opal_pmix_verbose_output = opal_pmix_base_framework.framework_output;

    /* the module table is cleared even if opening the components failed */
    memset(&opal_pmix, 0, sizeof opal_pmix);

    return status;
}
/*
 * Declare the opal "pmix" framework to the MCA base, supplying the
 * register/open/close callbacks defined in this file and the component
 * array mca_pmix_base_static_components.
 * NOTE(review): this macro is presumably what defines the
 * opal_pmix_base_framework object used by the open/close callbacks, and the
 * component array presumably comes from the configure-generated
 * static-components.h -- confirm against the macro and header definitions.
 */
MCA_BASE_FRAMEWORK_DECLARE(opal, pmix, "OPAL PMI Client Framework",
opal_pmix_base_frame_register,
opal_pmix_base_frame_open,
opal_pmix_base_frame_close,
mca_pmix_base_static_components, 0);
/**** PMIX FRAMEWORK OBJECTS ****/
/*
 * opal_pmix_pdata_t: a list item holding a proc identifier (jobid/vpid)
 * and an embedded opal_value_t.
 */

/* Constructor: build the embedded value and mark the proc as invalid. */
static void lkcon(opal_pmix_pdata_t *pdata)
{
    OBJ_CONSTRUCT(&pdata->value, opal_value_t);
    pdata->proc.vpid = OPAL_VPID_INVALID;
    pdata->proc.jobid = OPAL_JOBID_INVALID;
}

/* Destructor: tear down the value that lkcon() constructed. */
static void lkdes(opal_pmix_pdata_t *pdata)
{
    OBJ_DESTRUCT(&pdata->value);
}

OBJ_CLASS_INSTANCE(opal_pmix_pdata_t, opal_list_item_t, lkcon, lkdes);
/*
 * opal_pmix_modex_data_t: a list item holding a proc identifier
 * (jobid/vpid) plus a blob pointer and its size.
 */

/* Constructor: invalid proc, no blob, zero size. */
static void mdcon(opal_pmix_modex_data_t *md)
{
    md->blob = NULL;
    md->size = 0;
    md->proc.vpid = OPAL_VPID_INVALID;
    md->proc.jobid = OPAL_JOBID_INVALID;
}

/* Destructor: release the blob. free(NULL) is a no-op, so a blob that was
 * never set needs no special handling. */
static void mddes(opal_pmix_modex_data_t *md)
{
    free(md->blob);
}

OBJ_CLASS_INSTANCE(opal_pmix_modex_data_t, opal_list_item_t, mdcon, mddes);
/*
 * opal_pmix_app_t: a list item holding cmd, argv, env, cwd, maxprocs and
 * an embedded opal_list_t named info.
 */

/* Constructor: no strings or vectors set, zero maxprocs, empty info list. */
static void apcon(opal_pmix_app_t *app)
{
    app->maxprocs = 0;
    app->cmd = NULL;
    app->cwd = NULL;
    app->argv = NULL;
    app->env = NULL;
    OBJ_CONSTRUCT(&app->info, opal_list_t);
}

/* Destructor: release everything the object holds. */
static void apdes(opal_pmix_app_t *app)
{
    /* plain strings: free(NULL) is a no-op, so no guard is required */
    free(app->cmd);
    free(app->cwd);

    /* argv-style vectors are only handed to opal_argv_free() when set */
    if (NULL != app->argv) {
        opal_argv_free(app->argv);
    }
    if (NULL != app->env) {
        opal_argv_free(app->env);
    }

    OPAL_LIST_DESTRUCT(&app->info);
}

OBJ_CLASS_INSTANCE(opal_pmix_app_t, opal_list_item_t, apcon, apdes);
/*
 * opal_pmix_query_t: a list item holding a keys vector and an embedded
 * opal_list_t named qualifiers.
 */

/* Constructor: empty qualifiers list and no keys. */
static void qcon(opal_pmix_query_t *query)
{
    OBJ_CONSTRUCT(&query->qualifiers, opal_list_t);
    query->keys = NULL;
}

/* Destructor: free the keys vector when one was set, then the qualifiers. */
static void qdes(opal_pmix_query_t *query)
{
    if (NULL != query->keys) {
        opal_argv_free(query->keys);
    }
    OPAL_LIST_DESTRUCT(&query->qualifiers);
}

OBJ_CLASS_INSTANCE(opal_pmix_query_t, opal_list_item_t, qcon, qdes);