1
1
openmpi/opal/mca/pmix/pmix2x/pmix2x_component.c

130 строки
4.1 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "opal_config.h"
#include "opal/constants.h"
#include "opal/class/opal_list.h"
#include "opal/util/proc.h"
#include "opal/mca/pmix/pmix.h"
#include "pmix2x.h"
/*
* Public string showing the pmix external component version number
*/
const char *opal_pmix_pmix2x_component_version_string =
"OPAL pmix2x MCA component version " OPAL_VERSION;
/*
* Local function
*/
static int external_register(void);
static int external_open(void);
static int external_close(void);
static int external_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointers to our public functions in it
*/
mca_pmix_pmix2x_component_t mca_pmix_pmix2x_component = {
{
/* First, the mca_component_t struct containing meta information
about the component itself */
.base_version = {
/* Indicate that we are a pmix v1.1.0 component (which also
implies a specific MCA version) */
OPAL_PMIX_BASE_VERSION_2_0_0,
/* Component name and version */
.mca_component_name = "pmix2x",
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
OPAL_RELEASE_VERSION),
/* Component open and close functions */
.mca_open_component = external_open,
.mca_close_component = external_close,
.mca_query_component = external_component_query,
.mca_register_component_params = external_register
},
/* Next the MCA v1.0.0 component meta data */
.base_data = {
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
},
.native_launch = false
};
static int external_register(void)
{
mca_base_component_t *component = &mca_pmix_pmix2x_component.super.base_version;
mca_pmix_pmix2x_component.silence_warning = false;
(void) mca_base_component_var_register (component, "silence_warning",
"Silence warning about PMIX_INSTALL_PREFIX",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_pmix_pmix2x_component.silence_warning);
return OPAL_SUCCESS;
}
static int external_open(void)
{
mca_pmix_pmix2x_component.evindex = 0;
OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.jobids, opal_list_t);
OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.events, opal_list_t);
Implement a background fence that collects all data during modex operation The direct modex operation is slow, especially at scale for even modestly-connected applications. Likewise, blocking in MPI_Init while we wait for a full modex to complete takes too long. However, as George pointed out, there is a middle ground here. We could kickoff the modex operation in the background, and then trap any modex_recv's until the modex completes and the data is delivered. For most non-benchmark apps, this may prove to be the best of the available options as they are likely to perform other (non-communicating) setup operations after MPI_Init, and so there is a reasonable chance that the modex will actually be done before the first modex_recv gets called. Once we get instant-on-enabled hardware, this won't be necessary. Clearly, zero time will always out-perform the time spent doing a modex. However, this provides a decent compromise in the interim. This PR changes the default settings of a few relevant params to make "background modex" the default behavior: * pmix_base_async_modex -> defaults to true * pmix_base_collect_data -> continues to default to true (no change) * async_mpi_init - defaults to true. Note that the prior code attempted to base the default setting of this value on the setting of pmix_base_async_modex. Unfortunately, the pmix value isn't set prior to setting async_mpi_init, and so that attempt failed to accomplish anything. The logic in MPI_Init is: * if async_modex AND collect_data are set, AND we have a non-blocking fence available, then we execute the background modex operation * if async_modex is set, but collect_data is false, then we simply skip the modex entirely - no fence is performed * if async_modex is not set, then we block until the fence completes (regardless of collecting data or not) * if we do NOT have a non-blocking fence (e.g., we are not using PMIx), then we always perform the full blocking modex operation. * if we do perform the background modex, and the user requested the barrier be performed at the end of MPI_Init, then we check to see if the modex has completed when we reach that point. If it has, then we execute the barrier. However, if the modex has NOT completed, then we block until the modex does complete and skip the extra barrier. So we never perform two barriers in that case. HTH Ralph Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-04-21 10:29:23 -07:00
OBJ_CONSTRUCT(&mca_pmix_pmix2x_component.dmdx, opal_list_t);
return OPAL_SUCCESS;
}
static int external_close(void)
{
OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.jobids);
OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.events);
Implement a background fence that collects all data during modex operation The direct modex operation is slow, especially at scale for even modestly-connected applications. Likewise, blocking in MPI_Init while we wait for a full modex to complete takes too long. However, as George pointed out, there is a middle ground here. We could kickoff the modex operation in the background, and then trap any modex_recv's until the modex completes and the data is delivered. For most non-benchmark apps, this may prove to be the best of the available options as they are likely to perform other (non-communicating) setup operations after MPI_Init, and so there is a reasonable chance that the modex will actually be done before the first modex_recv gets called. Once we get instant-on-enabled hardware, this won't be necessary. Clearly, zero time will always out-perform the time spent doing a modex. However, this provides a decent compromise in the interim. This PR changes the default settings of a few relevant params to make "background modex" the default behavior: * pmix_base_async_modex -> defaults to true * pmix_base_collect_data -> continues to default to true (no change) * async_mpi_init - defaults to true. Note that the prior code attempted to base the default setting of this value on the setting of pmix_base_async_modex. Unfortunately, the pmix value isn't set prior to setting async_mpi_init, and so that attempt failed to accomplish anything. The logic in MPI_Init is: * if async_modex AND collect_data are set, AND we have a non-blocking fence available, then we execute the background modex operation * if async_modex is set, but collect_data is false, then we simply skip the modex entirely - no fence is performed * if async_modex is not set, then we block until the fence completes (regardless of collecting data or not) * if we do NOT have a non-blocking fence (e.g., we are not using PMIx), then we always perform the full blocking modex operation. * if we do perform the background modex, and the user requested the barrier be performed at the end of MPI_Init, then we check to see if the modex has completed when we reach that point. If it has, then we execute the barrier. However, if the modex has NOT completed, then we block until the modex does complete and skip the extra barrier. So we never perform two barriers in that case. HTH Ralph Signed-off-by: Ralph Castain <rhc@open-mpi.org>
2017-04-21 10:29:23 -07:00
OPAL_LIST_DESTRUCT(&mca_pmix_pmix2x_component.dmdx);
return OPAL_SUCCESS;
}
static int external_component_query(mca_base_module_t **module, int *priority)
{
char *t, *id;
/* see if a PMIx server is present */
if (NULL != (t = getenv("PMIX_SERVER_URI")) ||
NULL != (id = getenv("PMIX_ID"))) {
/* if PMIx is present, then we are a client and need to use it */
*priority = 100;
} else {
/* we could be a server, so we still need to be considered */
*priority = 5;
}
*module = (mca_base_module_t *)&opal_pmix_pmix2x_module;
return OPAL_SUCCESS;
}