1
1
openmpi/ompi/mca/common/sm/common_sm.c

643 строки
22 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/argv.h"
#if OPAL_ENABLE_FT_CR == 1
#include "opal/runtime/opal_cr.h"
#endif
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "ompi/constants.h"
#include "ompi/mca/dpm/dpm.h"
#include "common_sm_rml.h"
#include "common_sm_mmap.h"
#if MCA_COMMON_SM_SYSV
#include "common_sm_sysv.h"
#endif /* MCA_COMMON_SM_SYSV */
#if MCA_COMMON_SM_WINDOWS
#include "common_sm_windows.h"
#endif /* MCA_COMMON_SM_WINDOWS */
#if MCA_COMMON_SM_POSIX
#include "common_sm_posix.h"
#endif /* MCA_COMMON_SM_POSIX */
/**
* ASSUMING local proc homogeneity with respect to all utilized shared memory
* facilities. that is, if one local proc deems a particular shared memory
* facility acceptable, then ALL local procs should be able to utilize that
* facility. as it stands, this is an important point because one process
* dictates to all other local procs which common sm component will be selected
* based on its own, local run-time test.
*/
static bool initialized = false;
static int sysv_index = -1;
static int posix_index = -1;
static int common_sm_index = -1;
static char **sm_argv = NULL;
static char *sm_params = NULL;
static mca_common_sm_init_fn_t sm_init = NULL;
static mca_common_sm_seg_alloc_fn_t sm_seg_alloc = NULL;
static mca_common_sm_fini_fn_t sm_fini = NULL;
/* should be more than enough to store all common sm component names */
static char sm_default[32];
/* holds common sm help string */
char sm_avail_help_str[OPAL_PATH_MAX];
/**
* lock to protect multiple instances of query_sm_components()
* from being invoked simultaneously (because of rml usage).
*/
static opal_mutex_t mutex;
/* common shared memory component information */
typedef struct
{
/* flag indicating whether or not the component is available */
bool avail;
/* component name */
char *sm_name;
} mca_common_sm_info_t;
/**
* NOTE:
* o array position dictates the default order in which
* the common shared memory components will be queried.
* o first component successfully queried gets selected.
* o sm_name format: {component availability, "component name,"}
*
* if you change the order of sm_avail_table below,
* don't forget to update mca_common_sm_comp_index_map_t.
*
* placing mmap before sysv in the list prevents sysv from ever being selected
* (in the default case). this is because, at least for now, mmap's selection
* query always succeeds. that is, sysv must be explicitly requested.
* NOTE: mmap is the default for now.
*
* {component availability, component name}
*/
static const mca_common_sm_info_t sm_avail_table[] =
{
{true , "mmap," }, /* assume mmap is always available */
{(bool)MCA_COMMON_SM_POSIX, "posix,"},
{(bool)MCA_COMMON_SM_SYSV , "sysv," },
{false , NULL } /* MUST BE LAST ITEM */
};
/* component index enum */
typedef enum
{
MCA_COMMON_SM_COMP_INDEX_MMAP = 0,
MCA_COMMON_SM_COMP_INDEX_POSIX,
MCA_COMMON_SM_COMP_INDEX_SYSV,
MCA_COMMON_SM_COMP_INDEX_NONE /* MUST BE LAST ITEM */
} mca_common_sm_comp_index_map_t;
/**
* list of RML messages that have arrived that have not yet been
* consumed by the thread who is looking to complete its component
* initialization based on the contents of the RML message.
*/
static opal_list_t pending_rml_msgs;
/******************************************************************************/
/* STATIC UTILITY FUNCTIONS */
/******************************************************************************/
/******************************************************************************/
/**
* this routine selects the common sm component that corresponds to
* sm_component_index's value.
*
* @param sm_component_index index corresponding to the common sm component that
* is to be selected. (IN)
*/
static void
select_common_sm_component(int sm_component_index)
{
switch (sm_component_index)
{
#if MCA_COMMON_SM_POSIX
case MCA_COMMON_SM_COMP_INDEX_POSIX:
sm_init = mca_common_sm_posix_init;
sm_seg_alloc = mca_common_sm_posix_seg_alloc;
sm_fini = mca_common_sm_posix_fini;
break;
#endif
case MCA_COMMON_SM_COMP_INDEX_MMAP:
#if !MCA_COMMON_SM_WINDOWS
sm_init = mca_common_sm_mmap_init;
sm_seg_alloc = mca_common_sm_mmap_seg_alloc;
sm_fini = mca_common_sm_mmap_fini;
#else /* MCA_COMMON_SM_WINDOWS */
sm_init = mca_common_sm_windows_init;
sm_seg_alloc = mca_common_sm_windows_seg_alloc;
sm_fini = mca_common_sm_windows_fini;
#endif
break;
#if MCA_COMMON_SM_SYSV
case MCA_COMMON_SM_COMP_INDEX_SYSV:
sm_init = mca_common_sm_sysv_init;
sm_seg_alloc = mca_common_sm_sysv_seg_alloc;
sm_fini = mca_common_sm_sysv_fini;
break;
#endif
case MCA_COMMON_SM_COMP_INDEX_NONE:
sm_init = NULL;
sm_seg_alloc = NULL;
sm_fini = NULL;
break;
default:
sm_init = NULL;
sm_seg_alloc = NULL;
sm_fini = NULL;
opal_output(0, "WARNING: invalid common sm component index.");
break;
}
}
/******************************************************************************/
/**
* this routine performs a series of run-time tests that determines whether or
* not a particular common sm component can be selected safely. once a component
* is successfully selected, its component index is returned.
*
* @return index corresponding to the selected common sm component. see
* mca_common_sm_comp_index_map_t for valid values.
*/
static int
query_sm_components(void)
{
int help_msg_displayed = 0;
int sm_component_index = MCA_COMMON_SM_COMP_INDEX_NONE;
int i;
if (NULL != sm_argv)
{
MCA_COMMON_SM_OUTPUT_VERBOSE("looking for available components");
for (i = 0; NULL != sm_argv[i]; ++i)
{
if (0 == strcasecmp(sm_argv[i], "posix"))
{
#if !MCA_COMMON_SM_POSIX
if (!help_msg_displayed)
{
orte_show_help("help-mpi-common-sm.txt",
"sm support",
1,
sm_argv[i]);
help_msg_displayed = 1;
}
#else /* MCA_COMMON_SM_POSIX */
MCA_COMMON_SM_OUTPUT_VERBOSE("querying posix");
/**
* make sure that we can safely use posix sm on this system
*/
if (OMPI_SUCCESS ==
mca_common_sm_posix_component_query())
{
MCA_COMMON_SM_OUTPUT_VERBOSE("selecting posix");
sm_component_index = MCA_COMMON_SM_COMP_INDEX_POSIX;
break;
}
else /* let the user know that we tried posix and failed */
{
MCA_COMMON_SM_OUTPUT_VERBOSE("cannot select posix");
orte_show_help("help-mpi-common-sm.txt",
"sm rt test fail",
1,
"Posix");
}
#endif
}
else if (0 == strcasecmp(sm_argv[i], "mmap"))
{
MCA_COMMON_SM_OUTPUT_VERBOSE("selecting mmap");
/* there is no run-time test for mmap, so just select it */
sm_component_index = MCA_COMMON_SM_COMP_INDEX_MMAP;
break;
}
else if (0 == strcasecmp(sm_argv[i], "sysv"))
{
#if !MCA_COMMON_SM_SYSV
if (!help_msg_displayed)
{
orte_show_help("help-mpi-common-sm.txt",
"sm support",
1,
sm_argv[i]);
help_msg_displayed = 1;
}
#else /* MCA_COMMON_SM_SYSV */
MCA_COMMON_SM_OUTPUT_VERBOSE("querying sysv");
/* SKG - disable sysv support when cr is enabled.
* could presumably work properly someday.
*/
#if OPAL_ENABLE_FT_CR == 1
if (!opal_cr_is_enabled)
{
#endif /* OPAL_ENABLE_FT_CR */
/* make sure that we can safely use sysv on this system */
if (OMPI_SUCCESS == mca_common_sm_sysv_component_query())
{
MCA_COMMON_SM_OUTPUT_VERBOSE("selecting sysv");
sm_component_index = MCA_COMMON_SM_COMP_INDEX_SYSV;
break;
}
else /* let the user know that we tried sysv and failed */
{
MCA_COMMON_SM_OUTPUT_VERBOSE("cannot select sysv");
orte_show_help("help-mpi-common-sm.txt",
"sm rt test fail",
1,
"System V");
}
#if OPAL_ENABLE_FT_CR == 1
}
else
{
orte_show_help("help-mpi-common-sm.txt",
"sysv with cr",
1);
help_msg_displayed = 1;
}
#endif /* OPAL_ENABLE_FT_CR */
#endif
}
else /* unknown value */
{
if (!help_msg_displayed)
{
orte_show_help("help-mpi-common-sm.txt",
"sm support",
1,
sm_argv[i]);
help_msg_displayed = 1;
}
}
}
}
if (MCA_COMMON_SM_COMP_INDEX_NONE == sm_component_index)
{
MCA_COMMON_SM_OUTPUT_VERBOSE("no component selected");
}
return sm_component_index;
}
/******************************************************************************/
int
mca_common_sm_param_register(mca_base_component_t *c)
{
/* also using sysv_index's value as an initialization flag */
if (-1 == sysv_index)
{
int i;
char *last_char;
memset(sm_default, '\0', sizeof(sm_default));
/* populate sm_default with all available common sm component names */
for (i = 0; NULL != sm_avail_table[i].sm_name; ++i)
{
if (sm_avail_table[i].avail)
{
strncat(sm_default,
sm_avail_table[i].sm_name,
sizeof(sm_default) - 1);
}
}
/* remove the last comma from the char buff */
if (NULL != (last_char = strrchr(sm_default, ',')))
{
*last_char = '\0';
}
/* set up help string */
snprintf(
sm_avail_help_str,
sizeof(sm_avail_help_str) - 1,
"Which shared memory support will be used. Valid values: (%s)%s",
sm_default,
(i > 1) ? " - or a comma delimited combination of them "
"(order dependent). The first component that is successfully "
"selected is used." : "."
);
sysv_index = mca_base_param_reg_int_name(
"mpi",
"common_sm_have_sysv_support",
"Whether shared memory has System V support or not",
false,
true,
MCA_COMMON_SM_SYSV,
NULL
);
posix_index = mca_base_param_reg_int_name(
"mpi",
"common_sm_have_posix_support",
"Whether shared memory has POSIX support or not",
false,
true,
MCA_COMMON_SM_POSIX,
NULL
);
}
/* register mpi_common_sm */
common_sm_index = mca_base_param_reg_string_name("mpi",
"common_sm",
sm_avail_help_str,
false,
false,
/* default value */
sm_default,
&sm_params);
/* also register MCA param synonyms for the component */
mca_base_param_reg_syn(sysv_index, c, "have_sysv_support", false);
mca_base_param_reg_syn(posix_index, c, "have_posix_support", false);
mca_base_param_reg_syn(common_sm_index, c, "store", false);
if (OPAL_SUCCESS != mca_base_param_lookup_string(common_sm_index,
&sm_params))
{
return OMPI_ERROR;
}
/* empty string == try all available */
if (0 == strcmp(sm_params, ""))
{
if (NULL == (sm_argv = opal_argv_split(sm_default, ',')))
{
opal_output(0,
"WARNING: could not parse mpi_common_sm request.");
}
}
/* try what the user specified */
else
{
if (NULL == (sm_argv = opal_argv_split(sm_params, ',')))
{
opal_output(0,
"WARNING: could not parse mpi_common_sm request.");
}
}
return OMPI_SUCCESS;
}
/******************************************************************************/
mca_common_sm_module_t *
mca_common_sm_init(ompi_proc_t **procs,
size_t num_procs,
size_t size,
char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment)
{
size_t num_local_procs = 0;
bool found_lowest = false;
bool lowest;
size_t p;
ompi_proc_t *temp_proc;
/**
* NOTE: the selected component's init routine, unlike mca_common_sm_init,
* must be provided with:
* o a SORTED procs array
* o the number of LOCAL processes within procs array
*
* so always do the following before calling sm_init:
* o reorder procs array to have all the local procs at the beginning.
* o look for the local proc with the lowest name.
* o determine the number of local procs.
* o ensure that procs[0] is the lowest named process.
*/
for (p = 0; p < num_procs; ++p)
{
if (OPAL_PROC_ON_LOCAL_NODE(procs[p]->proc_flags))
{
/* if we don't have a lowest, save the first one */
if (!found_lowest)
{
procs[0] = procs[p];
found_lowest = true;
}
else
{
/* save this proc */
procs[num_local_procs] = procs[p];
/**
* if we have a new lowest, swap it with position 0
* so that procs[0] is always the lowest named proc
*/
if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
&(procs[p]->proc_name),
&(procs[0]->proc_name)) < 0)
{
temp_proc = procs[0];
procs[0] = procs[p];
procs[num_local_procs] = temp_proc;
}
}
/**
* regardless of the comparisons above, we found
* another proc on the local node, so increment
*/
++num_local_procs;
}
}
/* if there is less than 2 local processes, there's nothing to do. */
if (num_local_procs < 2)
{
return NULL;
}
if (!initialized)
{
mca_common_sm_rml_sm_info_t sm_info;
sm_info.id = MCA_COMMON_SM_COMP_INDEX_NONE;
memset(sm_info.posix_fname_buff,
'\0',
OMPI_COMMON_SM_POSIX_FILE_LEN_MAX);
lowest = (0 == orte_util_compare_name_fields(
ORTE_NS_CMP_ALL,
ORTE_PROC_MY_NAME,
&(procs[0]->proc_name)));
/**
* lock here to prevent multiple threads from invoking this function
* simultaneously. the critical section we're protecting is usage of
* the RML in this block.
*/
opal_mutex_lock(&mutex);
OBJ_CONSTRUCT(&(pending_rml_msgs), opal_list_t);
/**
* figure out if i am the lowest proc in the group.
* if i am, select a common sm component and send its index to the rest
* of the local procs so they can select the same common sm component.
*/
if (lowest)
{
/* get the component index */
sm_info.id = query_sm_components();
}
/* no return code check here because the error
* path is the same as the expected path */
mca_common_sm_rml_info_bcast(&sm_info,
procs,
num_local_procs,
OMPI_RML_TAG_COMMON_SM_COMP_INDEX,
lowest,
file_name,
&(pending_rml_msgs));
opal_mutex_unlock(&mutex);
select_common_sm_component(sm_info.id);
initialized = true;
}
if (NULL != sm_init)
{
/* notice that we are passing a SORTED procs array to the selected
* component along with the number of LOCAL processes found within
* procs.
*/
return sm_init(procs,
num_local_procs,
size,
file_name,
size_ctl_structure,
data_seg_alignment);
}
return NULL;
}
/******************************************************************************/
/**
* This routine is the same as mca_common_sm_mmap_init() except that
* it takes an (ompi_group_t *) parameter to specify the peers rather
* than an array of procs. Unlike mca_common_sm_mmap_init(), the
* group must contain *only* local peers, or this function will return
* NULL and not create any shared memory segment.
*/
mca_common_sm_module_t *
mca_common_sm_init_group(ompi_group_t *group,
size_t size,
char *file_name,
size_t size_ctl_structure,
size_t data_seg_alignment)
{
mca_common_sm_module_t *ret = NULL;
ompi_proc_t **procs = NULL;
/* make sure sm_init has been properly initialized. do this because
* sm_init_group only does prep work before passing along the real work to
* sm_init.
*/
if (NULL != sm_init)
{
size_t i;
size_t group_size;
ompi_proc_t *proc;
/* if there is less than 2 procs, there's nothing to do */
if ((group_size = ompi_group_size(group)) < 2)
{
goto out;
}
if (NULL == (procs = (ompi_proc_t **)
malloc(sizeof(ompi_proc_t *) * group_size)))
{
ORTE_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
goto out;
}
/* make sure that all the procs in the group are local */
for (i = 0; i < group_size; ++i)
{
proc = ompi_group_peer_lookup(group, i);
if (!OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags))
{
goto out;
}
procs[i] = proc;
}
/* let sm_init take care of the rest ... */
ret = sm_init(procs,
group_size,
size,
file_name,
size_ctl_structure,
data_seg_alignment);
}
out:
if (NULL != procs)
{
free(procs);
}
return ret;
}
/******************************************************************************/
void *
mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool,
size_t *size,
mca_mpool_base_registration_t **registration)
{
if (NULL != sm_seg_alloc)
{
return sm_seg_alloc(mpool, size, registration);
}
return NULL;
}
/******************************************************************************/
int
mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module)
{
if (NULL != sm_argv)
{
opal_argv_free(sm_argv);
sm_argv = NULL;
}
if (NULL != sm_fini)
{
return sm_fini(mca_common_sm_module);
}
return OMPI_ERR_NOT_FOUND;
}