1
1

Bring over the functionality from the /tmp/jnysal-openib-wireup

branch:

 * Support btl_openib_if_include and btl_openib_if_exclude MCA
   parameters, similar to those supported by other BTLs.  Each takes a
   comma-delimited list of identifiers.  Identifiers can be HCA
   interface names (e.g., ipath0, mthca1, etc.) or an HCA interface
   name and port number (e.g., ipath0:1, mthca1:2, etc.).  It is an
   error to specify both _include and _exclude.  If you specify a
   non-existent (or non-ACTIVE) HCA and/or port, you'll get a warning
   unless you disable the warning by setting the MCA parameter
   btl_openib_warn_nonexistent_if to 0.
 * Start updating to use BEGIN_C_DECLS and END_C_DECLS
 * A few other minor fixes that were picked up along the way.

This commit was SVN r15063.
Этот коммит содержится в:
Jeff Squyres 2007-06-14 01:59:25 +00:00
родитель de0f1eef89
Коммит 1e18265c16
4 изменённых файлов: 274 добавлений и 21 удалений

Просмотреть файл

@ -44,9 +44,7 @@
#include "btl_openib_frag.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
BEGIN_C_DECLS
#define MCA_BTL_IB_LEAVE_PINNED 1
#define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll
@ -129,6 +127,10 @@ struct mca_btl_openib_component_t {
#if OMPI_HAVE_POSIX_THREADS
int32_t fatal_counter; /**< Counts number on fatal events that we got on all hcas */
#endif
char *if_include;
char **if_include_list;
char *if_exclude;
char **if_exclude_list;
/** Colon-delimited list of filenames for HCA parameters */
char *hca_params_file_names;
@ -142,6 +144,13 @@ struct mca_btl_openib_component_t {
/** Whether we want a warning if non default GID prefix is not configured
on multiport setup */
bool warn_default_gid_prefix;
/** Whether we want a warning if the user specifies a non-existent
HCA and/or port via btl_openib_if_[in|ex]clude MCA params */
bool warn_nonexistent_if;
/** Dummy argv-style list; a copy of names from the
if_[in|ex]clude list that we use for error checking (to ensure
that they all exist) */
char **if_list;
#ifdef HAVE_IBV_FORK_INIT
/** Whether we want fork support or not */
int want_fork_support;
@ -505,7 +514,6 @@ static inline int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl,
return OMPI_SUCCESS;
}
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
END_C_DECLS
#endif /* MCA_BTL_IB_H */

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
@ -32,6 +32,7 @@
#include "ompi/mca/btl/btl.h"
#include "opal/sys/timer.h"
#include "opal/sys/atomic.h"
#include "opal/util/argv.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/errmgr/errmgr.h"
@ -87,6 +88,7 @@ static void btl_openib_frag_progress_pending(
static int openib_reg_mr(void *reg_data, void *base, size_t size,
mca_mpool_base_registration_t *reg);
static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg);
static int get_port_list(mca_btl_openib_hca_t *hca, int *allowed_ports);
#if OMPI_HAVE_POSIX_THREADS
void* btl_openib_async_thread(void *one_hca);
#endif
@ -461,10 +463,11 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
{
struct mca_mpool_base_resources_t mpool_resources;
mca_btl_openib_hca_t *hca;
uint8_t i;
int ret = -1;
uint8_t i, k = 0;
int ret = -1, port_cnt;
ompi_btl_openib_ini_values_t values, default_values;
int *allowed_ports;
hca = malloc(sizeof(mca_btl_openib_hca_t));
if(NULL == hca){
BTL_ERROR(("Failed malloc: %s:%d\n", __FILE__, __LINE__));
@ -486,7 +489,13 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
ibv_get_device_name(ib_dev), strerror(errno)));
goto close_hca;
}
/* If mca_btl_if_include/exclude were specified, get usable ports */
allowed_ports = (int*) malloc(hca->ib_dev_attr.phys_port_cnt * sizeof(int));
port_cnt = get_port_list(hca, allowed_ports);
if(0 == port_cnt) {
ret = OMPI_SUCCESS;
goto close_hca;
}
/* Load in vendor/part-specific HCA parameters. Note that even if
we don't find values for this vendor/part, "values" will be set
indicating that it does not have good values */
@ -583,17 +592,16 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
ret = OMPI_SUCCESS;
/* Note ports are 1 based hence j = 1 */
for(i = 1; i <= hca->ib_dev_attr.phys_port_cnt; i++){
/* Note ports are 1 based (i >= 1) */
for(k = 0; k < port_cnt; k++){
struct ibv_port_attr ib_port_attr;
i = allowed_ports[k];
if(ibv_query_port(hca->ib_dev_context, i, &ib_port_attr)){
BTL_ERROR(("error getting port attributes for device %s "
"port number %d errno says %s",
ibv_get_device_name(ib_dev), i, strerror(errno)));
break;
}
if(IBV_PORT_ACTIVE == ib_port_attr.state){
if (0 == mca_btl_openib_component.ib_pkey_val) {
@ -663,6 +671,9 @@ dealloc_pd:
ibv_dealloc_pd(hca->ib_pd);
close_hca:
ibv_close_device(hca->ib_dev_context);
if(NULL != allowed_ports) {
free(allowed_ports);
}
free_hca:
free(hca);
return ret;
@ -705,7 +716,7 @@ btl_openib_component_init(int *num_btl_modules,
/* Read in INI files with HCA-specific parameters */
if (OMPI_SUCCESS != (ret = ompi_btl_openib_ini_init())) {
return NULL;
goto no_btls;
}
#if OMPI_HAVE_POSIX_THREADS
/* Set the fatal counter to zero */
@ -724,14 +735,36 @@ btl_openib_component_init(int *num_btl_modules,
opal_show_help("help-mpi-btl-openib.txt",
"ibv_fork_init fail", true,
orte_system_info.nodename);
mca_btl_openib_component.ib_num_btls = 0;
btl_openib_modex_send();
return NULL;
goto no_btls;
}
}
}
#endif
/* Parse the include and exclude lists, checking for errors */
mca_btl_openib_component.if_include_list =
mca_btl_openib_component.if_exclude_list =
mca_btl_openib_component.if_list = NULL;
if (NULL != mca_btl_openib_component.if_include &&
NULL != mca_btl_openib_component.if_exclude) {
opal_show_help("help-mpi-btl-openib.txt",
"specified include and exclude", true,
mca_btl_openib_component.if_include,
mca_btl_openib_component.if_exclude, NULL);
goto no_btls;
} else if (NULL != mca_btl_openib_component.if_include) {
mca_btl_openib_component.if_include_list =
opal_argv_split(mca_btl_openib_component.if_include, ',');
mca_btl_openib_component.if_list =
opal_argv_copy(mca_btl_openib_component.if_include_list);
} else if (NULL != mca_btl_openib_component.if_exclude) {
mca_btl_openib_component.if_exclude_list =
opal_argv_split(mca_btl_openib_component.if_exclude, ',');
mca_btl_openib_component.if_list =
opal_argv_copy(mca_btl_openib_component.if_exclude_list);
}
#ifdef HAVE_IBV_GET_DEVICE_LIST
ib_devs = ibv_get_device_list(&num_devs);
#else
@ -776,7 +809,6 @@ btl_openib_component_init(int *num_btl_modules,
OBJ_CONSTRUCT(&btl_list, opal_list_t);
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t);
for (i = 0; i < num_devs &&
(-1 == mca_btl_openib_component.ib_max_btls ||
mca_btl_openib_component.ib_num_btls <
@ -790,6 +822,21 @@ btl_openib_component_init(int *num_btl_modules,
opal_show_help("help-mpi-btl-openib.txt",
"error in hca init", true, orte_system_info.nodename);
}
/* If we got back from checking all the HCAs and find that there
are still items in the component.if_list, that means that they
didn't exist. Show an appropriate warning if the warning was
not disabled. */
if (0 != opal_argv_count(mca_btl_openib_component.if_list) &&
mca_btl_openib_component.warn_nonexistent_if) {
char *str = opal_argv_join(mca_btl_openib_component.if_list, ',');
opal_show_help("help-mpi-btl-openib.txt", "nonexistent port",
true, orte_system_info.nodename,
((NULL != mca_btl_openib_component.if_include) ?
"in" : "ex"), str);
free(str);
}
if(0 == mca_btl_openib_component.ib_num_btls) {
opal_show_help("help-mpi-btl-openib.txt",
@ -962,7 +1009,23 @@ btl_openib_component_init(int *num_btl_modules,
#else
free(ib_devs);
#endif
if (NULL != mca_btl_openib_component.if_include_list) {
opal_argv_free(mca_btl_openib_component.if_include_list);
mca_btl_openib_component.if_include_list = NULL;
}
if (NULL != mca_btl_openib_component.if_exclude_list) {
opal_argv_free(mca_btl_openib_component.if_exclude_list);
mca_btl_openib_component.if_exclude_list = NULL;
}
return btls;
no_btls:
/* If we fail early enough in the setup, we just modex around that
there are no openib BTL's in this process and return NULL. */
mca_btl_openib_component.ib_num_btls = 0;
btl_openib_modex_send();
return NULL;
}
@ -1514,3 +1577,114 @@ error:
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
return count;
}
/**
 * Work out which ports of one HCA may be used, based on the
 * btl_openib_if_include / btl_openib_if_exclude lists.
 *
 * An entry in either list is either a bare device name (applies to
 * every port of that device) or "<device>:<port>" (applies to that one
 * port).  Port numbers are 1-based.
 *
 * As a side effect, every entry of mca_btl_openib_component.if_list
 * that names this device, or one of its ports, is removed from that
 * list; whatever is left after all HCAs were examined did not match
 * any device/port.
 *
 * @param hca            HCA to examine; hca->ib_dev and
 *                       hca->ib_dev_attr.phys_port_cnt are read.
 * @param allowed_ports  Caller-provided array with room for at least
 *                       phys_port_cnt ints; on return its first
 *                       <return value> entries hold the usable port
 *                       numbers in ascending order.
 *
 * @return Number of usable ports.  0 is also returned when
 *         allowed_ports is NULL, the device name cannot be obtained,
 *         or the scratch buffer cannot be allocated.
 */
static int
get_port_list(mca_btl_openib_hca_t *hca, int *allowed_ports)
{
    int i, j, k, num_ports = 0;
    int excluded;
    const char *dev_name;
    char *name;
    size_t name_size;

    /* The caller allocates allowed_ports; never write through NULL */
    if (NULL == allowed_ports) {
        return 0;
    }

    dev_name = ibv_get_device_name(hca->ib_dev);
    if (NULL == dev_name) {
        return 0;
    }

    /* Scratch buffer for "<dev_name>:<port>".  3 * sizeof(int) + 1
       characters are enough for any decimal int (including a sign);
       add one for the colon and one for the terminating NUL.  A fixed
       allowance of 4 bytes would only cover two-digit port numbers. */
    name_size = strlen(dev_name) + 3 * sizeof(int) + 3;
    name = (char*) malloc(name_size);
    if (NULL == name) {
        return 0;
    }

    /* Assume that all ports are allowed.  num_ports will be adjusted
       below to reflect whether this is true or not. */
    for (i = 1; i <= hca->ib_dev_attr.phys_port_cnt; ++i) {
        allowed_ports[num_ports++] = i;
    }
    num_ports = 0;

    if (NULL != mca_btl_openib_component.if_include_list) {
        /* If only the HCA name is given (eg. mthca0,mthca1) use all
           ports; allowed_ports already lists every port */
        for (i = 0; NULL != mca_btl_openib_component.if_include_list[i];
             ++i) {
            if (0 == strcmp(dev_name,
                            mca_btl_openib_component.if_include_list[i])) {
                num_ports = hca->ib_dev_attr.phys_port_cnt;
                goto done;
            }
        }

        /* Include only requested ports on the HCA */
        for (i = 1; i <= hca->ib_dev_attr.phys_port_cnt; ++i) {
            snprintf(name, name_size, "%s:%d", dev_name, i);
            for (j = 0;
                 NULL != mca_btl_openib_component.if_include_list[j]; ++j) {
                if (0 == strcmp(name,
                                mca_btl_openib_component.if_include_list[j])) {
                    allowed_ports[num_ports++] = i;
                    break;
                }
            }
        }
    } else if (NULL != mca_btl_openib_component.if_exclude_list) {
        /* If only the HCA name is given (eg. mthca0,mthca1) exclude
           all ports */
        for (i = 0; NULL != mca_btl_openib_component.if_exclude_list[i];
             ++i) {
            if (0 == strcmp(dev_name,
                            mca_btl_openib_component.if_exclude_list[i])) {
                num_ports = 0;
                goto done;
            }
        }

        /* Exclude the specified ports on this HCA */
        for (i = 1; i <= hca->ib_dev_attr.phys_port_cnt; ++i) {
            snprintf(name, name_size, "%s:%d", dev_name, i);
            excluded = 0;
            for (j = 0;
                 NULL != mca_btl_openib_component.if_exclude_list[j]; ++j) {
                if (0 == strcmp(name,
                                mca_btl_openib_component.if_exclude_list[j])) {
                    excluded = 1;
                    break;
                }
            }
            /* If we didn't find it, it's ok to include in the list */
            if (!excluded) {
                allowed_ports[num_ports++] = i;
            }
        }
    } else {
        /* Neither list was given: every port is usable */
        num_ports = hca->ib_dev_attr.phys_port_cnt;
    }

done:
    /* Remove the following from the error-checking if_list:
       - bare device name
       - device name suffixed with port number
       Every matching entry is removed (not just the first) so that a
       name listed twice is not later reported as nonexistent. */
    if (NULL != mca_btl_openib_component.if_list) {
        for (i = 0; NULL != mca_btl_openib_component.if_list[i]; ++i) {
            /* Look for raw device name */
            if (0 == strcmp(mca_btl_openib_component.if_list[i], dev_name)) {
                j = opal_argv_count(mca_btl_openib_component.if_list);
                opal_argv_delete(&j, &(mca_btl_openib_component.if_list),
                                 i, 1);
                /* Entries shifted down by one: re-examine index i */
                --i;
            }
        }
        for (i = 1; i <= hca->ib_dev_attr.phys_port_cnt; ++i) {
            snprintf(name, name_size, "%s:%d", dev_name, i);
            for (j = 0; NULL != mca_btl_openib_component.if_list[j]; ++j) {
                if (0 == strcmp(mca_btl_openib_component.if_list[j], name)) {
                    k = opal_argv_count(mca_btl_openib_component.if_list);
                    opal_argv_delete(&k, &(mca_btl_openib_component.if_list),
                                     j, 1);
                    /* Entries shifted down by one: re-examine index j */
                    --j;
                }
            }
        }
    }

    free(name);
    return num_ports;
}

Просмотреть файл

@ -120,6 +120,31 @@ int btl_openib_register_mca_params(void)
"Warn when there is more than one active ports and at least one of them connected to the network with only default GID prefix configured (0 = do not warn; any other value = warn)",
1, &ival, 0));
mca_btl_openib_component.warn_default_gid_prefix = (0 != ival);
CHECK(reg_int("warn_nonexistent_if",
"Warn if non-existent HCAs and/or ports are specified in the btl_openib_if_[in|ex]clude MCA parameters (0 = do not warn; any other value = warn)",
1, &ival, 0));
mca_btl_openib_component.warn_nonexistent_if = (0 != ival);
#ifdef HAVE_IBV_FORK_INIT
ival2 = -1;
#else
ival2 = 0;
#endif
CHECK(reg_int("want_fork_support",
"Whether fork support is desired or not "
"(negative = try to enable fork support, but continue even if it is not available, 0 = do not enable fork support, positive = try to enable fork support and fail if it is not available)",
ival2, &ival, 0));
#ifdef HAVE_IBV_FORK_INIT
mca_btl_openib_component.want_fork_support = ival;
#else
if (0 != ival) {
opal_show_help("help-mpi-btl-openib.txt",
"ibv_fork requested but not supported", true,
orte_system_info.nodename);
return OMPI_ERROR;
}
#endif
asprintf(&str, "%s/mca-btl-openib-hca-params.ini",
opal_install_dirs.pkgdatadir);
if (NULL == str) {
@ -399,5 +424,15 @@ int btl_openib_register_mca_params(void)
mca_btl_base_param_register(&mca_btl_openib_component.super.btl_version,
&mca_btl_openib_module.super);
CHECK(reg_string("if_include",
"List of HCAs/ports to be used (eg. mthca0,mthca1:2)",
NULL, &mca_btl_openib_component.if_include,
0));
CHECK(reg_string("if_exclude",
"List of HCAs/ports to be excluded ",
NULL, &mca_btl_openib_component.if_exclude,
0));
return ret;
}

Просмотреть файл

@ -17,7 +17,8 @@
#
# $HEADER$
#
# This is the US/English general help file for Open MPI.
# This is the US/English help file for Open MPI's OpenFabrics support
# (the openib BTL).
#
[ini file:file not found]
The Open MPI OpenIB BTL component was unable to find or read an INI
@ -26,6 +27,7 @@ parameter. Please check this file and/or modify the
btl_openib_hca_param_files MCA parameter:
%s
#
[ini file:not in a section]
In parsing OpenIB BTL parameter file, values were found that were not
in a valid INI section. These values will be ignored. Please
@ -36,6 +38,7 @@ re-check this file:
At line %d, near the following text:
%s
#
[ini file:unexpected token]
In parsing OpenIB BTL parameter file, unexpected tokens were found
(this may cause significant portions of the INI file to be ignored).
@ -46,6 +49,7 @@ Please re-check this file:
At line %d, near the following text:
%s
#
[ini file:expected equals]
In parsing OpenIB BTL parameter file, unexpected tokens were found
(this may cause significant portions of the INI file to be ignored).
@ -57,6 +61,7 @@ this file:
At line %d, near the following text:
%s
#
[ini file:expected newline]
In parsing OpenIB BTL parameter file, unexpected tokens were found
(this may cause significant portions of the INI file to be ignored).
@ -67,6 +72,7 @@ A newline was expected but was not found. Please re-check this file:
At line %d, near the following text:
%s
#
[ini file:unknown field]
In parsing OpenIB BTL parameter file, an unrecognized field name was
found. Please re-check this file:
@ -78,6 +84,7 @@ At line %d, the field named:
%s
This field, and any other unrecognized fields, will be skipped.
#
[no hca params found]
WARNING: No HCA parameters were found for the HCA that Open MPI
detected:
@ -92,6 +99,7 @@ btl_openib_hca_param_files MCA parameter to set values for your HCA.
NOTE: You can turn off this warning by setting the MCA parameter
btl_openib_warn_no_hca_params_found to 0.
#
[init-fail-no-mem]
The OpenIB BTL failed to initialize while trying to allocate some
locked memory. This typically can indicate that the memlock limits
@ -109,6 +117,7 @@ problem fixed. This FAQ entry on the Open MPI web site may also be
helpful:
http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
#
[init-fail-create-q]
The OpenIB BTL failed to initialize while trying to create an internal
queue. This typically indicates a failed OpenFabrics installation or
@ -122,6 +131,7 @@ faulty hardware. The failure occured here:
You may need to consult with your system administrator to get this
problem fixed.
#
[btl_openib:retry-exceeded]
The InfiniBand retry count between two MPI processes has been
exceeded. "Retry count" is defined in the InfiniBand spec 1.2
@ -148,12 +158,15 @@ respect to the retry count:
4.096 microseconds * (2^btl_openib_ib_timeout)
See the InfiniBand spec 1.2 (section 12.7.34) for more details.
#
[no active ports found]
WARNING: There is at least one IB HCA found on host '%s', but there are
no active ports detected. This is most certainly not what you wanted.
Check your cables and SM configuration.
#
[error in hca init]
WARNING: There were errors during IB HCA initialization on host '%s'.
#
[default subnet prefix]
WARNING: There are more than one active ports on host '%s', but the
default subnet GID prefix was detected on more than one of these
@ -169,16 +182,39 @@ Please see this FAQ entry for more details:
NOTE: You can turn off this warning by setting the MCA parameter
btl_openib_warn_default_gid_prefix to 0.
#
[wrong buffer alignment]
Wrong buffer alignment %d configured on host '%s'. Should be bigger
than zero and power of two. Use default %d instead.
#
[ibv_fork requested but not supported]
WARNING: fork() support was requested for the openib BTL, but it is
not supported on the host %s. Deactivating the openib BTL.
#
[ibv_fork_init fail]
WARNING: fork() support was requested for the openib BTL, but the
library call ibv_fork_init() failed on the host %s.
Deactivating the openib BTL.
#
[wrong buffer alignment]
Wrong buffer alignment %d configured on host '%s'. Should be bigger
than zero and power of two. Use default %d instead.
#
[specified include and exclude]
ERROR: You have specified both the btl_openib_if_include and
btl_openib_if_exclude MCA parameters. These two parameters are
mutually exclusive; you can only specify one or the other.
For reference, the values that you specified are:
btl_openib_if_include: %s
btl_openib_if_exclude: %s
[nonexistent port]
WARNING: One or more nonexistent HCAs/ports were specified:
Host: %s
MCA parameter: btl_openib_if_%sclude
Nonexistent entities: %s
These entities will be ignored. You can disable this warning by
setting the btl_openib_warn_nonexistent_if MCA parameter to 0.