1
1

Improving support for non homogeneous OpenFabrics network configurations

This commit was SVN r22312.
Этот коммит содержится в:
Vasily Filipov 2009-12-15 14:25:07 +00:00
родитель 4d02aea54c
Коммит 354bfe527f
7 изменённых файлов: 298 добавлений и 41 удалений

Просмотреть файл

@ -39,6 +39,8 @@
#include "ompi/runtime/ompi_cr.h"
#endif
#include "btl_openib_ini.h"
#include "btl_openib.h"
#include "btl_openib_frag.h"
#include "btl_openib_proc.h"
@ -97,6 +99,13 @@ mca_btl_openib_module_t mca_btl_openib_module = {
}
};
/* Printable names of the supported transport types, used when reporting
   transport mismatches to the user. The table is indexed directly by a
   mca_btl_openib_transport_type_t value, so the entries must stay in the
   same order as the enumerators (IB, IWARP, RDMAOE, UNKNOWN) and the table
   must keep MCA_BTL_OPENIB_TRANSPORT_SIZE entries. */
char* const mca_btl_openib_transport_name_strings[MCA_BTL_OPENIB_TRANSPORT_SIZE] = {
"MCA_BTL_OPENIB_TRANSPORT_IB",
"MCA_BTL_OPENIB_TRANSPORT_IWARP",
"MCA_BTL_OPENIB_TRANSPORT_RDMAOE",
"MCA_BTL_OPENIB_TRANSPORT_UNKNOWN"
};
static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl);
void mca_btl_openib_show_init_error(const char *file, int line,
@ -287,6 +296,160 @@ out:
return rc;
}
/**
 * Determine the transport type of the port used by an openib BTL module.
 *
 * Detection depends on what the verbs driver exposes at build time:
 *  - No transport_type field in the device struct
 *    (HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE undefined): the driver is
 *    assumed to be a very old one that supports IB devices only.
 *  - transport_type available but no RDMAoE support: the device-level
 *    transport type alone decides.
 *  - RDMAoE support (OMPI_HAVE_RDMAOE): the device struct reports IB for
 *    both InfiniBand and Ethernet link layers, so the link_layer field of
 *    the port attributes is consulted to tell them apart.
 *
 * @param openib_btl  module whose device / port attributes are inspected
 * @return one of the MCA_BTL_OPENIB_TRANSPORT_* values
 */
mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl)
{
#ifndef HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE
    /* The driver does not report transport types at all. */
    return MCA_BTL_OPENIB_TRANSPORT_IB;
#else
    switch (openib_btl->device->ib_dev->transport_type) {
    case IBV_TRANSPORT_IWARP:
        return MCA_BTL_OPENIB_TRANSPORT_IWARP;

    case IBV_TRANSPORT_IB:
#ifdef OMPI_HAVE_RDMAOE
        /* The device says "IB" for both link layers; only the port
           attributes distinguish real InfiniBand from RDMAoE. */
        if (IBV_LINK_LAYER_ETHERNET == openib_btl->ib_port_attr.link_layer) {
            return MCA_BTL_OPENIB_TRANSPORT_RDMAOE;
        }
        if (IBV_LINK_LAYER_INFINIBAND == openib_btl->ib_port_attr.link_layer) {
            return MCA_BTL_OPENIB_TRANSPORT_IB;
        }
        /* IBV_LINK_LAYER_UNSPECIFIED (not expected for a device that
           reports the IB transport) or any other value. */
        return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
#else
        return MCA_BTL_OPENIB_TRANSPORT_IB;
#endif

    case IBV_TRANSPORT_UNKNOWN:
    default:
        return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
    }
#endif
}
static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
mca_btl_base_endpoint_t* endpoint)
{
int ret = OMPI_SUCCESS;
char* recv_qps = NULL;
ompi_btl_openib_ini_values_t values;
if(mca_btl_openib_get_transport_type(openib_btl) != endpoint->rem_info.rem_transport_type) {
orte_show_help("help-mpi-btl-openib.txt",
"conflicting transport types", true,
orte_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_transport_name_strings[mca_btl_openib_get_transport_type(openib_btl)],
endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]);
return OMPI_ERROR;
}
memset(&values, 0, sizeof(ompi_btl_openib_ini_values_t));
ret = ompi_btl_openib_ini_query(endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id, &values);
if (OMPI_SUCCESS != ret && OMPI_ERR_NOT_FOUND != ret) {
orte_show_help("help-mpi-btl-openib.txt",
"error in device init", true,
orte_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev));
return ret;
}
if(openib_btl->device->mtu < endpoint->rem_info.rem_mtu) {
endpoint->rem_info.rem_mtu = openib_btl->device->mtu;
}
endpoint->use_eager_rdma = openib_btl->device->use_eager_rdma &
endpoint->use_eager_rdma;
/* Receive queues checking */
/* In this check we assume that the command line or INI file parameters are the same
for all processes on all machines. The assumption is correct for 99.9999% of users,
if a user distributes different INI files or parameters for different node/procs,
it is on his own responsibility */
switch(mca_btl_openib_component.receive_queues_source) {
case BTL_OPENIB_RQ_SOURCE_MCA:
case BTL_OPENIB_RQ_SOURCE_MAX:
break;
/* If the queues configuration was set from command line
(with --mca btl_openib_receive_queues parameter) => both sides have a same configuration */
/* In this case the local queues configuration was gotten from INI file =>
not possible that remote side got its queues configuration from command line =>
(by prio) the configuration was set from INI file or (if not configure)
by default queues configuration */
case BTL_OPENIB_RQ_SOURCE_DEVICE_INI:
if(NULL != values.receive_queues) {
recv_qps = values.receive_queues;
} else {
recv_qps = mca_btl_openib_component.default_recv_qps;
}
if(0 != strcmp(mca_btl_openib_component.receive_queues,
recv_qps)) {
orte_show_help("help-mpi-btl-openib.txt",
"unsupported queues configuration", true,
orte_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues,
endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
recv_qps);
return OMPI_ERROR;
}
break;
/* If the local queues configuration was set
by default queues => check all possible cases for remote side and compare */
case BTL_OPENIB_RQ_SOURCE_DEFAULT:
if(NULL != values.receive_queues) {
if(0 != strcmp(mca_btl_openib_component.receive_queues,
values.receive_queues)) {
orte_show_help("help-mpi-btl-openib.txt",
"unsupported queues configuration", true,
orte_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
(openib_btl->device->ib_dev_attr).vendor_id,
(openib_btl->device->ib_dev_attr).vendor_part_id,
mca_btl_openib_component.receive_queues,
endpoint->endpoint_proc->proc_ompi->proc_hostname,
endpoint->rem_info.rem_vendor_id,
endpoint->rem_info.rem_vendor_part_id,
values.receive_queues);
return OMPI_ERROR;
}
}
break;
}
return OMPI_SUCCESS;
}
/*
* add a proc to this btl module
* creates an endpoint that is setup on the
@ -478,6 +641,12 @@ int mca_btl_openib_add_procs(
continue;
}
if(OMPI_SUCCESS != mca_btl_openib_tune_endpoint(openib_btl, endpoint)) {
OBJ_RELEASE(endpoint);
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
return OMPI_ERROR;
}
endpoint->index = opal_pointer_array_add(openib_btl->device->endpoints, (void*)endpoint);
if( 0 > endpoint->index ) {
OBJ_RELEASE(endpoint);

Просмотреть файл

@ -75,6 +75,14 @@ BEGIN_C_DECLS
* Infiniband (IB) BTL component.
*/
/**
 * Transport type of an openib BTL port.
 *
 * The numeric values are spelled out because they are used as array
 * indices and are stored in a fixed-width integer field when exchanged
 * between processes; MCA_BTL_OPENIB_TRANSPORT_SIZE is not a transport but
 * the number of real enumerators.
 */
typedef enum {
    MCA_BTL_OPENIB_TRANSPORT_IB      = 0,
    MCA_BTL_OPENIB_TRANSPORT_IWARP   = 1,
    MCA_BTL_OPENIB_TRANSPORT_RDMAOE  = 2,
    MCA_BTL_OPENIB_TRANSPORT_UNKNOWN = 3,
    MCA_BTL_OPENIB_TRANSPORT_SIZE    = 4
} mca_btl_openib_transport_type_t;
typedef enum {
MCA_BTL_OPENIB_PP_QP,
MCA_BTL_OPENIB_SRQ_QP,
@ -253,6 +261,8 @@ struct mca_btl_openib_component_t {
ompi_free_list_t recv_user_free;
/**< frags for coalesced messages */
ompi_free_list_t send_free_coalesced;
/** Default receive queues */
char* default_recv_qps;
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
@ -271,6 +281,12 @@ typedef struct mca_btl_openib_modex_message_t {
uint16_t apm_lid;
/** The MTU used by this port */
uint8_t mtu;
/** vendor id define device type and tuning */
uint32_t vendor_id;
/** vendor part id define device type and tuning */
uint32_t vendor_part_id;
/** Transport type of remote port */
uint8_t transport_type;
/** Dummy field used to calculate the real length */
uint8_t end;
} mca_btl_openib_modex_message_t;
@ -632,6 +648,18 @@ void mca_btl_openib_show_init_error(const char *file, int line,
int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
/**
* Get a transport name of btl by its transport type.
*/
const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
/**
* Get a transport type of btl.
*/
mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl);
static inline int qp_cq_prio(const int qp)
{
if(0 == qp)

Просмотреть файл

@ -143,6 +143,7 @@ int btl_openib_component_open(void)
OBJ_CONSTRUCT(&mca_btl_openib_component.devices, opal_pointer_array_t);
mca_btl_openib_component.devices_count = 0;
mca_btl_openib_component.cpc_explicitly_defined = false;
mca_btl_openib_component.default_recv_qps = NULL;
/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
@ -196,6 +197,10 @@ static int btl_openib_component_close(void)
free(mca_btl_openib_component.receive_queues);
}
if (NULL != mca_btl_openib_component.default_recv_qps) {
free(mca_btl_openib_component.default_recv_qps);
}
return rc;
}
@ -303,6 +308,16 @@ static int btl_openib_modex_send(void)
/* Pack the modex common message struct. */
size = modex_message_size;
(mca_btl_openib_component.openib_btls[i]->port_info).vendor_id =
(mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_id;
(mca_btl_openib_component.openib_btls[i]->port_info).vendor_part_id =
(mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_part_id;
(mca_btl_openib_component.openib_btls[i]->port_info).transport_type =
mca_btl_openib_get_transport_type(mca_btl_openib_component.openib_btls[i]);
memcpy(offset,
&(mca_btl_openib_component.openib_btls[i]->port_info),
size);
@ -1657,45 +1672,6 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
ibv_destroy_cq(cq);
}
/* If the user specified btl_openib_receive_queues MCA param, it
overrides all device INI params */
if (BTL_OPENIB_RQ_SOURCE_MCA !=
mca_btl_openib_component.receive_queues_source &&
NULL != values.receive_queues) {
/* If a prior device's INI values set a different value for
receive_queues, this is unsupported (see
https://svn.open-mpi.org/trac/ompi/ticket/1285) */
if (BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
mca_btl_openib_component.receive_queues_source) {
if (0 != strcmp(values.receive_queues,
mca_btl_openib_component.receive_queues)) {
orte_show_help("help-mpi-btl-openib.txt",
"conflicting receive_queues", true,
orte_process_info.nodename,
ibv_get_device_name(device->ib_dev),
device->ib_dev_attr.vendor_id,
device->ib_dev_attr.vendor_part_id,
values.receive_queues,
ibv_get_device_name(receive_queues_device->ib_dev),
receive_queues_device->ib_dev_attr.vendor_id,
receive_queues_device->ib_dev_attr.vendor_part_id,
mca_btl_openib_component.receive_queues,
opal_install_dirs.pkgdatadir);
ret = OMPI_ERR_RESOURCE_BUSY;
goto error;
}
} else {
if (NULL != mca_btl_openib_component.receive_queues) {
free(mca_btl_openib_component.receive_queues);
}
receive_queues_device = device;
mca_btl_openib_component.receive_queues =
strdup(values.receive_queues);
mca_btl_openib_component.receive_queues_source =
BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
}
}
/* Should we use RDMA for short / eager messages? First check MCA
param, then check INI file values. */
if (mca_btl_openib_component.use_eager_rdma >= 0) {
@ -1795,6 +1771,45 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
"apm not enough ports", true);
mca_btl_openib_component.apm_ports = 0;
}
/* If the user specified btl_openib_receive_queues MCA param, it
overrides all device INI params */
if (BTL_OPENIB_RQ_SOURCE_MCA !=
mca_btl_openib_component.receive_queues_source &&
NULL != values.receive_queues) {
/* If a prior device's INI values set a different value for
receive_queues, this is unsupported (see
https://svn.open-mpi.org/trac/ompi/ticket/1285) */
if (BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
mca_btl_openib_component.receive_queues_source) {
if (0 != strcmp(values.receive_queues,
mca_btl_openib_component.receive_queues)) {
orte_show_help("help-mpi-btl-openib.txt",
"conflicting receive_queues", true,
orte_process_info.nodename,
ibv_get_device_name(device->ib_dev),
device->ib_dev_attr.vendor_id,
device->ib_dev_attr.vendor_part_id,
values.receive_queues,
ibv_get_device_name(receive_queues_device->ib_dev),
receive_queues_device->ib_dev_attr.vendor_id,
receive_queues_device->ib_dev_attr.vendor_part_id,
mca_btl_openib_component.receive_queues,
opal_install_dirs.pkgdatadir);
ret = OMPI_ERR_RESOURCE_BUSY;
goto error;
}
} else {
if (NULL != mca_btl_openib_component.receive_queues) {
free(mca_btl_openib_component.receive_queues);
}
receive_queues_device = device;
mca_btl_openib_component.receive_queues =
strdup(values.receive_queues);
mca_btl_openib_component.receive_queues_source =
BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
}
}
return OMPI_SUCCESS;
}

Просмотреть файл

@ -310,6 +310,11 @@ void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl,
ep->rem_info.rem_subnet_id,
ep->rem_info.rem_mtu);
ep->rem_info.rem_vendor_id = (remote_proc_info->pm_port_info).vendor_id;
ep->rem_info.rem_vendor_part_id = (remote_proc_info->pm_port_info).vendor_part_id;
ep->rem_info.rem_transport_type = (remote_proc_info->pm_port_info).transport_type;
for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
endpoint_init_qp(ep, qp);
}

Просмотреть файл

@ -94,6 +94,12 @@ typedef struct mca_btl_openib_rem_info_t {
mca_btl_openib_rem_qp_info_t *rem_qps;
/* Remote xrc_srq info, used only with XRC connections */
mca_btl_openib_rem_srq_info_t *rem_srqs;
/* Vendor id of remote HCA */
uint32_t rem_vendor_id;
/* Vendor part id of remote HCA */
uint32_t rem_vendor_part_id;
/* Transport type of remote port */
mca_btl_openib_transport_type_t rem_transport_type;
} mca_btl_openib_rem_info_t;

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
@ -526,6 +526,13 @@ int btl_openib_register_mca_params(void)
mid_qp_size,
(uint32_t)mca_btl_openib_module.super.btl_eager_limit,
(uint32_t)mca_btl_openib_module.super.btl_max_send_size);
mca_btl_openib_component.default_recv_qps = strdup(default_qps);
if(NULL == mca_btl_openib_component.default_recv_qps) {
BTL_ERROR(("Unable to allocate memory for default receive queues string.\n"));
return OMPI_ERROR;
}
CHECK(reg_string("receive_queues", NULL,
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
default_qps, &mca_btl_openib_component.receive_queues,

Просмотреть файл

@ -11,7 +11,7 @@
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
# Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
# $COPYRIGHT$
#
@ -590,3 +590,30 @@ value will be ignored.
Local host: %s
Value: %s
Message: %s
#
[unsupported queues configuration]
The Open MPI receive queue configurations for the OpenFabrics devices on two nodes are incompatible,
meaning that MPI processes on two specific nodes were unable to communicate with each other.
This generally happens when you are using OpenFabrics devices from different vendors on the same network.
You should be able to use the btl_openib_receive_queues MCA parameter to set a uniform receive queue configuration
for all the devices in the MPI job, and therefore be able to run successfully.
Local host: %s
Local adapter: %s (vendor 0x%x, part ID %d)
Local queues: %s
Remote host: %s
Remote adapter: (vendor 0x%x, part ID %d)
Remote queues: %s
#
[conflicting transport types]
Open MPI detected two different OpenFabrics transport types in the same Infiniband network.
Such a mixed network transport configuration is not supported by Open MPI.
Local host: %s
Local adapter: %s (vendor 0x%x, part ID %d)
Local transport type: %s
Remote host: %s
Remote adapter: (vendor 0x%x, part ID %d)
Remote transport type: %s