Improving support for non homogeneous OpenFabrics network configurations
This commit was SVN r22312.
Этот коммит содержится в:
родитель
4d02aea54c
Коммит
354bfe527f
@ -39,6 +39,8 @@
|
|||||||
#include "ompi/runtime/ompi_cr.h"
|
#include "ompi/runtime/ompi_cr.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "btl_openib_ini.h"
|
||||||
|
|
||||||
#include "btl_openib.h"
|
#include "btl_openib.h"
|
||||||
#include "btl_openib_frag.h"
|
#include "btl_openib_frag.h"
|
||||||
#include "btl_openib_proc.h"
|
#include "btl_openib_proc.h"
|
||||||
@ -97,6 +99,13 @@ mca_btl_openib_module_t mca_btl_openib_module = {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Printable names of the openib transport types, used when building
   help messages.  The table has MCA_BTL_OPENIB_TRANSPORT_SIZE slots and is
   indexed by a mca_btl_openib_transport_type_t value, so the entries must
   stay in the same order as the enumerators
   (IB, IWARP, RDMAOE, UNKNOWN). */
char* const mca_btl_openib_transport_name_strings[MCA_BTL_OPENIB_TRANSPORT_SIZE] = {
|
||||||
|
"MCA_BTL_OPENIB_TRANSPORT_IB",
|
||||||
|
"MCA_BTL_OPENIB_TRANSPORT_IWARP",
|
||||||
|
"MCA_BTL_OPENIB_TRANSPORT_RDMAOE",
|
||||||
|
"MCA_BTL_OPENIB_TRANSPORT_UNKNOWN"
|
||||||
|
};
|
||||||
|
|
||||||
static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl);
|
static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl);
|
||||||
|
|
||||||
void mca_btl_openib_show_init_error(const char *file, int line,
|
void mca_btl_openib_show_init_error(const char *file, int line,
|
||||||
@ -287,6 +296,160 @@ out:
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl)
{
    /*
     * Classify the transport of the port behind this BTL module.
     *
     * - Without HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE the device struct
     *   carries no transport information at all; such a driver is assumed
     *   to be a very old one that supports IB devices only.
     * - Otherwise the device's transport_type separates iWARP from IB.
     * - With OMPI_HAVE_RDMAOE, an "IB" device may actually be running over
     *   Ethernet: the device struct reports IB for both link layers, so the
     *   only way to tell them apart is the link_layer field of the port
     *   attributes.
     */
#ifndef HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE
    return MCA_BTL_OPENIB_TRANSPORT_IB;
#else
    if (IBV_TRANSPORT_IWARP == openib_btl->device->ib_dev->transport_type) {
        return MCA_BTL_OPENIB_TRANSPORT_IWARP;
    }

    /* IBV_TRANSPORT_UNKNOWN and any value we do not recognize */
    if (IBV_TRANSPORT_IB != openib_btl->device->ib_dev->transport_type) {
        return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
    }

#ifdef OMPI_HAVE_RDMAOE
    if (IBV_LINK_LAYER_ETHERNET == openib_btl->ib_port_attr.link_layer) {
        return MCA_BTL_OPENIB_TRANSPORT_RDMAOE;
    }

    /* An IB-transport device whose port reports
       IBV_LINK_LAYER_UNSPECIFIED (or anything else that is not
       InfiniBand) is not expected; report it as unknown. */
    if (IBV_LINK_LAYER_INFINIBAND != openib_btl->ib_port_attr.link_layer) {
        return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
    }
#endif

    return MCA_BTL_OPENIB_TRANSPORT_IB;
#endif
}
|
||||||
|
|
||||||
|
static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||||
|
mca_btl_base_endpoint_t* endpoint)
|
||||||
|
{
|
||||||
|
int ret = OMPI_SUCCESS;
|
||||||
|
|
||||||
|
char* recv_qps = NULL;
|
||||||
|
|
||||||
|
ompi_btl_openib_ini_values_t values;
|
||||||
|
|
||||||
|
if(mca_btl_openib_get_transport_type(openib_btl) != endpoint->rem_info.rem_transport_type) {
|
||||||
|
orte_show_help("help-mpi-btl-openib.txt",
|
||||||
|
"conflicting transport types", true,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||||
|
(openib_btl->device->ib_dev_attr).vendor_id,
|
||||||
|
(openib_btl->device->ib_dev_attr).vendor_part_id,
|
||||||
|
mca_btl_openib_transport_name_strings[mca_btl_openib_get_transport_type(openib_btl)],
|
||||||
|
endpoint->endpoint_proc->proc_ompi->proc_hostname,
|
||||||
|
endpoint->rem_info.rem_vendor_id,
|
||||||
|
endpoint->rem_info.rem_vendor_part_id,
|
||||||
|
mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]);
|
||||||
|
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(&values, 0, sizeof(ompi_btl_openib_ini_values_t));
|
||||||
|
ret = ompi_btl_openib_ini_query(endpoint->rem_info.rem_vendor_id,
|
||||||
|
endpoint->rem_info.rem_vendor_part_id, &values);
|
||||||
|
|
||||||
|
if (OMPI_SUCCESS != ret && OMPI_ERR_NOT_FOUND != ret) {
|
||||||
|
orte_show_help("help-mpi-btl-openib.txt",
|
||||||
|
"error in device init", true,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
ibv_get_device_name(openib_btl->device->ib_dev));
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(openib_btl->device->mtu < endpoint->rem_info.rem_mtu) {
|
||||||
|
endpoint->rem_info.rem_mtu = openib_btl->device->mtu;
|
||||||
|
}
|
||||||
|
|
||||||
|
endpoint->use_eager_rdma = openib_btl->device->use_eager_rdma &
|
||||||
|
endpoint->use_eager_rdma;
|
||||||
|
|
||||||
|
/* Receive queues checking */
|
||||||
|
|
||||||
|
/* In this check we assume that the command line or INI file parameters are the same
|
||||||
|
for all processes on all machines. The assumption is correct for 99.9999% of users,
|
||||||
|
if a user distributes different INI files or parameters for different node/procs,
|
||||||
|
it is on his own responsibility */
|
||||||
|
switch(mca_btl_openib_component.receive_queues_source) {
|
||||||
|
case BTL_OPENIB_RQ_SOURCE_MCA:
|
||||||
|
case BTL_OPENIB_RQ_SOURCE_MAX:
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* If the queues configuration was set from command line
|
||||||
|
(with --mca btl_openib_receive_queues parameter) => both sides have a same configuration */
|
||||||
|
|
||||||
|
/* In this case the local queues configuration was gotten from INI file =>
|
||||||
|
not possible that remote side got its queues configuration from command line =>
|
||||||
|
(by prio) the configuration was set from INI file or (if not configure)
|
||||||
|
by default queues configuration */
|
||||||
|
case BTL_OPENIB_RQ_SOURCE_DEVICE_INI:
|
||||||
|
if(NULL != values.receive_queues) {
|
||||||
|
recv_qps = values.receive_queues;
|
||||||
|
} else {
|
||||||
|
recv_qps = mca_btl_openib_component.default_recv_qps;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(0 != strcmp(mca_btl_openib_component.receive_queues,
|
||||||
|
recv_qps)) {
|
||||||
|
orte_show_help("help-mpi-btl-openib.txt",
|
||||||
|
"unsupported queues configuration", true,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||||
|
(openib_btl->device->ib_dev_attr).vendor_id,
|
||||||
|
(openib_btl->device->ib_dev_attr).vendor_part_id,
|
||||||
|
mca_btl_openib_component.receive_queues,
|
||||||
|
endpoint->endpoint_proc->proc_ompi->proc_hostname,
|
||||||
|
endpoint->rem_info.rem_vendor_id,
|
||||||
|
endpoint->rem_info.rem_vendor_part_id,
|
||||||
|
recv_qps);
|
||||||
|
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* If the local queues configuration was set
|
||||||
|
by default queues => check all possible cases for remote side and compare */
|
||||||
|
case BTL_OPENIB_RQ_SOURCE_DEFAULT:
|
||||||
|
if(NULL != values.receive_queues) {
|
||||||
|
if(0 != strcmp(mca_btl_openib_component.receive_queues,
|
||||||
|
values.receive_queues)) {
|
||||||
|
orte_show_help("help-mpi-btl-openib.txt",
|
||||||
|
"unsupported queues configuration", true,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||||
|
(openib_btl->device->ib_dev_attr).vendor_id,
|
||||||
|
(openib_btl->device->ib_dev_attr).vendor_part_id,
|
||||||
|
mca_btl_openib_component.receive_queues,
|
||||||
|
endpoint->endpoint_proc->proc_ompi->proc_hostname,
|
||||||
|
endpoint->rem_info.rem_vendor_id,
|
||||||
|
endpoint->rem_info.rem_vendor_part_id,
|
||||||
|
values.receive_queues);
|
||||||
|
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* add a proc to this btl module
|
* add a proc to this btl module
|
||||||
* creates an endpoint that is setup on the
|
* creates an endpoint that is setup on the
|
||||||
@ -478,6 +641,12 @@ int mca_btl_openib_add_procs(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(OMPI_SUCCESS != mca_btl_openib_tune_endpoint(openib_btl, endpoint)) {
|
||||||
|
OBJ_RELEASE(endpoint);
|
||||||
|
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
endpoint->index = opal_pointer_array_add(openib_btl->device->endpoints, (void*)endpoint);
|
endpoint->index = opal_pointer_array_add(openib_btl->device->endpoints, (void*)endpoint);
|
||||||
if( 0 > endpoint->index ) {
|
if( 0 > endpoint->index ) {
|
||||||
OBJ_RELEASE(endpoint);
|
OBJ_RELEASE(endpoint);
|
||||||
|
@ -75,6 +75,14 @@ BEGIN_C_DECLS
|
|||||||
* Infiniband (IB) BTL component.
|
* Infiniband (IB) BTL component.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/**
 * Transport type of an openib BTL port: InfiniBand, iWARP, RDMA over
 * Ethernet, or unknown.  MCA_BTL_OPENIB_TRANSPORT_SIZE is not a transport
 * type; it is the number of transport types and must stay last.
 */
typedef enum {
|
||||||
|
MCA_BTL_OPENIB_TRANSPORT_IB,
|
||||||
|
MCA_BTL_OPENIB_TRANSPORT_IWARP,
|
||||||
|
MCA_BTL_OPENIB_TRANSPORT_RDMAOE,
|
||||||
|
MCA_BTL_OPENIB_TRANSPORT_UNKNOWN,
|
||||||
|
MCA_BTL_OPENIB_TRANSPORT_SIZE
|
||||||
|
} mca_btl_openib_transport_type_t;
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
MCA_BTL_OPENIB_PP_QP,
|
MCA_BTL_OPENIB_PP_QP,
|
||||||
MCA_BTL_OPENIB_SRQ_QP,
|
MCA_BTL_OPENIB_SRQ_QP,
|
||||||
@ -253,6 +261,8 @@ struct mca_btl_openib_component_t {
|
|||||||
ompi_free_list_t recv_user_free;
|
ompi_free_list_t recv_user_free;
|
||||||
/**< frags for coalesced messages */
|
/**< frags for coalesced messages */
|
||||||
ompi_free_list_t send_free_coalesced;
|
ompi_free_list_t send_free_coalesced;
|
||||||
|
/** Default receive queues */
|
||||||
|
char* default_recv_qps;
|
||||||
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
|
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
|
||||||
|
|
||||||
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
|
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
|
||||||
@ -271,6 +281,12 @@ typedef struct mca_btl_openib_modex_message_t {
|
|||||||
uint16_t apm_lid;
|
uint16_t apm_lid;
|
||||||
/** The MTU used by this port */
|
/** The MTU used by this port */
|
||||||
uint8_t mtu;
|
uint8_t mtu;
|
||||||
|
/** vendor id define device type and tuning */
|
||||||
|
uint32_t vendor_id;
|
||||||
|
/** vendor part id define device type and tuning */
|
||||||
|
uint32_t vendor_part_id;
|
||||||
|
/** Transport type of remote port */
|
||||||
|
uint8_t transport_type;
|
||||||
/** Dummy field used to calculate the real length */
|
/** Dummy field used to calculate the real length */
|
||||||
uint8_t end;
|
uint8_t end;
|
||||||
} mca_btl_openib_modex_message_t;
|
} mca_btl_openib_modex_message_t;
|
||||||
@ -632,6 +648,18 @@ void mca_btl_openib_show_init_error(const char *file, int line,
|
|||||||
|
|
||||||
int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
|
int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a transport name of btl by its transport type.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const char* btl_openib_get_transport_name(mca_btl_openib_transport_type_t transport_type);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a transport type of btl.
|
||||||
|
*/
|
||||||
|
|
||||||
|
mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl);
|
||||||
|
|
||||||
static inline int qp_cq_prio(const int qp)
|
static inline int qp_cq_prio(const int qp)
|
||||||
{
|
{
|
||||||
if(0 == qp)
|
if(0 == qp)
|
||||||
|
@ -143,6 +143,7 @@ int btl_openib_component_open(void)
|
|||||||
OBJ_CONSTRUCT(&mca_btl_openib_component.devices, opal_pointer_array_t);
|
OBJ_CONSTRUCT(&mca_btl_openib_component.devices, opal_pointer_array_t);
|
||||||
mca_btl_openib_component.devices_count = 0;
|
mca_btl_openib_component.devices_count = 0;
|
||||||
mca_btl_openib_component.cpc_explicitly_defined = false;
|
mca_btl_openib_component.cpc_explicitly_defined = false;
|
||||||
|
mca_btl_openib_component.default_recv_qps = NULL;
|
||||||
|
|
||||||
/* initialize objects */
|
/* initialize objects */
|
||||||
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
|
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
|
||||||
@ -196,6 +197,10 @@ static int btl_openib_component_close(void)
|
|||||||
free(mca_btl_openib_component.receive_queues);
|
free(mca_btl_openib_component.receive_queues);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (NULL != mca_btl_openib_component.default_recv_qps) {
|
||||||
|
free(mca_btl_openib_component.default_recv_qps);
|
||||||
|
}
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -303,6 +308,16 @@ static int btl_openib_modex_send(void)
|
|||||||
|
|
||||||
/* Pack the modex common message struct. */
|
/* Pack the modex common message struct. */
|
||||||
size = modex_message_size;
|
size = modex_message_size;
|
||||||
|
|
||||||
|
(mca_btl_openib_component.openib_btls[i]->port_info).vendor_id =
|
||||||
|
(mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_id;
|
||||||
|
|
||||||
|
(mca_btl_openib_component.openib_btls[i]->port_info).vendor_part_id =
|
||||||
|
(mca_btl_openib_component.openib_btls[i]->device->ib_dev_attr).vendor_part_id;
|
||||||
|
|
||||||
|
(mca_btl_openib_component.openib_btls[i]->port_info).transport_type =
|
||||||
|
mca_btl_openib_get_transport_type(mca_btl_openib_component.openib_btls[i]);
|
||||||
|
|
||||||
memcpy(offset,
|
memcpy(offset,
|
||||||
&(mca_btl_openib_component.openib_btls[i]->port_info),
|
&(mca_btl_openib_component.openib_btls[i]->port_info),
|
||||||
size);
|
size);
|
||||||
@ -1657,45 +1672,6 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
|||||||
ibv_destroy_cq(cq);
|
ibv_destroy_cq(cq);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If the user specified btl_openib_receive_queues MCA param, it
|
|
||||||
overrides all device INI params */
|
|
||||||
if (BTL_OPENIB_RQ_SOURCE_MCA !=
|
|
||||||
mca_btl_openib_component.receive_queues_source &&
|
|
||||||
NULL != values.receive_queues) {
|
|
||||||
/* If a prior device's INI values set a different value for
|
|
||||||
receive_queues, this is unsupported (see
|
|
||||||
https://svn.open-mpi.org/trac/ompi/ticket/1285) */
|
|
||||||
if (BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
|
|
||||||
mca_btl_openib_component.receive_queues_source) {
|
|
||||||
if (0 != strcmp(values.receive_queues,
|
|
||||||
mca_btl_openib_component.receive_queues)) {
|
|
||||||
orte_show_help("help-mpi-btl-openib.txt",
|
|
||||||
"conflicting receive_queues", true,
|
|
||||||
orte_process_info.nodename,
|
|
||||||
ibv_get_device_name(device->ib_dev),
|
|
||||||
device->ib_dev_attr.vendor_id,
|
|
||||||
device->ib_dev_attr.vendor_part_id,
|
|
||||||
values.receive_queues,
|
|
||||||
ibv_get_device_name(receive_queues_device->ib_dev),
|
|
||||||
receive_queues_device->ib_dev_attr.vendor_id,
|
|
||||||
receive_queues_device->ib_dev_attr.vendor_part_id,
|
|
||||||
mca_btl_openib_component.receive_queues,
|
|
||||||
opal_install_dirs.pkgdatadir);
|
|
||||||
ret = OMPI_ERR_RESOURCE_BUSY;
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (NULL != mca_btl_openib_component.receive_queues) {
|
|
||||||
free(mca_btl_openib_component.receive_queues);
|
|
||||||
}
|
|
||||||
receive_queues_device = device;
|
|
||||||
mca_btl_openib_component.receive_queues =
|
|
||||||
strdup(values.receive_queues);
|
|
||||||
mca_btl_openib_component.receive_queues_source =
|
|
||||||
BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Should we use RDMA for short / eager messages? First check MCA
|
/* Should we use RDMA for short / eager messages? First check MCA
|
||||||
param, then check INI file values. */
|
param, then check INI file values. */
|
||||||
if (mca_btl_openib_component.use_eager_rdma >= 0) {
|
if (mca_btl_openib_component.use_eager_rdma >= 0) {
|
||||||
@ -1795,6 +1771,45 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
|||||||
"apm not enough ports", true);
|
"apm not enough ports", true);
|
||||||
mca_btl_openib_component.apm_ports = 0;
|
mca_btl_openib_component.apm_ports = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If the user specified btl_openib_receive_queues MCA param, it
|
||||||
|
overrides all device INI params */
|
||||||
|
if (BTL_OPENIB_RQ_SOURCE_MCA !=
|
||||||
|
mca_btl_openib_component.receive_queues_source &&
|
||||||
|
NULL != values.receive_queues) {
|
||||||
|
/* If a prior device's INI values set a different value for
|
||||||
|
receive_queues, this is unsupported (see
|
||||||
|
https://svn.open-mpi.org/trac/ompi/ticket/1285) */
|
||||||
|
if (BTL_OPENIB_RQ_SOURCE_DEVICE_INI ==
|
||||||
|
mca_btl_openib_component.receive_queues_source) {
|
||||||
|
if (0 != strcmp(values.receive_queues,
|
||||||
|
mca_btl_openib_component.receive_queues)) {
|
||||||
|
orte_show_help("help-mpi-btl-openib.txt",
|
||||||
|
"conflicting receive_queues", true,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
ibv_get_device_name(device->ib_dev),
|
||||||
|
device->ib_dev_attr.vendor_id,
|
||||||
|
device->ib_dev_attr.vendor_part_id,
|
||||||
|
values.receive_queues,
|
||||||
|
ibv_get_device_name(receive_queues_device->ib_dev),
|
||||||
|
receive_queues_device->ib_dev_attr.vendor_id,
|
||||||
|
receive_queues_device->ib_dev_attr.vendor_part_id,
|
||||||
|
mca_btl_openib_component.receive_queues,
|
||||||
|
opal_install_dirs.pkgdatadir);
|
||||||
|
ret = OMPI_ERR_RESOURCE_BUSY;
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (NULL != mca_btl_openib_component.receive_queues) {
|
||||||
|
free(mca_btl_openib_component.receive_queues);
|
||||||
|
}
|
||||||
|
receive_queues_device = device;
|
||||||
|
mca_btl_openib_component.receive_queues =
|
||||||
|
strdup(values.receive_queues);
|
||||||
|
mca_btl_openib_component.receive_queues_source =
|
||||||
|
BTL_OPENIB_RQ_SOURCE_DEVICE_INI;
|
||||||
|
}
|
||||||
|
}
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -310,6 +310,11 @@ void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl,
|
|||||||
ep->rem_info.rem_subnet_id,
|
ep->rem_info.rem_subnet_id,
|
||||||
ep->rem_info.rem_mtu);
|
ep->rem_info.rem_mtu);
|
||||||
|
|
||||||
|
ep->rem_info.rem_vendor_id = (remote_proc_info->pm_port_info).vendor_id;
|
||||||
|
ep->rem_info.rem_vendor_part_id = (remote_proc_info->pm_port_info).vendor_part_id;
|
||||||
|
|
||||||
|
ep->rem_info.rem_transport_type = (remote_proc_info->pm_port_info).transport_type;
|
||||||
|
|
||||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||||
endpoint_init_qp(ep, qp);
|
endpoint_init_qp(ep, qp);
|
||||||
}
|
}
|
||||||
|
@ -94,6 +94,12 @@ typedef struct mca_btl_openib_rem_info_t {
|
|||||||
mca_btl_openib_rem_qp_info_t *rem_qps;
|
mca_btl_openib_rem_qp_info_t *rem_qps;
|
||||||
/* Remote xrc_srq info, used only with XRC connections */
|
/* Remote xrc_srq info, used only with XRC connections */
|
||||||
mca_btl_openib_rem_srq_info_t *rem_srqs;
|
mca_btl_openib_rem_srq_info_t *rem_srqs;
|
||||||
|
/* Vendor id of remote HCA */
|
||||||
|
uint32_t rem_vendor_id;
|
||||||
|
/* Vendor part id of remote HCA */
|
||||||
|
uint32_t rem_vendor_part_id;
|
||||||
|
/* Transport type of remote port */
|
||||||
|
mca_btl_openib_transport_type_t rem_transport_type;
|
||||||
} mca_btl_openib_rem_info_t;
|
} mca_btl_openib_rem_info_t;
|
||||||
|
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||||
@ -526,6 +526,13 @@ int btl_openib_register_mca_params(void)
|
|||||||
mid_qp_size,
|
mid_qp_size,
|
||||||
(uint32_t)mca_btl_openib_module.super.btl_eager_limit,
|
(uint32_t)mca_btl_openib_module.super.btl_eager_limit,
|
||||||
(uint32_t)mca_btl_openib_module.super.btl_max_send_size);
|
(uint32_t)mca_btl_openib_module.super.btl_max_send_size);
|
||||||
|
|
||||||
|
mca_btl_openib_component.default_recv_qps = strdup(default_qps);
|
||||||
|
if(NULL == mca_btl_openib_component.default_recv_qps) {
|
||||||
|
BTL_ERROR(("Unable to allocate memory for default receive queues string.\n"));
|
||||||
|
return OMPI_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
CHECK(reg_string("receive_queues", NULL,
|
CHECK(reg_string("receive_queues", NULL,
|
||||||
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
|
"Colon-delimited, comma delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
|
||||||
default_qps, &mca_btl_openib_component.receive_queues,
|
default_qps, &mca_btl_openib_component.receive_queues,
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||||
# Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
|
# Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||||
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
@ -590,3 +590,30 @@ value will be ignored.
|
|||||||
Local host: %s
|
Local host: %s
|
||||||
Value: %s
|
Value: %s
|
||||||
Message: %s
|
Message: %s
|
||||||
|
#
|
||||||
|
[unsupported queues configuration]
|
||||||
|
The Open MPI receive queue configurations for the OpenFabrics devices on two nodes are incompatible,
|
||||||
|
meaning that MPI processes on two specific nodes were unable to communicate with each other.
|
||||||
|
This generally happens when you are using OpenFabrics devices from different vendors on the same network.
|
||||||
|
You should be able to use the btl_openib_receive_queues MCA parameter to set a uniform receive queue configuration
|
||||||
|
for all the devices in the MPI job, and therefore be able to run successfully.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Local adapter: %s (vendor 0x%x, part ID %d)
|
||||||
|
Local queues: %s
|
||||||
|
|
||||||
|
Remote host: %s
|
||||||
|
Remote adapter: (vendor 0x%x, part ID %d)
|
||||||
|
Remote queues: %s
|
||||||
|
#
|
||||||
|
[conflicting transport types]
|
||||||
|
Open MPI detected two different OpenFabrics transport types in the same Infiniband network.
|
||||||
|
Such a mixed network transport configuration is not supported by Open MPI.
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Local adapter: %s (vendor 0x%x, part ID %d)
|
||||||
|
Local transport type: %s
|
||||||
|
|
||||||
|
Remote host: %s
|
||||||
|
Remote adapter: (vendor 0x%x, part ID %d)
|
||||||
|
Remote transport type: %s
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user