1
1

 * Fix #1916: endian problems in iWARP wireup on big endian machines
   (now works on both big and little endian machines)
 * Be a little more flexible when looking for active devices in
   btl_openib_component.c
 * Add device name and port number to lots of verbose and help
   messages
 * Add a bunch of verbose messages to give insight into what is
   occurring during all the CPC wireups

This commit was SVN r21418.
Этот коммит содержится в:
Jeff Squyres 2009-06-11 17:30:30 +00:00
родитель 4881cd0df3
Коммит 814a8f5e0f
8 изменённых файлов: 83 добавлений и 45 удалений

Просмотреть файл

@ -2380,17 +2380,21 @@ btl_openib_component_init(int *num_btl_modules,
/* Copy the btl module structs into a contiguous array and fully
initialize them */
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++){
item = opal_list_remove_first(&btl_list);
i = 0;
while (NULL != (item = opal_list_remove_first(&btl_list))) {
ib_selected = (mca_btl_base_selected_module_t*)item;
openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module;
/* Do we have at least one CPC that can handle this
port? */
ret =
ompi_btl_openib_connect_base_select_for_local_port(openib_btl);
if (OMPI_SUCCESS != ret) {
/* We already did a show_help in the lower layer */
/* Search for a CPC that can handle this port */
ret = ompi_btl_openib_connect_base_select_for_local_port(openib_btl);
/* If we get NOT_SUPPORTED, then no CPC was found for this
port. But that's not a fatal error -- just keep going;
let's see if we find any usable openib modules or not. */
if (OMPI_ERR_NOT_SUPPORTED == ret) {
continue;
} else if (OMPI_SUCCESS != ret) {
/* All others *are* fatal. Note that we already did a
show_help in the lower layer */
goto no_btls;
}
@ -2400,7 +2404,15 @@ btl_openib_component_init(int *num_btl_modules,
if (finish_btl_init(openib_btl) != OMPI_SUCCESS) {
goto no_btls;
}
}
++i;
}
/* If we got nothing, then error out */
if (0 == i) {
goto no_btls;
}
/* Otherwise reset to the number of openib modules that we
actually got */
mca_btl_openib_component.ib_num_btls = i;
btl_openib_modex_send();

Просмотреть файл

@ -210,8 +210,8 @@ static int ipaddr_specified(struct sockaddr_in *ipaddr, uint32_t netmask)
opal_argv_free(temp);
continue;
}
list_subnet = ipae.s_addr & ~(~0 << atoi(temp[1]));
subnet = ipaddr->sin_addr.s_addr & ~(~0 << netmask);
list_subnet = ntohl(ipae.s_addr) & ~(~0 >> atoi(temp[1]));
subnet = ntohl(ipaddr->sin_addr.s_addr) & ~(~0 >> netmask);
opal_argv_free(temp);
if (subnet == list_subnet) {
@ -252,8 +252,8 @@ static int ipaddr_specified(struct sockaddr_in *ipaddr, uint32_t netmask)
opal_argv_free(temp);
continue;
}
list_subnet = ipae.s_addr & ~(~0 << atoi(temp[1]));
subnet = ipaddr->sin_addr.s_addr & ~(~0 << netmask);
list_subnet = ntohl(ipae.s_addr) & ~(~0 >> atoi(temp[1]));
subnet = ntohl(ipaddr->sin_addr.s_addr) & ~(~0 >> netmask);
opal_argv_free(temp);
if (subnet == list_subnet) {
@ -318,7 +318,7 @@ static int add_rdma_addr(struct sockaddr *ipaddr, uint32_t netmask)
sinp = (struct sockaddr_in *)ipaddr;
myaddr->addr = sinp->sin_addr.s_addr;
myaddr->subnet = myaddr->addr & ~(~0 << netmask);
myaddr->subnet = ntohl(myaddr->addr) & ~(~0 >> netmask);
inet_ntop(sinp->sin_family, &sinp->sin_addr,
myaddr->addr_str, sizeof(myaddr->addr_str));
memcpy(myaddr->dev_name, cm_id->verbs->device->name, IBV_SYSFS_NAME_MAX);

Просмотреть файл

@ -294,7 +294,7 @@ int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t *
"no cpcs for port", true,
orte_process_info.nodename,
ibv_get_device_name(btl->device->ib_dev),
msg);
btl->port_num, msg);
free(cpcs);
free(msg);
return OMPI_ERR_NOT_SUPPORTED;

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008 Mellanox Technologies. All rights reserved.
*
* $COPYRIGHT$
@ -654,7 +654,9 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
iWarp), so we can safely assume that we can use this CPC. */
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
if (IBV_TRANSPORT_IB != btl->device->ib_dev->transport_type) {
BTL_VERBOSE(("ibcm CPC only supported on InfiniBand"));
BTL_VERBOSE(("ibcm CPC only supported on InfiniBand; skipped on %s:%d",
ibv_get_device_name(btl->device->ib_dev),
openib_btl->port_num));
rc = OMPI_ERR_NOT_SUPPORTED;
goto error;
}
@ -662,7 +664,9 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
/* IBCM is not supported if we have any XRC QPs */
if (mca_btl_openib_component.num_xrc_qps > 0) {
BTL_VERBOSE(("ibcm CPC not supported with XRC receive queues, please try xoob CPC; skipped"));
BTL_VERBOSE(("ibcm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d",
ibv_get_device_name(btl->device->ib_dev),
openib_btl->port_num));
rc = OMPI_ERR_NOT_SUPPORTED;
goto error;
}
@ -754,9 +758,10 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
rc = OMPI_ERR_NOT_SUPPORTED;
goto error;
}
OPAL_OUTPUT((-1, "opened ibcm device 0x%" PRIx64 " (%s)",
OPAL_OUTPUT((-1, "opened ibcm device 0x%" PRIx64 " (%s:%d)",
(uint64_t) cmh->cm_device,
ibv_get_device_name(cmh->ib_context->device)));
ibv_get_device_name(cmh->ib_context->device),
openib_btl->port_num));
if (0 != (rc = ib_cm_create_id(cmh->cm_device,
&cmh->listen_cm_id, NULL))) {
@ -808,10 +813,11 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
different formula). Query for the Nth GID (N = MCA param) on
the port. */
if (ibcm_gid_table_index > btl->ib_port_attr.gid_tbl_len) {
BTL_ERROR(("desired GID table index (%d) is larger than the actual table size (%d) on device %s",
BTL_ERROR(("desired GID table index (%d) is larger than the actual table size (%d) on %s:%d",
ibcm_gid_table_index,
btl->ib_port_attr.gid_tbl_len,
ibv_get_device_name(btl->device->ib_dev)));
ibv_get_device_name(btl->device->ib_dev),
btl->port_num));
rc = OMPI_ERR_UNREACH;
goto error;
}
@ -842,19 +848,22 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
/* All done */
*cpc = (ompi_btl_openib_connect_base_module_t *) m;
BTL_VERBOSE(("available for use on %s",
ibv_get_device_name(btl->device->ib_dev)));
BTL_VERBOSE(("available for use on %s:%d",
ibv_get_device_name(btl->device->ib_dev),
btl->port_num));
TIMER_STOP(QUERY);
return OMPI_SUCCESS;
error:
ibcm_module_finalize(btl, (ompi_btl_openib_connect_base_module_t *) m);
if (OMPI_ERR_NOT_SUPPORTED == rc) {
BTL_VERBOSE(("unavailable for use on %s; skipped",
ibv_get_device_name(btl->device->ib_dev)));
BTL_VERBOSE(("unavailable for use on %s:%d; skipped",
ibv_get_device_name(btl->device->ib_dev),
btl->port_num));
} else {
BTL_VERBOSE(("unavailable for use on %s; fatal error %d (%s)",
ibv_get_device_name(btl->device->ib_dev), rc,
BTL_VERBOSE(("unavailable for use on %s:%d; fatal error %d (%s)",
ibv_get_device_name(btl->device->ib_dev),
btl->port_num, rc,
opal_strerror(rc)));
}
return rc;
@ -923,6 +932,7 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
"inline truncated", orte_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num,
req_inline, init_attr.cap.max_inline_data);
} else {
endpoint->qps[qp].ib_inline_max = req_inline;

Просмотреть файл

@ -122,15 +122,18 @@ static int oob_component_query(mca_btl_openib_module_t *btl,
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
if (IBV_TRANSPORT_IB != btl->device->ib_dev->transport_type) {
opal_output_verbose(5, mca_btl_base_output,
"openib BTL: oob CPC only supported on InfiniBand; skipped on device %s",
ibv_get_device_name(btl->device->ib_dev));
"openib BTL: oob CPC only supported on InfiniBand; skipped on %s:%d",
ibv_get_device_name(btl->device->ib_dev),
btl->port_num);
return OMPI_ERR_NOT_SUPPORTED;
}
#endif
if (mca_btl_openib_component.num_xrc_qps > 0) {
opal_output_verbose(5, mca_btl_base_output,
"openib BTL: oob CPC not supported with XRC receive queues, please try xoob CPC; skipped");
"openib BTL: oob CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d",
ibv_get_device_name(btl->device->ib_dev),
btl->port_num);
return OMPI_ERR_NOT_SUPPORTED;
}
/* If this btl supports OOB, then post the RML message. But
@ -171,8 +174,9 @@ static int oob_component_query(mca_btl_openib_module_t *btl,
(*cpc)->cbm_uses_cts = false;
opal_output_verbose(5, mca_btl_base_output,
"openib BTL: oob CPC available for use on %s",
ibv_get_device_name(btl->device->ib_dev));
"openib BTL: oob CPC available for use on %s:%d",
ibv_get_device_name(btl->device->ib_dev),
btl->port_num);
return OMPI_SUCCESS;
}
@ -468,6 +472,7 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
"inline truncated", true, orte_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num,
req_inline, init_attr.cap.max_inline_data);
} else {
endpoint->qps[qp].ib_inline_max = req_inline;

Просмотреть файл

@ -427,6 +427,7 @@ static int rdmacm_setup_qp(rdmacm_contents_t *contents,
"inline truncated", true,
orte_process_info.nodename,
ibv_get_device_name(contents->openib_btl->device->ib_dev),
contents->openib_btl->port_num,
req_inline, attr.cap.max_inline_data);
} else {
endpoint->qps[qpnum].ib_inline_max = req_inline;
@ -1752,7 +1753,9 @@ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_
/* RDMACM is not supported if we have any XRC QPs */
if (mca_btl_openib_component.num_xrc_qps > 0) {
BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped"));
BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d",
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num));
rc = OMPI_ERR_NOT_SUPPORTED;
goto out;
}
@ -1853,8 +1856,9 @@ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_
opal_list_append(&server_listener_list, &(server->super));
opal_output_verbose(5, mca_btl_base_output,
"openib BTL: rdmacm CPC available for use on %s",
ibv_get_device_name(openib_btl->device->ib_dev));
"openib BTL: rdmacm CPC available for use on %s:%d",
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num);
return OMPI_SUCCESS;
out5:
@ -1869,12 +1873,14 @@ out1:
out:
if (OMPI_ERR_NOT_SUPPORTED == rc) {
opal_output_verbose(5, mca_btl_base_output,
"openib BTL: rdmacm CPC unavailable for use on %s; skipped",
ibv_get_device_name(openib_btl->device->ib_dev));
"openib BTL: rdmacm CPC unavailable for use on %s:%d; skipped",
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num);
} else {
opal_output_verbose(5, mca_btl_base_output,
"openib BTL: rmacm CPC unavailable for use on %s; fatal error %d (%s)",
ibv_get_device_name(openib_btl->device->ib_dev), rc,
"openib BTL: rmacm CPC unavailable for use on %s:%d; fatal error %d (%s)",
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num, rc,
opal_strerror(rc));
}
return rc;

Просмотреть файл

@ -414,6 +414,7 @@ static int xoob_send_qp_create (mca_btl_base_endpoint_t* endpoint)
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
"inline truncated", orte_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num,
req_inline, qp_init_attr.cap.max_inline_data);
} else {
endpoint->qps[0].ib_inline_max = req_inline;
@ -956,8 +957,9 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
if (mca_btl_openib_component.num_xrc_qps <= 0) {
opal_output_verbose(5, mca_btl_base_output,
"openib BTL: xoob CPC only supported with XRC receive queues; skipped on device %s",
ibv_get_device_name(openib_btl->device->ib_dev));
"openib BTL: xoob CPC only supported with XRC receive queues; skipped on %s:%d",
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num);
return OMPI_ERR_NOT_SUPPORTED;
}
@ -998,8 +1000,9 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
(*cpc)->cbm_uses_cts = false;
opal_output_verbose(5, mca_btl_base_output,
"openib BTL: xoob CPC available for use on %s",
ibv_get_device_name(openib_btl->device->ib_dev));
"openib BTL: xoob CPC available for use on %s:%d",
ibv_get_device_name(openib_btl->device->ib_dev),
openib_btl->port_num);
return OMPI_SUCCESS;
}

Просмотреть файл

@ -1,6 +1,6 @@
# -*- text -*-
#
# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -17,6 +17,7 @@ support) will be disabled for this port.
Local host: %s
Local device: %s
Local port: %d
CPCs attempted: %s
#
[cpc name not found]
@ -36,5 +37,6 @@ a smaller inline data value than was requested.
Local host: %s
Local device: %s
Local port: %d
Requested value: %d
Value used by device: %d