From 814a8f5e0fe9ed29201122159bcc065758b6f073 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 11 Jun 2009 17:30:30 +0000 Subject: [PATCH] * Fix #1916: endian problems in iwarp wireup on big endian machines (now works on both big and little endian machines) * Be a little more flexible when looking for active devices in btl_openib_component.c * Add device name and port number to lots of verbose and help messages * Add a bunch of verbose messages to give insight into what is occurring during all the CPC wireups This commit was SVN r21418. --- ompi/mca/btl/openib/btl_openib_component.c | 30 +++++++++++----- ompi/mca/btl/openib/btl_openib_iwarp.c | 10 +++--- .../openib/connect/btl_openib_connect_base.c | 2 +- .../openib/connect/btl_openib_connect_ibcm.c | 36 ++++++++++++------- .../openib/connect/btl_openib_connect_oob.c | 15 +++++--- .../connect/btl_openib_connect_rdmacm.c | 20 +++++++---- .../openib/connect/btl_openib_connect_xoob.c | 11 +++--- .../connect/help-mpi-btl-openib-cpc-base.txt | 4 ++- 8 files changed, 83 insertions(+), 45 deletions(-) diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 28534a5411..611106e3d2 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -2380,17 +2380,21 @@ btl_openib_component_init(int *num_btl_modules, /* Copy the btl module structs into a contiguous array and fully initialize them */ - for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++){ - item = opal_list_remove_first(&btl_list); + i = 0; + while (NULL != (item = opal_list_remove_first(&btl_list))) { ib_selected = (mca_btl_base_selected_module_t*)item; openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module; - /* Do we have at least one CPC that can handle this - port? */ - ret = - ompi_btl_openib_connect_base_select_for_local_port(openib_btl); - if (OMPI_SUCCESS != ret) { - /* We already did a show_help in the lower layer */ + /* Search for a CPC that can handle this port */ + ret = ompi_btl_openib_connect_base_select_for_local_port(openib_btl); + /* If we get NOT_SUPPORTED, then no CPC was found for this + port. But that's not a fatal error -- just keep going; + let's see if we find any usable openib modules or not. */ + if (OMPI_ERR_NOT_SUPPORTED == ret) { + continue; + } else if (OMPI_SUCCESS != ret) { + /* All others *are* fatal. Note that we already did a + show_help in the lower layer */ goto no_btls; } @@ -2400,7 +2404,15 @@ btl_openib_component_init(int *num_btl_modules, if (finish_btl_init(openib_btl) != OMPI_SUCCESS) { goto no_btls; } - } + ++i; + } + /* If we got nothing, then error out */ + if (0 == i) { + goto no_btls; + } + /* Otherwise reset to the number of openib modules that we + actually got */ + mca_btl_openib_component.ib_num_btls = i; btl_openib_modex_send(); diff --git a/ompi/mca/btl/openib/btl_openib_iwarp.c b/ompi/mca/btl/openib/btl_openib_iwarp.c index 0ce0efa616..dc9c8c8b1c 100644 --- a/ompi/mca/btl/openib/btl_openib_iwarp.c +++ b/ompi/mca/btl/openib/btl_openib_iwarp.c @@ -210,8 +210,8 @@ static int ipaddr_specified(struct sockaddr_in *ipaddr, uint32_t netmask) opal_argv_free(temp); continue; } - list_subnet = ipae.s_addr & ~(~0 << atoi(temp[1])); - subnet = ipaddr->sin_addr.s_addr & ~(~0 << netmask); + list_subnet = ntohl(ipae.s_addr) & ~(~0 >> atoi(temp[1])); + subnet = ntohl(ipaddr->sin_addr.s_addr) & ~(~0 >> netmask); opal_argv_free(temp); if (subnet == list_subnet) { @@ -252,8 +252,8 @@ static int ipaddr_specified(struct sockaddr_in *ipaddr, uint32_t netmask) opal_argv_free(temp); continue; } - list_subnet = ipae.s_addr & ~(~0 << atoi(temp[1])); - subnet = ipaddr->sin_addr.s_addr & ~(~0 << netmask); + list_subnet = ntohl(ipae.s_addr) & ~(~0 >> atoi(temp[1])); + subnet = ntohl(ipaddr->sin_addr.s_addr) & ~(~0 >> netmask); opal_argv_free(temp); if (subnet == list_subnet) { @@ -318,7 +318,7 @@ static int add_rdma_addr(struct sockaddr *ipaddr, uint32_t netmask) sinp = (struct sockaddr_in *)ipaddr; myaddr->addr = sinp->sin_addr.s_addr; - myaddr->subnet = myaddr->addr & ~(~0 << netmask); + myaddr->subnet = ntohl(myaddr->addr) & ~(~0 >> netmask); inet_ntop(sinp->sin_family, &sinp->sin_addr, myaddr->addr_str, sizeof(myaddr->addr_str)); memcpy(myaddr->dev_name, cm_id->verbs->device->name, IBV_SYSFS_NAME_MAX); diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_base.c b/ompi/mca/btl/openib/connect/btl_openib_connect_base.c index 9c4d48a144..43385f5d9e 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_base.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_base.c @@ -294,7 +294,7 @@ int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t * "no cpcs for port", true, orte_process_info.nodename, ibv_get_device_name(btl->device->ib_dev), - msg); + btl->port_num, msg); free(cpcs); free(msg); return OMPI_ERR_NOT_SUPPORTED; diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c index 80efb9223d..ea4c79fbc7 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Mellanox Technologies. All rights reserved. * * $COPYRIGHT$ @@ -654,7 +654,9 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl, iWarp), so we can safely assume that we can use this CPC. */ #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) if (IBV_TRANSPORT_IB != btl->device->ib_dev->transport_type) { - BTL_VERBOSE(("ibcm CPC only supported on InfiniBand")); + BTL_VERBOSE(("ibcm CPC only supported on InfiniBand; skipped on %s:%d", + ibv_get_device_name(btl->device->ib_dev), + openib_btl->port_num)); rc = OMPI_ERR_NOT_SUPPORTED; goto error; } @@ -662,7 +664,9 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl, /* IBCM is not supported if we have any XRC QPs */ if (mca_btl_openib_component.num_xrc_qps > 0) { - BTL_VERBOSE(("ibcm CPC not supported with XRC receive queues, please try xoob CPC; skipped")); + BTL_VERBOSE(("ibcm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d", + ibv_get_device_name(btl->device->ib_dev), + openib_btl->port_num)); rc = OMPI_ERR_NOT_SUPPORTED; goto error; } @@ -754,9 +758,10 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl, rc = OMPI_ERR_NOT_SUPPORTED; goto error; } - OPAL_OUTPUT((-1, "opened ibcm device 0x%" PRIx64 " (%s)", + OPAL_OUTPUT((-1, "opened ibcm device 0x%" PRIx64 " (%s:%d)", (uint64_t) cmh->cm_device, - ibv_get_device_name(cmh->ib_context->device))); + ibv_get_device_name(cmh->ib_context->device), + openib_btl->port_num)); if (0 != (rc = ib_cm_create_id(cmh->cm_device, &cmh->listen_cm_id, NULL))) { @@ -808,10 +813,11 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl, different formula). Query for the Nth GID (N = MCA param) on the port. */ if (ibcm_gid_table_index > btl->ib_port_attr.gid_tbl_len) { - BTL_ERROR(("desired GID table index (%d) is larger than the actual table size (%d) on device %s", + BTL_ERROR(("desired GID table index (%d) is larger than the actual table size (%d) on %s:%d", ibcm_gid_table_index, btl->ib_port_attr.gid_tbl_len, - ibv_get_device_name(btl->device->ib_dev))); + ibv_get_device_name(btl->device->ib_dev), + btl->port_num)); rc = OMPI_ERR_UNREACH; goto error; } @@ -842,19 +848,22 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl, /* All done */ *cpc = (ompi_btl_openib_connect_base_module_t *) m; - BTL_VERBOSE(("available for use on %s", - ibv_get_device_name(btl->device->ib_dev))); + BTL_VERBOSE(("available for use on %s:%d", + ibv_get_device_name(btl->device->ib_dev), + btl->port_num)); TIMER_STOP(QUERY); return OMPI_SUCCESS; error: ibcm_module_finalize(btl, (ompi_btl_openib_connect_base_module_t *) m); if (OMPI_ERR_NOT_SUPPORTED == rc) { - BTL_VERBOSE(("unavailable for use on %s; skipped", - ibv_get_device_name(btl->device->ib_dev))); + BTL_VERBOSE(("unavailable for use on %s:%d; skipped", + ibv_get_device_name(btl->device->ib_dev), + btl->port_num)); } else { - BTL_VERBOSE(("unavailable for use on %s; fatal error %d (%s)", - ibv_get_device_name(btl->device->ib_dev), rc, + BTL_VERBOSE(("unavailable for use on %s:%d; fatal error %d (%s)", + ibv_get_device_name(btl->device->ib_dev), + btl->port_num, rc, opal_strerror(rc))); } return rc; @@ -923,6 +932,7 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp, orte_show_help("help-mpi-btl-openib-cpc-base.txt", "inline truncated", orte_process_info.nodename, ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num, req_inline, init_attr.cap.max_inline_data); } else { endpoint->qps[qp].ib_inline_max = req_inline; diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c index d861bbbc67..8e33ee092f 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_oob.c @@ -122,15 +122,18 @@ static int oob_component_query(mca_btl_openib_module_t *btl, #if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) if (IBV_TRANSPORT_IB != btl->device->ib_dev->transport_type) { opal_output_verbose(5, mca_btl_base_output, - "openib BTL: oob CPC only supported on InfiniBand; skipped on device %s", - ibv_get_device_name(btl->device->ib_dev)); + "openib BTL: oob CPC only supported on InfiniBand; skipped on %s:%d", + ibv_get_device_name(btl->device->ib_dev), + btl->port_num); return OMPI_ERR_NOT_SUPPORTED; } #endif if (mca_btl_openib_component.num_xrc_qps > 0) { opal_output_verbose(5, mca_btl_base_output, - "openib BTL: oob CPC not supported with XRC receive queues, please try xoob CPC; skipped"); + "openib BTL: oob CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d", + ibv_get_device_name(btl->device->ib_dev), + btl->port_num); return OMPI_ERR_NOT_SUPPORTED; } /* If this btl supports OOB, then post the RML message. But @@ -171,8 +174,9 @@ static int oob_component_query(mca_btl_openib_module_t *btl, (*cpc)->cbm_uses_cts = false; opal_output_verbose(5, mca_btl_base_output, - "openib BTL: oob CPC available for use on %s", - ibv_get_device_name(btl->device->ib_dev)); + "openib BTL: oob CPC available for use on %s:%d", + ibv_get_device_name(btl->device->ib_dev), + btl->port_num); return OMPI_SUCCESS; } @@ -468,6 +472,7 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp, orte_show_help("help-mpi-btl-openib-cpc-base.txt", "inline truncated", true, orte_process_info.nodename, ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num, req_inline, init_attr.cap.max_inline_data); } else { endpoint->qps[qp].ib_inline_max = req_inline; diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c index 3d5a9c2745..47ae2101e9 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_rdmacm.c @@ -427,6 +427,7 @@ static int rdmacm_setup_qp(rdmacm_contents_t *contents, "inline truncated", true, orte_process_info.nodename, ibv_get_device_name(contents->openib_btl->device->ib_dev), + contents->openib_btl->port_num, req_inline, attr.cap.max_inline_data); } else { endpoint->qps[qpnum].ib_inline_max = req_inline; @@ -1752,7 +1753,9 @@ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_ /* RDMACM is not supported if we have any XRC QPs */ if (mca_btl_openib_component.num_xrc_qps > 0) { - BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped")); + BTL_VERBOSE(("rdmacm CPC not supported with XRC receive queues, please try xoob CPC; skipped on %s:%d", + ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num)); rc = OMPI_ERR_NOT_SUPPORTED; goto out; } @@ -1853,8 +1856,9 @@ static int rdmacm_component_query(mca_btl_openib_module_t *openib_btl, ompi_btl_ opal_list_append(&server_listener_list, &(server->super)); opal_output_verbose(5, mca_btl_base_output, - "openib BTL: rdmacm CPC available for use on %s", - ibv_get_device_name(openib_btl->device->ib_dev)); + "openib BTL: rdmacm CPC available for use on %s:%d", + ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num); return OMPI_SUCCESS; out5: @@ -1869,12 +1873,14 @@ out1: out: if (OMPI_ERR_NOT_SUPPORTED == rc) { opal_output_verbose(5, mca_btl_base_output, - "openib BTL: rdmacm CPC unavailable for use on %s; skipped", - ibv_get_device_name(openib_btl->device->ib_dev)); + "openib BTL: rdmacm CPC unavailable for use on %s:%d; skipped", + ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num); } else { opal_output_verbose(5, mca_btl_base_output, - "openib BTL: rmacm CPC unavailable for use on %s; fatal error %d (%s)", - ibv_get_device_name(openib_btl->device->ib_dev), rc, + "openib BTL: rmacm CPC unavailable for use on %s:%d; fatal error %d (%s)", + ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num, rc, opal_strerror(rc)); } return rc; diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c b/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c index 3080b4c21f..30cbc97af8 100644 --- a/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c +++ b/ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c @@ -414,6 +414,7 @@ static int xoob_send_qp_create (mca_btl_base_endpoint_t* endpoint) orte_show_help("help-mpi-btl-openib-cpc-base.txt", "inline truncated", orte_process_info.nodename, ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num, req_inline, qp_init_attr.cap.max_inline_data); } else { endpoint->qps[0].ib_inline_max = req_inline; @@ -956,8 +957,9 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl, if (mca_btl_openib_component.num_xrc_qps <= 0) { opal_output_verbose(5, mca_btl_base_output, - "openib BTL: xoob CPC only supported with XRC receive queues; skipped on device %s", - ibv_get_device_name(openib_btl->device->ib_dev)); + "openib BTL: xoob CPC only supported with XRC receive queues; skipped on %s:%d", + ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num); return OMPI_ERR_NOT_SUPPORTED; } @@ -998,8 +1000,9 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl, (*cpc)->cbm_uses_cts = false; opal_output_verbose(5, mca_btl_base_output, - "openib BTL: xoob CPC available for use on %s", - ibv_get_device_name(openib_btl->device->ib_dev)); + "openib BTL: xoob CPC available for use on %s:%d", + ibv_get_device_name(openib_btl->device->ib_dev), + openib_btl->port_num); return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/openib/connect/help-mpi-btl-openib-cpc-base.txt b/ompi/mca/btl/openib/connect/help-mpi-btl-openib-cpc-base.txt index 63226a7f76..5db2abddf3 100644 --- a/ompi/mca/btl/openib/connect/help-mpi-btl-openib-cpc-base.txt +++ b/ompi/mca/btl/openib/connect/help-mpi-btl-openib-cpc-base.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -17,6 +17,7 @@ support) will be disabled for this port. Local host: %s Local device: %s + Local port: %d CPCs attempted: %s # [cpc name not found] @@ -36,5 +37,6 @@ a smaller inline data value than was requested. Local host: %s Local device: %s + Local port: %d Requested value: %d Value used by device: %d