1
1
This commit decouples OMPI deployment from the version(s) of the lower
layers of the stack by probing for UDP support.

Verbs applications assume a 40-byte header (there is no current
mechanism for querying payload offset).  So to support a 42-byte UDP
header without causing existing applications like ibv_ud_pingpong or
older versions of OMPI to crash, we must inform libusnic_verbs that we
are aware of the nonstandard payload offset.  We do this by overriding
the `transport_type` field of the device to be 42 before calling
`ibv_open_device`.  If the library resets it to something else, then we
know the lower layers are UDP capable.  Otherwise we use the older
custom-L2 format.

This necessitated some minor ugliness in common_verbs, but it's as tidy
as Jeff and I know how to make it right now.

This commit only adds support for UDP headers and connectivity over the
same L2 network; it does not touch routing or interface pairing.

Reviewed-by: Jeff Squyres <jsquyres@cisco.com>

cmr=v1.7.5:ticket=trac:4253

This commit was SVN r30838.

The following Trac tickets were found above:
  Ticket 4253 --> https://svn.open-mpi.org/trac/ompi/ticket/4253
Этот коммит содержится в:
Dave Goodell 2014-02-26 07:44:35 +00:00
родитель e10ad5763f
Коммит 4875f48eaa
10 изменённых файлов: 180 добавлений и 86 удалений

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2006-2011 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
@ -152,7 +152,7 @@ AC_DEFUN([OMPI_CHECK_OPENFABRICS],[
# If we have the openib stuff available, find out what we've got
AS_IF([test "$ompi_check_openib_happy" = "yes"],
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO], [], [],
[AC_CHECK_DECLS([IBV_EVENT_CLIENT_REREGISTER, IBV_ACCESS_SO, IBV_TRANSPORT_USNIC, IBV_TRANSPORT_USNIC_UDP, IBV_NODE_USNIC], [], [],
[#include <infiniband/verbs.h>])
AC_CHECK_FUNCS([ibv_get_device_list ibv_resize_cq])

Просмотреть файл

@ -145,6 +145,9 @@ typedef struct ompi_btl_usnic_component_t {
/** convertor packing threshold */
int pack_lazy_threshold;
/** does the stack below us speak UDP or custom-L2? */
bool use_udp;
/* vvvvvvvvvv non-fastpath fields go below vvvvvvvvvv */
/** list of usnic proc structures */

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

Просмотреть файл

@ -98,7 +98,6 @@ static mca_btl_base_module_t **
usnic_component_init(int* num_btl_modules, bool want_progress_threads,
bool want_mpi_threads);
static int usnic_component_progress(void);
static bool port_is_usnic(ompi_common_verbs_port_item_t *port);
static int init_module_from_port(ompi_btl_usnic_module_t *module,
ompi_common_verbs_port_item_t *port);
@ -526,11 +525,34 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
* filtering below, so don't let the verbs code see our
* if_include/if_exclude strings */
port_list = ompi_common_verbs_find_ports(NULL, NULL,
OMPI_COMMON_VERBS_FLAGS_UD,
OMPI_COMMON_VERBS_FLAGS_TRANSPORT_USNIC_UDP,
USNIC_OUT);
if (NULL == port_list || 0 == opal_list_get_size(port_list)) {
mca_btl_base_error_no_nics("usNIC", "device");
if (NULL == port_list) {
OMPI_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto free_include_list;
} else if (opal_list_get_size(port_list) > 0) {
mca_btl_usnic_component.use_udp = true;
opal_output_verbose(5, USNIC_OUT, "btl:usnic: using UDP transport");
} else {
OBJ_RELEASE(port_list);
/* If we got no USNIC_UDP transport devices, try again with
USNIC */
port_list = ompi_common_verbs_find_ports(NULL, NULL,
OMPI_COMMON_VERBS_FLAGS_TRANSPORT_USNIC,
USNIC_OUT);
if (NULL == port_list) {
OMPI_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
goto free_include_list;
} else if (opal_list_get_size(port_list) > 0) {
mca_btl_usnic_component.use_udp = false;
opal_output_verbose(5, USNIC_OUT,
"btl:usnic: using L2-only transport");
} else {
mca_btl_base_error_no_nics("usNIC", "device");
goto free_include_list;
}
}
/* Setup an array of pointers to point to each module (which we'll
@ -597,16 +619,6 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
"btl:usnic: found: device %s, port %d",
port->device->device_name, port->port_num);
/* This component only works with Cisco VIC/usNIC devices; it
is not a general verbs UD component. Reject any ports
found on devices that are not Cisco VICs. */
if (!port_is_usnic(port)) {
opal_output_verbose(5, USNIC_OUT,
"btl:usnic: this is not a usnic-capable device");
--mca_btl_usnic_component.num_modules;
continue; /* next port */
}
/* Fill in a bunch of the module struct */
module = &(mca_btl_usnic_component.usnic_all_modules[i]);
if (OMPI_SUCCESS != init_module_from_port(module, port)) {
@ -695,24 +707,24 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
/* Find the max payload this port can handle */
module->max_frag_payload =
module->if_mtu - /* start with the MTU */
sizeof(ompi_btl_usnic_protocol_header_t) -
OMPI_BTL_USNIC_PROTO_HDR_SZ -
sizeof(ompi_btl_usnic_btl_header_t); /* subtract size of
the BTL header */
/* same, but use chunk header */
module->max_chunk_payload =
module->if_mtu -
sizeof(ompi_btl_usnic_protocol_header_t) -
OMPI_BTL_USNIC_PROTO_HDR_SZ -
sizeof(ompi_btl_usnic_btl_chunk_header_t);
/* Priority queue MTU and max size */
if (0 == module->tiny_mtu) {
module->tiny_mtu = 768;
module->max_tiny_payload = module->tiny_mtu -
sizeof(ompi_btl_usnic_protocol_header_t) -
OMPI_BTL_USNIC_PROTO_HDR_SZ -
sizeof(ompi_btl_usnic_btl_header_t);
} else {
module->tiny_mtu = module->max_tiny_payload +
sizeof(ompi_btl_usnic_protocol_header_t) +
OMPI_BTL_USNIC_PROTO_HDR_SZ +
sizeof(ompi_btl_usnic_btl_header_t);
}
@ -982,7 +994,7 @@ static int usnic_handle_completion(
going. The sender will eventually re-send it. */
if (IBV_WC_RECV == cwc->opcode) {
if (cwc->byte_len <
(sizeof(ompi_btl_usnic_protocol_header_t)+
(OMPI_BTL_USNIC_PROTO_HDR_SZ +
sizeof(ompi_btl_usnic_btl_header_t))) {
BTL_ERROR(("%s: RX error polling CQ[%d] with status %d for wr_id %" PRIx64 " vend_err %d, byte_len %d",
ibv_get_device_name(module->device),
@ -1115,33 +1127,6 @@ static int usnic_component_progress_2(void)
return count;
}
/* Report whether the given verbs port sits on a Cisco VIC/usNIC device.
 *
 * A port qualifies in either of two ways:
 *   - the build has the IBV_*_USNIC constants and the device advertises
 *     both IBV_TRANSPORT_USNIC and IBV_NODE_USNIC, or
 *   - the device's vendor ID is Cisco's (0x1137) and its vendor part ID
 *     appears in mca_btl_usnic_component.vendor_part_ids.
 *
 * Returns true if the port is accepted as usNIC, false otherwise.
 */
static bool port_is_usnic(ompi_common_verbs_port_item_t *port)
{
    uint32_t *part_id;

#if BTL_USNIC_HAVE_IBV_USNIC
    /* The device identifies itself as usNIC outright */
    if (IBV_TRANSPORT_USNIC == port->device->device->transport_type &&
        IBV_NODE_USNIC == port->device->device->node_type) {
        return true;
    }
#endif

    /* Anything that is not a Cisco device is rejected here */
    if (0x1137 != port->device->device_attr.vendor_id) {
        return false;
    }

    /* Walk the list of known part IDs; the walk stops at the first
       entry that is not greater than zero */
    for (part_id = mca_btl_usnic_component.vendor_part_ids;
         *part_id > 0;
         ++part_id) {
        if (*part_id == port->device->device_attr.vendor_part_id) {
            return true;
        }
    }

    return false;
}
/* returns OMPI_SUCCESS if module initialization was successful, OMPI_ERROR
* otherwise */
static int init_module_from_port(ompi_btl_usnic_module_t *module,

Просмотреть файл

@ -126,8 +126,9 @@ recv_seg_constructor(
/* on receive, BTL header starts after protocol header */
seg->rs_protocol_header = bseg->us_list.ptr;
bseg->us_btl_header =
(ompi_btl_usnic_btl_header_t *) (seg->rs_protocol_header + 1);
bseg->us_btl_header = (ompi_btl_usnic_btl_header_t *)(
((char *)seg->rs_protocol_header) +
OMPI_BTL_USNIC_PROTO_HDR_SZ);
/* initialize verbs work request */
seg->rs_recv_desc.wr_id = (unsigned long) seg;

Просмотреть файл

@ -93,14 +93,26 @@ typedef struct ompi_btl_usnic_reg_t {
struct ibv_mr* mr;
} ompi_btl_usnic_reg_t;
/* UDP headers are always 42 bytes long */
#define OMPI_BTL_USNIC_UDP_HDR_SZ (42)
/*
* Header that is the beginning of every usnic packet buffer.
*/
typedef struct {
/* Verbs UD global resource header (GRH), which appears on the
receiving side only. */
typedef union {
/* Verbs UD global resource header (GRH), which appears on the receiving
* side only. Valid iff mca_btl_usnic_component.use_udp is false. */
struct ibv_grh grh;
} ompi_btl_usnic_protocol_header_t;
/* Valid iff mca_btl_usnic_component.use_udp is true. */
char udp_bytes[OMPI_BTL_USNIC_UDP_HDR_SZ];
} __attribute__((__packed__)) ompi_btl_usnic_protocol_header_t;
#define OMPI_BTL_USNIC_PROTO_HDR_SZ \
(mca_btl_usnic_component.use_udp ? \
OMPI_BTL_USNIC_UDP_HDR_SZ : \
sizeof(struct ibv_grh))
/**
* usnic header type

Просмотреть файл

@ -1488,7 +1488,7 @@ static void module_async_event_callback(int fd, short flags, void *arg)
case IBV_EVENT_QP_FATAL:
case IBV_EVENT_PORT_ERR:
#if BTL_USNIC_HAVE_IBV_EVENT_GID_CHANGE
#if HAVE_DECL_IBV_EVENT_GID_CHANGE
case IBV_EVENT_GID_CHANGE:
#endif
default:

Просмотреть файл

@ -49,33 +49,12 @@ AC_DEFUN([MCA_ompi_btl_usnic_CONFIG],[
]
)
# Do we have the IBV_TRANSPORT_USNIC / IBV_NODE_USNIC defines?
# (note: if we have one, we have both)
btl_usnic_have_ibv_usnic=0
btl_usnic_have_ibv_event_gid_change=0
AS_IF([test "$btl_usnic_happy" = "yes"],
[AC_CHECK_DECL([IBV_NODE_USNIC],
[btl_usnic_have_ibv_usnic=1],
[],
[ #include <infiniband/verbs.h>
])
AC_CHECK_DECL([IBV_EVENT_GID_CHANGE],
[btl_usnic_have_ibv_event_gid_change=1],
[],
[ #include <infiniband/verbs.h>
])
AC_CHECK_DECLS([ibv_event_type_str], [], [],
[AC_CHECK_DECLS([IBV_EVENT_GID_CHANGE, ibv_event_type_str], [], [],
[#include <infiniband/verbs.h>
])
]
)
AC_DEFINE_UNQUOTED([BTL_USNIC_HAVE_IBV_USNIC],
[$btl_usnic_have_ibv_usnic],
[Whether we have IBV_NODE_USNIC / IBV_TRANSPORT_USNIC or not])
AC_DEFINE_UNQUOTED([BTL_USNIC_HAVE_IBV_EVENT_GID_CHANGE],
[$btl_usnic_have_ibv_event_gid_change],
[Whether we have IBV_EVENT_GID_CHANGE or not])
AS_IF([test "$btl_usnic_happy" = "yes"],
[btl_usnic_WRAPPER_EXTRA_LDFLAGS="$btl_usnic_LDFLAGS"

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved.
* All rights reserved.
* Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved.
* Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -79,14 +79,22 @@ enum {
OMPI_COMMON_VERBS_FLAGS_UD = 0x4,
OMPI_COMMON_VERBS_FLAGS_TRANSPORT_IB = 0x8,
OMPI_COMMON_VERBS_FLAGS_TRANSPORT_IWARP = 0x10,
OMPI_COMMON_VERBS_FLAGS_TRANSPORT_USNIC = 0x20,
OMPI_COMMON_VERBS_FLAGS_TRANSPORT_USNIC_UDP = 0x40,
/* Note that these 2 link layer flags will only be useful if
defined(HAVE_IBV_LINK_LAYER_ETHERNET). Otherwise, they will be
ignored. */
OMPI_COMMON_VERBS_FLAGS_LINK_LAYER_IB = 0x20,
OMPI_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET = 0x40,
OMPI_COMMON_VERBS_FLAGS_LINK_LAYER_IB = 0x80,
OMPI_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET = 0x100,
OMPI_COMMON_VERBS_FLAGS_MAX
};
enum {
/* a constant used when probing the usNIC transport type (custom L2 vs.
* UDP/IP) */
OMPI_COMMON_VERBS_USNIC_PROBE_MAGIC = 42
};
/**
* Find a list of ibv_device ports that match a specific criteria.
*

Просмотреть файл

@ -163,10 +163,16 @@ static bool want_this_port(char **include_list, char **exclude_list,
static const char *transport_name_to_str(enum ibv_transport_type transport_type)
{
switch(transport_type) {
case IBV_TRANSPORT_IB: return "IB";
case IBV_TRANSPORT_IWARP: return "IWARP";
case IBV_TRANSPORT_IB: return "IB";
case IBV_TRANSPORT_IWARP: return "IWARP";
#if HAVE_DECL_IBV_TRANSPORT_USNIC
case IBV_TRANSPORT_USNIC: return "usNIC";
#endif
#if HAVE_DECL_IBV_TRANSPORT_USNIC_UDP
case IBV_TRANSPORT_USNIC_UDP: return "usNIC UDP";
#endif
case IBV_TRANSPORT_UNKNOWN:
default: return "unknown";
default: return "unknown";
}
}
@ -182,6 +188,44 @@ static const char *link_layer_to_str(int link_type)
}
#endif
/* Decide whether a verbs device is a Cisco usNIC device (i.e., one that is
 * neither IB, RoCE, nor iWARP).  See the usnic BTL for more information.
 *
 * This helper exists only because the IBV_TRANSPORT_USNIC constant is not
 * yet widely available; once it is, every caller can simply compare the
 * device's transport_type against that constant instead.
 *
 * Returns true if the device looks like usNIC, false otherwise.
 */
static bool device_is_usnic(struct ibv_device *device)
{
    static const char usnic_prefix[] = "usnic_";

    /* A usNIC-capable VIC reports one of these transport types:
     *   _IWARP   -- any libibverbs with an old kernel
     *   _UNKNOWN -- old libibverbs with a new kernel
     *   _USNIC   -- new libibverbs with a new kernel
     *
     * An "old kernel" is one lacking this commit:
     *     http://bit.ly/kernel-180771a3
     */
#if HAVE_DECL_IBV_TRANSPORT_USNIC
    if (IBV_TRANSPORT_USNIC == device->transport_type) {
        return true;
    }
#endif
#if HAVE_DECL_IBV_TRANSPORT_USNIC_UDP
    if (IBV_TRANSPORT_USNIC_UDP == device->transport_type) {
        return true;
    }
#endif

    /* Only the _IWARP and _UNKNOWN types can still be a usNIC device */
    if (IBV_TRANSPORT_IWARP != device->transport_type &&
        IBV_TRANSPORT_UNKNOWN != device->transport_type) {
        return false;
    }

    /* Fall back to the device name prefix.  Checking Cisco's vendor ID
     * (0x1137) would also be possible, but only by opening the device,
     * querying its attributes, and closing it again. */
    return 0 == strncmp(device->name, usnic_prefix, sizeof(usnic_prefix) - 1);
}
/***********************************************************************/
static void check_sanity(char ***if_sanity_list, const char *dev_name, int port)
@ -240,7 +284,8 @@ opal_list_t *ompi_common_verbs_find_ports(const char *if_include,
uint32_t i, j;
opal_list_t *port_list = NULL;
opal_list_item_t *item;
bool want;
bool want, dev_is_usnic;
enum ibv_transport_type saved_transport_type;
/* Allocate a list to fill */
port_list = OBJ_NEW(opal_list_t);
@ -286,9 +331,46 @@ opal_list_t *ompi_common_verbs_find_ports(const char *if_include,
device = devices[i];
check_sanity(&if_sanity_list, ibv_get_device_name(device), -1);
device_context = ibv_open_device(device);
opal_output_verbose(5, stream, "examining verbs interface: %s",
ibv_get_device_name(device));
dev_is_usnic = false;
if ((flags & OMPI_COMMON_VERBS_FLAGS_TRANSPORT_USNIC) ||
(flags & OMPI_COMMON_VERBS_FLAGS_TRANSPORT_USNIC_UDP)) {
dev_is_usnic = device_is_usnic(device);
if (dev_is_usnic) {
opal_output_verbose(5, stream, "verbs interface %s is usnic, claims transport_type %s",
ibv_get_device_name(device),
transport_name_to_str(device->transport_type));
/* There are two flavors of libusnic_verbs (and its supporting
* kernel infrastructure):
* 1. One that speaks an L2-only format that uses the RoCE
* ethertype with a different version value.
* 2. A more modern one that speaks UDP/IP.
*
* Because 42-byte UDP headers are larger than the standard
* 40-byte IB headers, applications (like OMPI) must inform the
* library that they are prepared to see payloads starting at a
* non-standard offset. We do that by overriding the
* transport_type to a magic value before calling
* ibv_open_device. Flavor #1 ignores this field, while flavor
* #2 will set it back to IBV_TRANSPORT_USNIC (or some other
* non-magic value, depending on whether the first is available
* in libibverbs itself).
*
* Later we compare against this MAGIC value to see which
* flavor this device actually is. First save the current
* value so we can restore it after evaluating this device for
* its usNIC properties.
*/
saved_transport_type = device->transport_type;
device->transport_type = OMPI_COMMON_VERBS_USNIC_PROBE_MAGIC;
}
}
device_context = ibv_open_device(device);
if (NULL == device_context) {
opal_show_help("help-ompi-common-verbs.txt",
"ibv_open_device fail", true,
@ -317,6 +399,30 @@ opal_list_t *ompi_common_verbs_find_ports(const char *if_include,
/* Check the device-specific flags to see if we want this
device */
want = false;
if (dev_is_usnic) {
if (flags & OMPI_COMMON_VERBS_FLAGS_TRANSPORT_USNIC &&
OMPI_COMMON_VERBS_USNIC_PROBE_MAGIC ==
(int) device->transport_type) {
want = true;
opal_output_verbose(5, stream,
"verbs interface %s has the right transport (usNIC/L2)",
ibv_get_device_name(device));
}
if (flags & OMPI_COMMON_VERBS_FLAGS_TRANSPORT_USNIC_UDP &&
OMPI_COMMON_VERBS_USNIC_PROBE_MAGIC !=
(int) device->transport_type) {
want = true;
opal_output_verbose(5, stream,
"verbs interface %s has the right transport (usNIC/UDP)",
ibv_get_device_name(device));
}
/* done checking for usNIC, restore the transport value for future
* calls to this routine */
device->transport_type = saved_transport_type;
}
if (flags & OMPI_COMMON_VERBS_FLAGS_TRANSPORT_IB &&
IBV_TRANSPORT_IB == device->transport_type) {
opal_output_verbose(5, stream, "verbs interface %s has right type (IB)",