From b3617e736eef97da6ecacee952baf699b6f7068d Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 9 Dec 2014 18:43:15 +0900 Subject: [PATCH] btl/openib: add XRC support with OFED 3.12+ based on an original patch contributed by Bull. --- config/opal_check_openfabrics.m4 | 20 ++- opal/mca/btl/openib/btl_openib.c | 31 ++++- opal/mca/btl/openib/btl_openib.h | 7 + opal/mca/btl/openib/btl_openib_async.c | 16 ++- opal/mca/btl/openib/btl_openib_async.h | 5 +- opal/mca/btl/openib/btl_openib_component.c | 8 +- opal/mca/btl/openib/btl_openib_endpoint.c | 19 +++ opal/mca/btl/openib/btl_openib_endpoint.h | 15 ++ opal/mca/btl/openib/btl_openib_ini.c | 11 ++ opal/mca/btl/openib/btl_openib_xrc.c | 48 ++++++- opal/mca/btl/openib/btl_openib_xrc.h | 5 + .../openib/connect/btl_openib_connect_udcm.c | 130 +++++++++++++++++- 12 files changed, 305 insertions(+), 10 deletions(-) diff --git a/config/opal_check_openfabrics.m4 b/config/opal_check_openfabrics.m4 index 13767c3441..50292fb406 100644 --- a/config/opal_check_openfabrics.m4 +++ b/config/opal_check_openfabrics.m4 @@ -16,6 +16,9 @@ # Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved. # Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. +# Copyright (c) 2014 Bull SAS. All rights reserved. +# Copyright (c) 2014-2015 Research Organization for Information Science +# and Technology (RIST). All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -148,6 +151,7 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS],[ # Set these up so that we can do an AC_DEFINE below # (unconditionally) $1_have_xrc=0 + $1_have_xrcd=0 $1_have_opensm_devel=0 # If we have the openib stuff available, find out what we've got @@ -161,9 +165,14 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS],[ [#include ]) # ibv_create_xrc_rcv_qp was added in OFED 1.3 + # ibv_cmd_open_xrcd (aka XRC Domains) was added in OFED 3.12 if test "$enable_connectx_xrc" = "yes"; then - AC_CHECK_FUNCS([ibv_create_xrc_rcv_qp], [$1_have_xrc=1]) + AC_CHECK_FUNCS([ibv_create_xrc_rcv_qp ibv_cmd_open_xrcd], [$1_have_xrc=1]) fi + if test "$enable_connectx_xrc" = "yes"; then + AC_CHECK_FUNCS([ibv_cmd_open_xrcd], [$1_have_xrcd=1]) + fi + if test "no" != "$enable_openib_dynamic_sl"; then # We need ib_types.h file, which is installed with opensm-devel @@ -228,6 +237,15 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS],[ AC_MSG_RESULT([no]) fi + AC_MSG_CHECKING([if ConnectIB XRC support is enabled]) + AC_DEFINE_UNQUOTED([OPAL_HAVE_XRCD], [$$1_have_xrcd], + [Enable features required for XRC domains support]) + if test "1" = "$$1_have_xrcd"; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + AC_MSG_CHECKING([if dynamic SL is enabled]) AC_DEFINE_UNQUOTED([OPAL_ENABLE_DYNAMIC_SL], [$$1_have_opensm_devel], [Enable features required for dynamic SL support]) diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 3d1c259756..fb949d6a8e 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -19,8 +19,9 @@ * Copyright (c) 2009 IBM Corporation. All rights reserved. * Copyright (c) 2013-2014 Intel, Inc. All rights reserved * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
+ * Copyright (c) 2014 Bull SAS. All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -324,10 +325,26 @@ static int create_srq(mca_btl_openib_module_t *openib_btl) openib_btl->qps[qp].u.srq_qp.rd_posted = 0; #if HAVE_XRC if(BTL_OPENIB_QP_TYPE_XRC(qp)) { +#if OPAL_HAVE_XRCD + struct ibv_srq_init_attr_ex attr_ex; + memset(&attr_ex, 0, sizeof(struct ibv_srq_init_attr_ex)); + attr_ex.attr.max_wr = attr.attr.max_wr; + attr_ex.attr.max_sge = attr.attr.max_sge; + attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_XRCD | + IBV_SRQ_INIT_ATTR_CQ | IBV_SRQ_INIT_ATTR_PD; + attr_ex.srq_type = IBV_SRQT_XRC; + attr_ex.xrcd = openib_btl->device->xrcd; + attr_ex.cq = openib_btl->device->ib_cq[qp_cq_prio(qp)]; + attr_ex.pd = openib_btl->device->ib_pd; + + openib_btl->qps[qp].u.srq_qp.srq = + ibv_create_srq_ex(openib_btl->device->ib_dev_context, &attr_ex); +#else openib_btl->qps[qp].u.srq_qp.srq = ibv_create_xrc_srq(openib_btl->device->ib_pd, openib_btl->device->xrc_domain, openib_btl->device->ib_cq[qp_cq_prio(qp)], &attr); +#endif } else #endif { @@ -1946,14 +1963,20 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, to_com_frag(frag)->endpoint = ep; #if HAVE_XRC if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) +#if OPAL_HAVE_XRCD + frag->sr_desc.qp_type.xrc.remote_srqn=ep->rem_info.rem_srqs[qp].rem_srq_num; +#else frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num; +#endif #endif descriptor->order = qp; /* Setting opcode on a frag constructor isn't enough since prepare_src * may return send_frag instead of put_frag */ frag->sr_desc.opcode = IBV_WR_RDMA_WRITE; - frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]), 1); + frag->sr_desc.send_flags = ib_send_flags(descriptor->des_local->seg_len, &(ep->qps[qp]), 1); + qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); + qp_reset_signal_count(ep, qp); qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); qp_reset_signal_count(ep, qp); @@ -2033,7 
+2056,11 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, #if HAVE_XRC if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) +#if OPAL_HAVE_XRCD + frag->sr_desc.qp_type.xrc.remote_srqn=ep->rem_info.rem_srqs[qp].rem_srq_num; +#else frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num; +#endif #endif descriptor->order = qp; diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index 3a2038e804..a6f8bfc467 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -17,6 +17,9 @@ * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -398,7 +401,11 @@ typedef struct mca_btl_openib_device_t { volatile bool got_port_event; #endif #if HAVE_XRC +#if OPAL_HAVE_XRCD + struct ibv_xrcd *xrcd; +#else struct ibv_xrc_domain *xrc_domain; +#endif int xrc_fd; #endif int32_t non_eager_rdma_endpoints; diff --git a/opal/mca/btl/openib/btl_openib_async.c b/opal/mca/btl/openib/btl_openib_async.c index 0e19ffdf2e..ca4743d3d3 100644 --- a/opal/mca/btl/openib/btl_openib_async.c +++ b/opal/mca/btl/openib/btl_openib_async.c @@ -7,6 +7,9 @@ * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -128,7 +131,11 @@ static mca_btl_openib_endpoint_t * xrc_qp2endpoint(uint32_t qp_num, mca_btl_open int ep_i; for(ep_i = 0; ep_i < opal_pointer_array_get_size(device->endpoints); ep_i++) { ep = opal_pointer_array_get_item(device->endpoints, ep_i); +#if OPAL_HAVE_XRCD + if (qp_num == ep->xrc_recv_qp->qp_num) +#else if (qp_num == ep->xrc_recv_qp_num) +#endif return ep; } return NULL; @@ -352,11 +359,14 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_po event_type = event.event_type; #if HAVE_XRC /* is it XRC event ?*/ +#if OPAL_HAVE_XRCD +#else if (IBV_XRC_QP_EVENT_FLAG & event.event_type) { xrc_event = true; /* Clean the bitnd handel as usual */ event_type ^= IBV_XRC_QP_EVENT_FLAG; } +#endif #endif switch(event_type) { case IBV_EVENT_PATH_MIG: @@ -367,9 +377,12 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_po mca_btl_openib_load_apm(event.element.qp, qp2endpoint(event.element.qp, device)); #if HAVE_XRC +#if OPAL_HAVE_XRCD +#else else mca_btl_openib_load_apm_xrc_rcv(event.element.xrc_qp_num, xrc_qp2endpoint(event.element.xrc_qp_num, device)); +#endif #endif } break; @@ -648,7 +661,7 @@ void mca_btl_openib_load_apm(struct ibv_qp *qp, mca_btl_openib_endpoint_t *ep) qp->qp_num, strerror(errno), errno)); } -#if HAVE_XRC +#if HAVE_XRC && ! 
OPAL_HAVE_XRCD void mca_btl_openib_load_apm_xrc_rcv(uint32_t qp_num, mca_btl_openib_endpoint_t *ep) { struct ibv_qp_init_attr qp_init_attr; @@ -678,6 +691,7 @@ void mca_btl_openib_load_apm_xrc_rcv(uint32_t qp_num, mca_btl_openib_endpoint_t } ibv_modify_xrc_rcv_qp(btl->device->xrc_domain, qp_num, &attr, mask); + /* Maybe the qp already was modified by other process - ignoring error */ } #endif diff --git a/opal/mca/btl/openib/btl_openib_async.h b/opal/mca/btl/openib/btl_openib_async.h index 221c5feeb2..3021e66d27 100644 --- a/opal/mca/btl/openib/btl_openib_async.h +++ b/opal/mca/btl/openib/btl_openib_async.h @@ -1,5 +1,8 @@ /* * Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -16,7 +19,7 @@ int start_async_event_thread(void); void mca_btl_openib_load_apm(struct ibv_qp *qp, mca_btl_openib_endpoint_t *ep); int btl_openib_async_command_done(int exp); -#if HAVE_XRC +#if HAVE_XRC && ! OPAL_HAVE_XRCD void mca_btl_openib_load_apm_xrc_rcv(uint32_t qp_num, mca_btl_openib_endpoint_t *ep); #endif diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 2760f383b3..4cd1ff58c9 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -19,8 +19,9 @@ * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. * Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved * Copyright (c) 2013-2014 Intel, Inc. All rights reserved - * Copyright (c) 2014 Research Organization for Information Science + * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -979,6 +980,11 @@ static void device_destruct(mca_btl_openib_device_t *device) } #if HAVE_XRC + + if (!mca_btl_openib_xrc_check_api()) { + return; + } + if (MCA_BTL_XRC_ENABLED) { if (OPAL_SUCCESS != mca_btl_openib_close_xrc_domain(device)) { BTL_VERBOSE(("XRC Internal error. Failed to close xrc domain")); diff --git a/opal/mca/btl/openib/btl_openib_endpoint.c b/opal/mca/btl/openib/btl_openib_endpoint.c index 4c519fac32..45849d456f 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.c +++ b/opal/mca/btl/openib/btl_openib_endpoint.c @@ -19,6 +19,9 @@ * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved * Copyright (c) 2013-2014 Intel, Inc. All rights reserved * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * * $COPYRIGHT$ * @@ -349,7 +352,11 @@ static void mca_btl_openib_endpoint_construct(mca_btl_base_endpoint_t* endpoint) } endpoint->ib_addr = NULL; +#if OPAL_HAVE_XRCD + endpoint->xrc_recv_qp = NULL; +#else endpoint->xrc_recv_qp_num = 0; +#endif endpoint->endpoint_btl = 0; endpoint->endpoint_proc = 0; endpoint->endpoint_local_cpc = NULL; @@ -460,12 +467,24 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) /* unregister xrc recv qp */ #if HAVE_XRC +#if OPAL_HAVE_XRCD + if (NULL != endpoint->xrc_recv_qp) { + if(ibv_destroy_qp(endpoint->xrc_recv_qp)) { + BTL_ERROR(("Failed to unregister XRC recv QP:%d\n", endpoint->xrc_recv_qp->qp_num)); + } else { + BTL_VERBOSE(("Unregistered XRC Recv QP:%d\n", endpoint->xrc_recv_qp->qp_num)); + } + } +#else if (0 != endpoint->xrc_recv_qp_num) { if(ibv_unreg_xrc_rcv_qp(endpoint->endpoint_btl->device->xrc_domain, endpoint->xrc_recv_qp_num)) { BTL_ERROR(("Failed to unregister XRC recv QP:%d\n", endpoint->xrc_recv_qp_num)); + } else { + 
BTL_VERBOSE(("Unregistered XRC Recv QP:%d\n", endpoint->xrc_recv_qp_num)); } } +#endif #endif OBJ_DESTRUCT(&endpoint->endpoint_lock); diff --git a/opal/mca/btl/openib/btl_openib_endpoint.h b/opal/mca/btl/openib/btl_openib_endpoint.h index 2b2fbf4be3..4db2c16a6a 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.h +++ b/opal/mca/btl/openib/btl_openib_endpoint.h @@ -15,6 +15,9 @@ * Copyright (c) 2006-2007 Voltaire All rights reserved. * Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved. * Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -39,6 +42,8 @@ #define QP_TX_BATCH_COUNT 64 +#define QP_TX_BATCH_COUNT 64 + BEGIN_C_DECLS struct mca_btl_openib_frag_t; @@ -206,7 +211,11 @@ struct mca_btl_base_endpoint_t { opal_list_t pending_lazy_frags; mca_btl_openib_endpoint_qp_t *qps; +#if OPAL_HAVE_XRCD + struct ibv_qp *xrc_recv_qp; +#else uint32_t xrc_recv_qp_num; /* in xrc we will use it as recv qp */ +#endif uint32_t xrc_recv_psn; /** list of pending rget ops */ @@ -323,6 +332,7 @@ static inline void qp_reset_signal_count(mca_btl_openib_endpoint_t *ep, const in } + int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t*, mca_btl_openib_send_frag_t*); int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*, @@ -596,8 +606,13 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep, } #if HAVE_XRC +#if OPAL_HAVE_XRCD + if(BTL_OPENIB_QP_TYPE_XRC(qp)) + sr_desc->qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num; +#else if(BTL_OPENIB_QP_TYPE_XRC(qp)) sr_desc->xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; +#endif #endif assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr); diff --git a/opal/mca/btl/openib/btl_openib_ini.c b/opal/mca/btl/openib/btl_openib_ini.c index 
a6dc644cfc..82d3706000 100644 --- a/opal/mca/btl/openib/btl_openib_ini.c +++ b/opal/mca/btl/openib/btl_openib_ini.c @@ -421,6 +421,12 @@ static int parse_line(parsed_section_values_t *sv) sv->values.ignore_device_set = true; } + else if (0 == strcasecmp(key_buffer, "ignore_device")) { + /* Single value */ + sv->values.ignore_device = (bool) opal_btl_openib_ini_intify(value); + sv->values.ignore_device_set = true; + } + else { /* Have no idea what this parameter is. Not an error -- just ignore it */ @@ -584,6 +590,11 @@ static int save_section(parsed_section_values_t *s) h->values.ignore_device_set = true; } + if (s->values.ignore_device_set) { + h->values.ignore_device = s->values.ignore_device; + h->values.ignore_device_set = true; + } + found = true; break; } diff --git a/opal/mca/btl/openib/btl_openib_xrc.c b/opal/mca/btl/openib/btl_openib_xrc.c index b8f93d70f8..9118a05c01 100644 --- a/opal/mca/btl/openib/btl_openib_xrc.c +++ b/opal/mca/btl/openib/btl_openib_xrc.c @@ -4,6 +4,9 @@ * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -21,6 +24,7 @@ #ifdef HAVE_UNISTD_H #include <unistd.h> #endif +#include <dlfcn.h> #include "opal/mca/btl/base/base.h" #include "btl_openib_xrc.h" @@ -37,12 +41,38 @@ OBJ_CLASS_INSTANCE(ib_address_t, ib_address_constructor, ib_address_destructor); +/* run-time check for which libibverbs XRC API we really have underneath: returns false when the running program exposes the entry point of the XRC API flavor this file was NOT compiled against (or cannot be inspected), true otherwise */ +bool mca_btl_openib_xrc_check_api(void) +{ + void *lib = dlopen(NULL, RTLD_NOW); /* current program */ + if (!lib) { + BTL_ERROR(("XRC error: could not find XRC API version")); + return false; + } + +#if OPAL_HAVE_XRCD + if (NULL != dlsym(lib, "ibv_create_xrc_rcv_qp")) { + BTL_ERROR(("XRC error: bad XRC API (require XRC from OFED 3.12+)")); + return false; + } +#else + if (NULL != dlsym(lib, "ibv_open_xrcd")) { + BTL_ERROR(("XRC error: bad XRC API (require XRC from OFED pre 3.12).")); + return false; + } +#endif + return true; +} + /* This func. opens XRC domain */ int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_device_t *device) { int len; char *xrc_file_name; const char *dev_name; +#if OPAL_HAVE_XRCD + struct ibv_xrcd_init_attr xrcd_attr; +#endif dev_name = ibv_get_device_name(device->ib_dev); len = asprintf(&xrc_file_name, @@ -61,9 +91,17 @@ int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_device_t *device) free(xrc_file_name); return OPAL_ERROR; } - +#if OPAL_HAVE_XRCD + memset(&xrcd_attr, 0, sizeof xrcd_attr); + xrcd_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; + xrcd_attr.fd = device->xrc_fd; + xrcd_attr.oflags = O_CREAT; + device->xrcd = ibv_open_xrcd(device->ib_dev_context, &xrcd_attr); + if (NULL == device->xrcd) { +#else device->xrc_domain = ibv_open_xrc_domain(device->ib_dev_context, device->xrc_fd, O_CREAT); if (NULL == device->xrc_domain) { +#endif BTL_ERROR(("Failed to open XRC domain\n")); close(device->xrc_fd); free(xrc_file_name); @@ -76,11 +114,19 @@ int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_device_t *device) /* This func. 
closes XRC domain */ int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_device_t *device) { +#if OPAL_HAVE_XRCD + if (NULL == device->xrcd) { +#else if (NULL == device->xrc_domain) { +#endif /* No XRC domain, just exit */ return OPAL_SUCCESS; } +#if OPAL_HAVE_XRCD + if (ibv_close_xrcd(device->xrcd)) { +#else if (ibv_close_xrc_domain(device->xrc_domain)) { +#endif BTL_ERROR(("Failed to close XRC domain, errno %d says %s\n", device->xrc_fd, strerror(errno))); return OPAL_ERROR; diff --git a/opal/mca/btl/openib/btl_openib_xrc.h b/opal/mca/btl/openib/btl_openib_xrc.h index e74c826b55..4f537b3e8b 100644 --- a/opal/mca/btl/openib/btl_openib_xrc.h +++ b/opal/mca/btl/openib/btl_openib_xrc.h @@ -2,6 +2,9 @@ * Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,4 +51,6 @@ int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_device_t *device); int mca_btl_openib_ib_address_add_new (uint16_t lid, uint64_t s_id, opal_jobid_t ep_jobid, mca_btl_openib_endpoint_t *ep); +bool mca_btl_openib_xrc_check_api(void); + #endif diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index 8d78b40114..fc801e38b3 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -8,6 +8,9 @@ * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2014 Bull SAS. All rights reserved. 
+ * Copyright (c) 2014-2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * * $COPYRIGHT$ * @@ -329,7 +332,11 @@ static int udcm_xrc_start_connect (opal_btl_openib_connect_base_module_t *cpc, static int udcm_xrc_restart_connect (mca_btl_base_endpoint_t *lcl_ep); static int udcm_xrc_send_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr); static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep); +#if OPAL_HAVE_XRCD +static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num); +#else static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep); +#endif static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr); static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_endpoint_t *rem_ep, uint8_t msg_type); @@ -1963,7 +1970,10 @@ static int udcm_process_messages (struct ibv_cq *event_cq, udcm_module_t *m) if (UDCM_MESSAGE_XCONNECT2 == message->hdr.type) { /* save the qp number for unregister */ +#if ! OPAL_HAVE_XRCD lcl_ep->xrc_recv_qp_num = message->hdr.data.xreq.rem_qp_num; +#endif + } } #endif @@ -2403,7 +2413,11 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep) uint32_t send_wr; struct ibv_qp **qp; uint32_t *psn; +#if OPAL_HAVE_XRCD + struct ibv_qp_init_attr_ex qp_init_attr; +#else struct ibv_qp_init_attr qp_init_attr; +#endif struct ibv_qp_attr attr; int ret; size_t req_inline; @@ -2420,7 +2434,11 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep) send_wr = lcl_ep->ib_addr->qp->sd_wqe + (mca_btl_openib_component.use_eager_rdma ? 
mca_btl_openib_component.max_eager_rdma : 0); +#if OPAL_HAVE_XRCD + memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr_ex)); +#else memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); +#endif memset(&attr, 0, sizeof(struct ibv_qp_attr)); qp_init_attr.send_cq = qp_init_attr.recv_cq = openib_btl->device->ib_cq[prio]; @@ -2433,9 +2451,16 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep) qp_init_attr.cap.max_send_sge = 1; /* this one is ignored by driver */ qp_init_attr.cap.max_recv_sge = 1; /* we do not use SG list */ +#if OPAL_HAVE_XRCD + qp_init_attr.qp_type = IBV_QPT_XRC_SEND; + qp_init_attr.comp_mask = IBV_QP_INIT_ATTR_PD; + qp_init_attr.pd = openib_btl->device->ib_pd; + *qp = ibv_create_qp_ex(openib_btl->device->ib_dev_context, &qp_init_attr); +#else qp_init_attr.qp_type = IBV_QPT_XRC; qp_init_attr.xrc_domain = openib_btl->device->xrc_domain; *qp = ibv_create_qp(openib_btl->device->ib_pd, &qp_init_attr); +#endif if (NULL == *qp) { opal_show_help("help-mpi-btl-openib-cpc-base.txt", "ibv_create_qp failed", true, @@ -2482,11 +2507,33 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep) /* mark: xrc receive qp */ /* Recv qp connect */ +#if OPAL_HAVE_XRCD +static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep, uint32_t qp_num) +#else static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep) +#endif { mca_btl_openib_module_t *openib_btl = lcl_ep->endpoint_btl; - int ret; +#if OPAL_HAVE_XRCD + struct ibv_qp_open_attr attr; + memset(&attr, 0, sizeof(struct ibv_qp_open_attr)); + attr.comp_mask = IBV_QP_OPEN_ATTR_NUM | IBV_QP_OPEN_ATTR_XRCD | IBV_QP_OPEN_ATTR_TYPE; + attr.qp_num = qp_num; + attr.qp_type = IBV_QPT_XRC_RECV; + attr.xrcd = openib_btl->device->xrcd; + BTL_VERBOSE(("Connecting Recv QP\n")); + lcl_ep->xrc_recv_qp = ibv_open_qp(openib_btl->device->ib_dev_context, &attr); + if (NULL == lcl_ep->xrc_recv_qp) { /* failed to regester the qp, so it is already die and we should create 
new one */ + /* Return NOT READY !!! xrc_recv_qp is NULL on this path, so log the requested qp_num instead of dereferencing it */ + BTL_ERROR(("Failed to register qp_num: %d , get error: %s (%d)\n. Replying with RNR", + qp_num, strerror(errno), errno)); + return OPAL_ERROR; + } else { + BTL_VERBOSE(("Connected to XRC Recv qp [%d]", lcl_ep->xrc_recv_qp->qp_num)); + return OPAL_SUCCESS; + } +#else BTL_VERBOSE(("Connecting receive qp: %d", lcl_ep->xrc_recv_qp_num)); ret = ibv_reg_xrc_rcv_qp(openib_btl->device->xrc_domain, lcl_ep->xrc_recv_qp_num); if (ret) { /* failed to regester the qp, so it is already die and we should create new one */ @@ -2496,6 +2543,7 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep) lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); return OPAL_ERROR; } +#endif return OPAL_SUCCESS; } @@ -2504,27 +2552,58 @@ static int udcm_xrc_recv_qp_connect (mca_btl_openib_endpoint_t *lcl_ep) static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_hdr_t *msg_hdr) { mca_btl_openib_module_t* openib_btl = lcl_ep->endpoint_btl; +#if OPAL_HAVE_XRCD + struct ibv_qp_init_attr_ex qp_init_attr; +#else struct ibv_qp_init_attr qp_init_attr; +#endif struct ibv_qp_attr attr; int ret; BTL_VERBOSE(("creating xrc receive qp")); +#if OPAL_HAVE_XRCD + memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr_ex)); + qp_init_attr.qp_type = IBV_QPT_XRC_RECV; + qp_init_attr.comp_mask = IBV_QP_INIT_ATTR_XRCD; + qp_init_attr.xrcd = openib_btl->device->xrcd; + lcl_ep->xrc_recv_qp = ibv_create_qp_ex(openib_btl->device->ib_dev_context, + &qp_init_attr); + if (NULL == lcl_ep->xrc_recv_qp) { + BTL_ERROR(("Error creating XRC recv QP, errno says: %s [%d]", + strerror(errno), errno)); + return OPAL_ERROR; + } +#else memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); /* Only xrc_domain is required, all other are ignored */ qp_init_attr.xrc_domain = openib_btl->device->xrc_domain; - ret = ibv_create_xrc_rcv_qp(&qp_init_attr, &lcl_ep->xrc_recv_qp_num); + ret = ibv_create_xrc_rcv_qp(&qp_init_attr, 
&lcl_ep->xrc_recv_qp_num); if (ret) { BTL_ERROR(("Error creating XRC recv QP[%x], errno says: %s [%d]", - lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); + lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); return OPAL_ERROR; } +#endif memset(&attr, 0, sizeof(struct ibv_qp_attr)); attr.qp_state = IBV_QPS_INIT; attr.pkey_index = openib_btl->pkey_index; attr.port_num = openib_btl->port_num; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; +#if OPAL_HAVE_XRCD + ret = ibv_modify_qp(lcl_ep->xrc_recv_qp, + &attr, + IBV_QP_STATE| + IBV_QP_PKEY_INDEX| + IBV_QP_PORT| + IBV_QP_ACCESS_FLAGS); + if (ret) { + BTL_ERROR(("Error modifying XRC recv QP to IBV_QPS_INIT, errno says: %s [%d]", + strerror(ret), ret)); + return OPAL_ERROR; + } +#else ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain, lcl_ep->xrc_recv_qp_num, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | @@ -2535,6 +2614,7 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_ while(1); return OPAL_ERROR; } +#endif memset(&attr, 0, sizeof(struct ibv_qp_attr)); attr.qp_state = IBV_QPS_RTR; @@ -2555,7 +2635,11 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_ /* if user enabled dynamic SL, get it from PathRecord */ if (0 != mca_btl_openib_component.ib_path_record_service_level) { int rc = btl_openib_connect_get_pathrecord_sl( +#if OPAL_HAVE_XRCD + openib_btl->device->xrcd->context, +#else openib_btl->device->xrc_domain->context, +#endif attr.ah_attr.port_num, openib_btl->lid, attr.ah_attr.dlid); @@ -2566,6 +2650,22 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_ } #endif +#if OPAL_HAVE_XRCD + ret = ibv_modify_qp(lcl_ep->xrc_recv_qp, + &attr, + IBV_QP_STATE| + IBV_QP_AV| + IBV_QP_PATH_MTU| + IBV_QP_DEST_QPN| + IBV_QP_RQ_PSN| + IBV_QP_MAX_DEST_RD_ATOMIC| + IBV_QP_MIN_RNR_TIMER); + if (ret) { + BTL_ERROR(("Error modifying XRC recv QP to IBV_QPS_RTR, errno says: %s [%d]", + strerror(ret), ret)); + 
return OPAL_ERROR; + } +#else ret = ibv_modify_xrc_rcv_qp(openib_btl->device->xrc_domain, lcl_ep->xrc_recv_qp_num, &attr, @@ -2581,9 +2681,14 @@ static int udcm_xrc_recv_qp_create (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg_ lcl_ep->xrc_recv_qp_num, strerror(ret), ret)); return OPAL_ERROR; } +#endif #if OPAL_HAVE_THREADS if (APM_ENABLED) { +#if OPAL_HAVE_XRCD + mca_btl_openib_load_apm(lcl_ep->xrc_recv_qp, lcl_ep); +#else mca_btl_openib_load_apm_xrc_rcv(lcl_ep->xrc_recv_qp_num, lcl_ep); +#endif } #endif @@ -2648,14 +2753,29 @@ static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_bas msg->data->hdr.data.xres.rem_ep_index = htonl(lcl_ep->index); if (UDCM_MESSAGE_XRESPONSE == msg_type) { +#if OPAL_HAVE_XRCD + BTL_VERBOSE(("Sending qp: %d, psn: %d", lcl_ep->xrc_recv_qp->qp_num, lcl_ep->xrc_recv_psn)); + msg->data->hdr.data.xres.rem_qp_num = htonl(lcl_ep->xrc_recv_qp->qp_num); + msg->data->hdr.data.xres.rem_psn = htonl(lcl_ep->xrc_recv_psn); +#else BTL_VERBOSE(("Sending qp: %d, psn: %d", lcl_ep->xrc_recv_qp_num, lcl_ep->xrc_recv_psn)); msg->data->hdr.data.xres.rem_qp_num = htonl(lcl_ep->xrc_recv_qp_num); msg->data->hdr.data.xres.rem_psn = htonl(lcl_ep->xrc_recv_psn); +#endif } for (int i = 0; i < mca_btl_openib_component.num_xrc_qps; ++i) { +#if OPAL_HAVE_XRCD + uint32_t srq_num; + if (ibv_get_srq_num(lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq, &srq_num)) { + BTL_ERROR(("BTL openib XOOB internal error: can't get srq num")); + } + BTL_VERBOSE(("Sending srq[%d] num = %d", i, srq_num)); + msg->data->qps[i].qp_num = htonl(srq_num); +#else BTL_VERBOSE(("Sending srq[%d] num = %d", i, lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq->xrc_srq_num)); msg->data->qps[i].qp_num = htonl(lcl_ep->endpoint_btl->qps[i].u.srq_qp.srq->xrc_srq_num); +#endif } rc = udcm_post_send (lcl_ep, msg->data, m->msg_length, 0); @@ -2695,7 +2815,11 @@ static int udcm_xrc_handle_xconnect (mca_btl_openib_endpoint_t *lcl_ep, udcm_msg if (UDCM_MESSAGE_XCONNECT2 == msg_hdr->type) { 
response_type = UDCM_MESSAGE_XRESPONSE2; +#if OPAL_HAVE_XRCD + rc = udcm_xrc_recv_qp_connect (lcl_ep, msg_hdr->data.xreq.rem_qp_num); +#else rc = udcm_xrc_recv_qp_connect (lcl_ep); +#endif if (OPAL_SUCCESS != rc) { /* return not ready. remote side will retry */ rej_reason = UDCM_REJ_NOT_READY;