From 02a6c6856d2bfbc5381ee4973d522b21adf4329e Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Mon, 23 Nov 2015 16:07:12 -0700 Subject: [PATCH] btl/openib: add support for mlx5 atomic operations This commit adds support for fetch-and-add and compare-and-swap when using the mlx5 driver. The support is only enabled if the expanded verbs interface is detected. This is required because mlx5 HCAs return the atomic result in network byte order. This support may need to be tweaked if Mellanox commits their changes into upstream verbs. Closes open-mpi/ompi#1077 Signed-off-by: Nathan Hjelm --- config/opal_check_openfabrics.m4 | 17 +++++++++ opal/mca/btl/openib/btl_openib.h | 2 ++ opal/mca/btl/openib/btl_openib_atomic.c | 12 ++++--- opal/mca/btl/openib/btl_openib_component.c | 24 +++++++++++-- opal/mca/btl/openib/configure.m4 | 1 + .../openib/connect/btl_openib_connect_udcm.c | 35 +++++++++++++++++++ 6 files changed, 83 insertions(+), 8 deletions(-) diff --git a/config/opal_check_openfabrics.m4 b/config/opal_check_openfabrics.m4 index dafb9ca0a0..d53f3bdd34 100644 --- a/config/opal_check_openfabrics.m4 +++ b/config/opal_check_openfabrics.m4 @@ -387,6 +387,23 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS_CM],[ fi ])dnl +AC_DEFUN([OPAL_CHECK_EXP_VERBS],[ + OPAL_VAR_SCOPE_PUSH([have_struct_ibv_exp_send_wr]) + + AC_MSG_CHECKING([whether expanded verbs are available]) + AC_TRY_COMPILE([#include <infiniband/verbs_exp.h>], [struct ibv_exp_send_wr;], + [have_struct_ibv_exp_send_wr=1 + AC_MSG_RESULT([yes])], + [have_struct_ibv_exp_send_wr=0 + AC_MSG_RESULT([no])]) + + AC_DEFINE_UNQUOTED([HAVE_EXP_VERBS], [$have_struct_ibv_exp_send_wr], [Expanded verbs]) + AC_CHECK_DECLS([IBV_EXP_ATOMIC_HCA_REPLY_BE, IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY, ibv_exp_create_qp], [], [], [#include <infiniband/verbs_exp.h>]) + AC_CHECK_HEADERS([infiniband/verbs_exp.h]) + AS_IF([test '$have_struct_ibv_exp_send_wr' = 1], [$1], [$2]) + OPAL_VAR_SCOPE_POP +])dnl + AC_DEFUN([OPAL_CHECK_MLNX_OPENFABRICS],[ $1_have_mverbs=0 $1_have_mqe=0 diff --git 
a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index a9c13e3d13..073b858e2a 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -490,6 +490,8 @@ struct mca_btl_openib_module_t { mca_btl_openib_module_qp_t * qps; int local_procs; /** number of local procs */ + + bool atomic_ops_be; /** atomic result is big endian */ }; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; diff --git a/opal/mca/btl/openib/btl_openib_atomic.c b/opal/mca/btl/openib/btl_openib_atomic.c index 6e6698877d..8d7ac86f73 100644 --- a/opal/mca/btl/openib/btl_openib_atomic.c +++ b/opal/mca/btl/openib/btl_openib_atomic.c @@ -27,6 +27,7 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st { mca_btl_openib_get_frag_t* frag = NULL; int qp = order; + int32_t rkey; int rc; frag = to_get_frag(alloc_recv_user_frag()); @@ -61,15 +62,16 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st frag->sr_desc.wr.atomic.compare_add = operand; frag->sr_desc.wr.atomic.swap = operand2; + rkey = remote_handle->rkey; + #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT if((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - frag->sr_desc.wr.atomic.rkey = opal_swap_bytes4 (remote_handle->rkey); - } else -#endif - { - frag->sr_desc.wr.atomic.rkey = remote_handle->rkey; + rkey = opal_swap_bytes4 (rkey); } +#endif + + frag->sr_desc.wr.atomic.rkey = rkey; #if HAVE_XRC if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 7c99209fd8..56732c49b2 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -822,13 +822,26 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, openib_btl->super.btl_get_local_registration_threshold = 0; #if 
HAVE_DECL_IBV_ATOMIC_HCA - if (openib_btl->device->ib_dev_attr.atomic_cap == IBV_ATOMIC_NONE) { + openib_btl->atomic_ops_be = false; + + switch (openib_btl->device->ib_dev_attr.atomic_cap) { + case IBV_ATOMIC_GLOB: + openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; + break; +#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE + case IBV_EXP_ATOMIC_HCA_REPLY_BE: + openib_btl->atomic_ops_be = true; + break; +#endif + case IBV_ATOMIC_HCA: + break; + case IBV_ATOMIC_NONE: + default: + /* no atomics or an unsupported atomic type */ openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS; openib_btl->super.btl_atomic_flags = 0; openib_btl->super.btl_atomic_fop = NULL; openib_btl->super.btl_atomic_cswap = NULL; - } else if (IBV_ATOMIC_GLOB == openib_btl->device->ib_dev_attr.atomic_cap) { - openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; } #endif @@ -3446,6 +3459,11 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, mca_btl_openib_get_frag_t *get_frag = to_get_frag(des); + /* check if atomic result needs to be byte swapped (mlx5) */ + if (openib_btl->atomic_ops_be && IBV_WC_RDMA_READ != wc->opcode) { + *((int64_t *) frag->sg_entry.addr) = ntoh64 (*((int64_t *) frag->sg_entry.addr)); + } + get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data, OPAL_SUCCESS); diff --git a/opal/mca/btl/openib/configure.m4 b/opal/mca/btl/openib/configure.m4 index 3ac6a85445..7dab3f5426 100644 --- a/opal/mca/btl/openib/configure.m4 +++ b/opal/mca/btl/openib/configure.m4 @@ -46,6 +46,7 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[ [btl_openib_happy="yes" OPAL_CHECK_OPENFABRICS_CM([btl_openib])], [btl_openib_happy="no"]) + OPAL_CHECK_EXP_VERBS([btl_openib], [], []) AS_IF([test "$btl_openib_happy" = "yes"], [# With the new openib flags, look for ibv_fork_init diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c 
b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index c29df267d7..a539ba4d08 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -56,6 +56,9 @@ #include #include #include +#ifdef HAVE_INFINIBAND_VERBS_EXP_H +#include <infiniband/verbs_exp.h> +#endif #include #include @@ -1307,7 +1310,11 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ uint32_t max_send_wr) { udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); +#if HAVE_DECL_IBV_EXP_CREATE_QP + struct ibv_exp_qp_init_attr init_attr; +#else struct ibv_qp_init_attr init_attr; +#endif size_t req_inline; int rc; @@ -1328,6 +1335,32 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ } init_attr.cap.max_send_wr = max_send_wr; +#if HAVE_DECL_IBV_EXP_CREATE_QP + /* use expanded verbs qp create to enable use of mlx5 atomics */ + init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; + init_attr.pd = m->btl->device->ib_pd; + + init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG; + init_attr.max_atomic_arg = 8; + +#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE + if (IBV_EXP_ATOMIC_HCA_REPLY_BE == m->btl->device->ib_dev_attr.atomic_cap) { + init_attr.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY; + init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + } +#endif + + while (NULL == (lcl_ep->qps[qp].qp->lcl_qp = ibv_exp_create_qp (m->btl->device->ib_dev_context, + &init_attr))) { + /* NTH: this process may be out of registered memory. try evicting an item from + the lru of this btl's mpool */ + if (false == mca_mpool_grdma_evict (m->btl->super.btl_mpool)) { + break; + } + } + +#else + while (NULL == (lcl_ep->qps[qp].qp->lcl_qp = ibv_create_qp(m->btl->device->ib_pd, &init_attr))) { /* NTH: this process may be out of registered memory. 
try evicting an item from @@ -1337,6 +1370,8 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ } } +#endif + if (NULL == lcl_ep->qps[qp].qp->lcl_qp) { opal_show_help("help-mpi-btl-openib-cpc-base.txt", "ibv_create_qp failed", true, opal_process_info.nodename,