diff --git a/config/opal_check_openfabrics.m4 b/config/opal_check_openfabrics.m4 index dafb9ca0a0..d53f3bdd34 100644 --- a/config/opal_check_openfabrics.m4 +++ b/config/opal_check_openfabrics.m4 @@ -387,6 +387,23 @@ AC_DEFUN([OPAL_CHECK_OPENFABRICS_CM],[ fi ])dnl +AC_DEFUN([OPAL_CHECK_EXP_VERBS],[ + OPAL_VAR_SCOPE_PUSH([have_struct_ibv_exp_send_wr]) + + AC_MSG_CHECKING([whether expanded verbs are available]) + AC_TRY_COMPILE([#include <infiniband/verbs_exp.h>], [struct ibv_exp_send_wr;], + [have_struct_ibv_exp_send_wr=1 + AC_MSG_RESULT([yes])], + [have_struct_ibv_exp_send_wr=0 + AC_MSG_RESULT([no])]) + + AC_DEFINE_UNQUOTED([HAVE_EXP_VERBS], [$have_struct_ibv_exp_send_wr], [Expanded verbs]) + AC_CHECK_DECLS([IBV_EXP_ATOMIC_HCA_REPLY_BE, IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY, ibv_exp_create_qp], [], [], [#include <infiniband/verbs_exp.h>]) + AC_CHECK_HEADERS([infiniband/verbs_exp.h]) + AS_IF([test "$have_struct_ibv_exp_send_wr" = 1], [$2], [$3]) + OPAL_VAR_SCOPE_POP +])dnl + AC_DEFUN([OPAL_CHECK_MLNX_OPENFABRICS],[ $1_have_mverbs=0 $1_have_mqe=0 diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index a9c13e3d13..073b858e2a 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -490,6 +490,8 @@ struct mca_btl_openib_module_t { mca_btl_openib_module_qp_t * qps; int local_procs; /** number of local procs */ + + bool atomic_ops_be; /** atomic result is big endian (converted to host order on completion) */ }; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; diff --git a/opal/mca/btl/openib/btl_openib_atomic.c b/opal/mca/btl/openib/btl_openib_atomic.c index 6e6698877d..8d7ac86f73 100644 --- a/opal/mca/btl/openib/btl_openib_atomic.c +++ b/opal/mca/btl/openib/btl_openib_atomic.c @@ -27,6 +27,7 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st { mca_btl_openib_get_frag_t* frag = NULL; int qp = order; + uint32_t rkey; /* remote key; unsigned 32-bit like the wr.atomic.rkey field it is stored into */ int rc; frag = to_get_frag(alloc_recv_user_frag()); @@ -61,15 +62,16 @@ static int mca_btl_openib_atomic_internal (struct 
mca_btl_base_module_t *btl, st frag->sr_desc.wr.atomic.compare_add = operand; frag->sr_desc.wr.atomic.swap = operand2; + rkey = remote_handle->rkey; /* start from the key exactly as stored in the remote handle */ + #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT if((endpoint->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - frag->sr_desc.wr.atomic.rkey = opal_swap_bytes4 (remote_handle->rkey); - } else -#endif - { - frag->sr_desc.wr.atomic.rkey = remote_handle->rkey; + rkey = opal_swap_bytes4 (rkey); /* peer and local process differ in the big-endian arch flag */ } +#endif + + frag->sr_desc.wr.atomic.rkey = rkey; #if HAVE_XRC if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) { diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 7c99209fd8..56732c49b2 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -822,13 +822,26 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, openib_btl->super.btl_get_local_registration_threshold = 0; #if HAVE_DECL_IBV_ATOMIC_HCA - if (openib_btl->device->ib_dev_attr.atomic_cap == IBV_ATOMIC_NONE) { + openib_btl->atomic_ops_be = false; /* only the IBV_EXP_ATOMIC_HCA_REPLY_BE case below turns this on */ + + switch (openib_btl->device->ib_dev_attr.atomic_cap) { + case IBV_ATOMIC_GLOB: + openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; + break; +#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE + case IBV_EXP_ATOMIC_HCA_REPLY_BE: + openib_btl->atomic_ops_be = true; + break; +#endif + case IBV_ATOMIC_HCA: /* atomics stay enabled; no extra flag is set for this capability */ + break; + case IBV_ATOMIC_NONE: + default: + /* no atomics or an unsupported atomic type: clear the atomic flags and entry points */ openib_btl->super.btl_flags &= ~MCA_BTL_FLAGS_ATOMIC_FOPS; openib_btl->super.btl_atomic_flags = 0; openib_btl->super.btl_atomic_fop = NULL; openib_btl->super.btl_atomic_cswap = NULL; - } else if (IBV_ATOMIC_GLOB == openib_btl->device->ib_dev_attr.atomic_cap) { - openib_btl->super.btl_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; } #endif @@ -3446,6 +3459,11 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, 
mca_btl_openib_get_frag_t *get_frag = to_get_frag(des); + /* check if atomic result needs to be byte swapped (mlx5); RDMA read completions are left untouched */ + if (openib_btl->atomic_ops_be && IBV_WC_RDMA_READ != wc->opcode) { + *((int64_t *)(intptr_t) frag->sg_entry.addr) = ntoh64 (*((int64_t *)(intptr_t) frag->sg_entry.addr)); + } + get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data, OPAL_SUCCESS); diff --git a/opal/mca/btl/openib/configure.m4 b/opal/mca/btl/openib/configure.m4 index 3ac6a85445..7dab3f5426 100644 --- a/opal/mca/btl/openib/configure.m4 +++ b/opal/mca/btl/openib/configure.m4 @@ -46,6 +46,7 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[ [btl_openib_happy="yes" OPAL_CHECK_OPENFABRICS_CM([btl_openib])], [btl_openib_happy="no"]) + OPAL_CHECK_EXP_VERBS([btl_openib], [], []) AS_IF([test "$btl_openib_happy" = "yes"], [# With the new openib flags, look for ibv_fork_init diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index c29df267d7..a539ba4d08 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -56,6 +56,9 @@ #include #include #include +#ifdef HAVE_INFINIBAND_VERBS_EXP_H +#include <infiniband/verbs_exp.h> +#endif #include #include @@ -1307,7 +1310,11 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ uint32_t max_send_wr) { udcm_endpoint_t *udep = UDCM_ENDPOINT_DATA(lcl_ep); +#if HAVE_DECL_IBV_EXP_CREATE_QP + struct ibv_exp_qp_init_attr init_attr; /* expanded-verbs attributes, handed to ibv_exp_create_qp */ +#else struct ibv_qp_init_attr init_attr; +#endif size_t req_inline; int rc; @@ -1328,6 +1335,32 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ } init_attr.cap.max_send_wr = max_send_wr; +#if HAVE_DECL_IBV_EXP_CREATE_QP + /* use expanded verbs qp create to enable use of mlx5 atomics */ + init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_PD; + init_attr.pd = m->btl->device->ib_pd; + 
+ init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_ATOMICS_ARG; + init_attr.max_atomic_arg = 8; /* NOTE(review): presumably the atomic operand size in bytes (64-bit) -- confirm against verbs_exp.h */ + +#if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE + if (IBV_EXP_ATOMIC_HCA_REPLY_BE == m->btl->device->ib_dev_attr.atomic_cap) { + init_attr.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY; + init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + } +#endif /* HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE */ + + while (NULL == (lcl_ep->qps[qp].qp->lcl_qp = ibv_exp_create_qp (m->btl->device->ib_dev_context, + &init_attr))) { + /* NTH: this process may be out of registered memory. try evicting an item from + the lru of this btl's mpool */ + if (false == mca_mpool_grdma_evict (m->btl->super.btl_mpool)) { + break; /* eviction returned false: stop retrying and fall through to the NULL check after the loop */ + } + } + +#else /* !HAVE_DECL_IBV_EXP_CREATE_QP */ + while (NULL == (lcl_ep->qps[qp].qp->lcl_qp = ibv_create_qp(m->btl->device->ib_pd, &init_attr))) { /* NTH: this process may be out of registered memory. try evicting an item from @@ -1337,6 +1370,8 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ } } +#endif + if (NULL == lcl_ep->qps[qp].qp->lcl_qp) { opal_show_help("help-mpi-btl-openib-cpc-base.txt", "ibv_create_qp failed", true, opal_process_info.nodename,