From 56bdcd0888add3ce5f4fcea286c8a9c6588c1233 Mon Sep 17 00:00:00 2001
From: Nathan Hjelm
Date: Thu, 26 May 2016 15:58:31 -0600
Subject: [PATCH] btl/openib: fix XRC WQE calculation

Before dynamic add_procs support was committed to master, we called
add_procs with every proc in the job. The XRC code in the openib btl
took advantage of this and set the number of work queue entries (WQEs)
based on all the procs on a remote node. Since that is no longer the
case, we cannot simply increment the sd_wqe field on the queue pair.

To fix the issue, a new max_wqe field has been added to the shared XRC
address structure (ib_address_t) to keep track of the total number of
WQEs on the queue pair. If a new endpoint is added that increases the
number of WQEs and the XRC queue pair is already connected, the code
attempts to modify the number of WQEs on the queue pair with
ibv_modify_qp(). A failure is ignored because the only consequence is
that the number of active send work requests on the XRC queue pair
will be more limited.

Signed-off-by: Nathan Hjelm
---
 opal/mca/btl/openib/btl_openib_endpoint.c          | 38 +++++++++++++++++--
 opal/mca/btl/openib/btl_openib_endpoint.h          |  2 +-
 opal/mca/btl/openib/btl_openib_xrc.c               |  1 +
 opal/mca/btl/openib/btl_openib_xrc.h               |  4 ++
 .../openib/connect/btl_openib_connect_udcm.c       |  4 +-
 5 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/opal/mca/btl/openib/btl_openib_endpoint.c b/opal/mca/btl/openib/btl_openib_endpoint.c
index 277ff21dab..484beb56dc 100644
--- a/opal/mca/btl/openib/btl_openib_endpoint.c
+++ b/opal/mca/btl/openib/btl_openib_endpoint.c
@@ -183,12 +183,42 @@ endpoint_init_qp_xrc(mca_btl_base_endpoint_t *ep, const int qp)
         (mca_btl_openib_component.use_eager_rdma ?
          mca_btl_openib_component.max_eager_rdma : 0);
     mca_btl_openib_endpoint_qp_t *ep_qp = &ep->qps[qp];
+    int32_t wqe, incr = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
+    int rc;
+
+    opal_mutex_lock (&ep->ib_addr->addr_lock);
+
     ep_qp->qp = ep->ib_addr->qp;
-    ep_qp->qp->sd_wqe += mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
-    /* make sure that we don't overrun maximum supported by device */
-    if (ep_qp->qp->sd_wqe > max)
-        ep_qp->qp->sd_wqe = max;
+
+    if (ep->ib_addr->max_wqe + incr > max) {
+        /* make sure that we don't overrun maximum supported by device */
+        incr = max - ep->ib_addr->max_wqe;
+    }
+
+    wqe = ep->ib_addr->max_wqe + incr +
+        (mca_btl_openib_component.use_eager_rdma ?
+         mca_btl_openib_component.max_eager_rdma : 0);
+
+    ep->ib_addr->max_wqe += incr;
+
+    if (NULL != ep_qp->qp->lcl_qp) {
+        struct ibv_qp_attr qp_attr;
+
+        /* if this is modified the code in udcm_xrc_send_qp_create may
+         * need to be updated as well */
+        qp_attr.cap.max_recv_wr = 0;
+        qp_attr.cap.max_send_wr = wqe;
+        qp_attr.cap.max_inline_data = ep->endpoint_btl->device->max_inline_data;
+        qp_attr.cap.max_send_sge = 1;
+        qp_attr.cap.max_recv_sge = 1; /* we do not use SG list */
+        rc = ibv_modify_qp (ep_qp->qp->lcl_qp, &qp_attr, IBV_QP_CAP);
+        if (0 == rc) {
+            opal_atomic_add_32 (&ep_qp->qp->sd_wqe, incr);
+        }
+    } else {
+        ep_qp->qp->sd_wqe = ep->ib_addr->max_wqe;
+    }
     ep_qp->qp->users++;
+    opal_mutex_unlock (&ep->ib_addr->addr_lock);
 }
 
 static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp)
diff --git a/opal/mca/btl/openib/btl_openib_endpoint.h b/opal/mca/btl/openib/btl_openib_endpoint.h
index ed80aec639..c74cd5b0a6 100644
--- a/opal/mca/btl/openib/btl_openib_endpoint.h
+++ b/opal/mca/btl/openib/btl_openib_endpoint.h
@@ -141,7 +141,7 @@ typedef struct mca_btl_openib_endpoint_srq_qp_t {
 typedef struct mca_btl_openib_qp_t {
     struct ibv_qp *lcl_qp;
     uint32_t lcl_psn;
-    int32_t sd_wqe;      /**< number of available send wqe entries */
+    volatile int32_t sd_wqe;      /**< number of available send wqe entries */
     int32_t sd_wqe_inflight;
     int wqe_count;
     int users;
diff --git a/opal/mca/btl/openib/btl_openib_xrc.c b/opal/mca/btl/openib/btl_openib_xrc.c
index 1952c31b12..0b3322ba1d 100644
--- a/opal/mca/btl/openib/btl_openib_xrc.c
+++ b/opal/mca/btl/openib/btl_openib_xrc.c
@@ -125,6 +125,7 @@ static void ib_address_constructor(ib_address_t *ib_addr)
     ib_addr->lid = 0;
     ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
     ib_addr->qp = NULL;
+    ib_addr->max_wqe = 0;
     /* NTH: make the addr_lock recursive because mca_btl_openib_endpoint_connected can call
      * into the CPC with the lock held. The alternative would be to drop the lock but the
      * lock is never obtained in a critical path. */
diff --git a/opal/mca/btl/openib/btl_openib_xrc.h b/opal/mca/btl/openib/btl_openib_xrc.h
index 72e1509c1c..30313471ad 100644
--- a/opal/mca/btl/openib/btl_openib_xrc.h
+++ b/opal/mca/btl/openib/btl_openib_xrc.h
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
  * Copyright (c) 2014      Research Organization for Information Science
@@ -5,6 +6,8 @@
  * Copyright (c) 2014      Bull SAS. All rights reserved.
  * Copyright (c) 2015      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -43,6 +46,7 @@ struct ib_address_t {
     uint32_t remote_xrc_rcv_qp_num; /* remote xrc qp number */
     opal_mutex_t addr_lock; /* protection */
     mca_btl_openib_ib_addr_state_t status; /* ib port status */
+    int32_t max_wqe;
 };
 typedef struct ib_address_t ib_address_t;
 
diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
index 7920fd7aa3..29b7de3554 100644
--- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
+++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c
@@ -2542,7 +2542,7 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep)
     psn = &lcl_ep->qps[0].qp->lcl_psn;
 
     /* reserve additional wr for eager rdma credit management */
-    send_wr = lcl_ep->ib_addr->qp->sd_wqe +
+    send_wr = lcl_ep->ib_addr->max_wqe +
         (mca_btl_openib_component.use_eager_rdma ?
          mca_btl_openib_component.max_eager_rdma : 0);
 #if OPAL_HAVE_CONNECTX_XRC_DOMAINS
@@ -2554,6 +2554,8 @@ static int udcm_xrc_send_qp_create (mca_btl_base_endpoint_t *lcl_ep)
     qp_init_attr.send_cq = qp_init_attr.recv_cq = openib_btl->device->ib_cq[prio];
 
+    /* if this code is updated the code in endpoint_init_qp_xrc may need to
+     * be updated as well */
     /* no need recv queue; receives are posted to srq */
     qp_init_attr.cap.max_recv_wr = 0;
     qp_init_attr.cap.max_send_wr = send_wr;
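
---

For context, here is a minimal, self-contained sketch of the resize pattern the
patch relies on in endpoint_init_qp_xrc(): asking the provider to deepen the send
queue of an already-created QP via ibv_modify_qp() with the IBV_QP_CAP mask, and
treating a failure as benign. The helper name try_grow_send_queue and its
parameters are hypothetical, not identifiers from the patch; the cap values mirror
the ones set in the hunk above.

```c
#include <stdint.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Illustrative sketch (not part of the patch): attempt to grow the send
 * queue of a live QP to new_send_wr entries. Returns the send-queue depth
 * actually in effect afterwards. Mirrors the patch's policy: if the resize
 * fails, keep the old depth rather than erroring out, since the only
 * consequence is fewer outstanding send work requests. */
static int try_grow_send_queue (struct ibv_qp *qp, int cur_send_wr,
                                int new_send_wr, uint32_t max_inline_data)
{
    struct ibv_qp_attr qp_attr;

    if (NULL == qp || new_send_wr <= cur_send_wr) {
        /* nothing to do: no QP yet, or the queue is already deep enough */
        return cur_send_wr;
    }

    memset (&qp_attr, 0, sizeof (qp_attr));
    qp_attr.cap.max_send_wr     = new_send_wr;
    qp_attr.cap.max_recv_wr     = 0;  /* receives are posted to the SRQ */
    qp_attr.cap.max_send_sge    = 1;
    qp_attr.cap.max_recv_sge    = 1;  /* no SG list is used */
    qp_attr.cap.max_inline_data = max_inline_data;

    /* IBV_QP_CAP asks the provider to resize the queue capacities; not
     * all devices/providers support resizing a live QP */
    if (0 == ibv_modify_qp (qp, &qp_attr, IBV_QP_CAP)) {
        return new_send_wr;
    }

    return cur_send_wr;  /* benign failure: depth just stays smaller */
}
```

A caller would clamp new_send_wr against ib_dev_attr.max_qp_wr first, as the patch
does with max_wqe, and would only bump its available-WQE counter (sd_wqe here)
when the resize actually succeeds.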