btl/openib: fix XRC bug that could cause an invalid SRQ# to be used
This commit fixes a bug that occurs when a get or put operation is attempted on an endpoint that is not yet connected. In that case remote_srqn may be set to an invalid value, because the endpoint's rem_srqs array has not been populated yet. This commit moves the use of the rem_srqs array into the internal put/get functions, where the array is guaranteed to be populated. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
142e38cbb2
Коммит
4dc73d7765
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
@ -73,16 +73,7 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st
|
||||
|
||||
frag->sr_desc.wr.atomic.rkey = rkey;
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
frag->sr_desc.qp_type.xrc.remote_srqn = endpoint->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#else
|
||||
frag->sr_desc.xrc_remote_srq_num = endpoint->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
|
||||
/* NTH: the SRQ# is set in mca_btl_openib_get_internal */
|
||||
|
||||
if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
|
||||
@ -92,16 +92,6 @@ int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint
|
||||
frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
|
||||
}
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#else
|
||||
frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
|
||||
@ -138,6 +128,19 @@ int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base
|
||||
int qp = to_base_frag(frag)->base.order;
|
||||
struct ibv_send_wr *bad_wr;
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
|
||||
/* NTH: the remote SRQ number is only available once the endpoint is connected. By
|
||||
* setting the value here instead of mca_btl_openib_get we guarantee the rem_srqs
|
||||
* array is initialized. */
|
||||
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#else
|
||||
frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
|
@ -101,19 +101,6 @@ int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint
|
||||
to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey;
|
||||
}
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
|
||||
|
||||
#if OPAL_HAVE_CONNECTX_XRC
|
||||
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#else
|
||||
#error "that should never happen"
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags);
|
||||
@ -153,6 +140,21 @@ int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base
|
||||
struct ibv_send_wr *bad_wr;
|
||||
int rc;
|
||||
|
||||
#if HAVE_XRC
|
||||
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
|
||||
/* NTH: the remote SRQ number is only available once the endpoint is connected. By
|
||||
* setting the value here instead of mca_btl_openib_put we guarantee the rem_srqs
|
||||
* array is initialized. */
|
||||
#if OPAL_HAVE_CONNECTX_XRC
|
||||
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS
|
||||
to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
|
||||
#else
|
||||
#error "that should never happen"
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
/* check for a send wqe */
|
||||
if (qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user