1
1

btl/openib: XRC: fix a bug that could cause an invalid SRQ# to be used

This commit fixes a bug that occurs when attempting a get or put
operation on an endpoint that is not yet connected. In this case
the remote_srqn may be set to an invalid value because the rem_srqs
array on the endpoint has not been populated yet. This commit moves
the use of the rem_srqs array into the internal put/get functions,
where the array is guaranteed to be populated.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2016-02-18 14:45:07 -07:00
родитель 142e38cbb2
Коммит 4dc73d7765
3 изменённых файлов: 31 добавлений и 35 удалений

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
@ -73,16 +73,7 @@ static int mca_btl_openib_atomic_internal (struct mca_btl_base_module_t *btl, st
frag->sr_desc.wr.atomic.rkey = rkey;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
frag->sr_desc.qp_type.xrc.remote_srqn = endpoint->rem_info.rem_srqs[qp].rem_srq_num;
#else
frag->sr_desc.xrc_remote_srq_num = endpoint->rem_info.rem_srqs[qp].rem_srq_num;
#endif
}
#endif
/* NTH: the SRQ# is set in mca_btl_openib_get_internal */
if (endpoint->endpoint_state != MCA_BTL_IB_CONNECTED) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights
* Copyright (c) 2006-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2008-2012 Oracle and/or its affiliates. All rights reserved.
@ -92,16 +92,6 @@ int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint
frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
}
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
}
#endif
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
@ -138,6 +128,19 @@ int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base
int qp = to_base_frag(frag)->base.order;
struct ibv_send_wr *bad_wr;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
/* NTH: the remote SRQ number is only available once the endpoint is connected. By
* setting the value here instead of mca_btl_openib_get we guarantee the rem_srqs
* array is initialized. */
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
}
#endif
/* check for a send wqe */
if (qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);

Просмотреть файл

@ -101,19 +101,6 @@ int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint
to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey;
}
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS
to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
#error "that should never happen"
#endif
}
#endif
if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
OPAL_THREAD_LOCK(&ep->endpoint_lock);
rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags);
@ -153,6 +140,21 @@ int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base
struct ibv_send_wr *bad_wr;
int rc;
#if HAVE_XRC
if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
/* NTH: the remote SRQ number is only available once the endpoint is connected. By
* setting the value here instead of mca_btl_openib_put we guarantee the rem_srqs
* array is initialized. */
#if OPAL_HAVE_CONNECTX_XRC
to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#elif OPAL_HAVE_CONNECTX_XRC_DOMAINS
to_out_frag(frag)->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
#error "that should never happen"
#endif
}
#endif
/* check for a send wqe */
if (qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);