From de934977899a3faf1ce600ce53b8c4950a6213e7 Mon Sep 17 00:00:00 2001 From: Ryan Grant Date: Wed, 4 Feb 2015 09:53:58 -0700 Subject: [PATCH] fix problem with get_pathrecord posting too many recv requests --- .../openib/connect/btl_openib_connect_sl.c | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_sl.c b/opal/mca/btl/openib/connect/btl_openib_connect_sl.c index 0d7b01b6d7..6d715bb298 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_sl.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_sl.c @@ -127,8 +127,8 @@ static int init_ud_qp(struct ibv_context *context_arg, memset(&iattr, 0, sizeof(iattr)); iattr.send_cq = cache->cq; iattr.recv_cq = cache->cq; - iattr.cap.max_send_wr = 2; - iattr.cap.max_recv_wr = 2; + iattr.cap.max_send_wr = 1; + iattr.cap.max_recv_wr = 1; iattr.cap.max_send_sge = 1; iattr.cap.max_recv_sge = 1; iattr.qp_type = IBV_QPT_UD; @@ -257,24 +257,24 @@ static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache, while (0 == got_sl_value) { ne = ibv_poll_cq(cache->cq, 1, &wc); - if (ne > 0 && - IBV_WC_SUCCESS == wc.status && - IBV_WC_RECV == wc.opcode && - wc.byte_len >= MAD_BLOCK_SIZE && - resp_mad->trans_id == req_mad->trans_id) { + if (ne > 0 && IBV_WC_RECV == wc.opcode) { + /* We only care about the status of receive work requests. */ + /* If the status of the send work request was anything other */ + /* than success, we'll eventually retransmit, so ignore them. */ if (0 == resp_mad->status && req_path_record->slid == htons(lid) && - req_path_record->dlid == htons(rem_lid)) { + req_path_record->dlid == htons(rem_lid) && + IBV_WC_SUCCESS == wc.status && + wc.byte_len >= MAD_BLOCK_SIZE && + resp_mad->trans_id == req_mad->trans_id) { /* Everything matches, so we have the desired SL */ cache->sl_values[rem_lid] = ib_path_rec_sl(resp_path_record); - got_sl_value = 1; /* still must repost recieve buf */ - } else { - /* Probably bad status, unlikely bad lid match. We will */ - /* ignore response and let it time out so that we do a */ - /* retry, but after a delay. We must make a new TID so */ - /* the SM doesn't see it as the same request. */ - req_mad->trans_id += hton64(1); + got_sl_value = 1; + break; } + /* Probably bad status, unlikely bad lid match. We will */ + /* ignore response and let it time out so that we do a */ + /* retry, but after a delay. Need to repost receive WR. */ rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr); if (0 != rc) { BTL_ERROR(("error posing receive on QP[%x] errno says: %s [%d]", @@ -295,7 +295,10 @@ static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache, MAX_GET_SL_REC_RETRIES)); return OPAL_ERROR; } - break; /* retransmit request */ + /* Need to retransmit request. We must make a new TID */ + /* so the SM doesn't see it as the same request. */ + req_mad->trans_id += hton64(1); + break; } usleep(100); /* otherwise pause before polling again */ } else if (ne < 0) {