From d8e5608053563ce48133c0f69c0c307a5a3a4c68 Mon Sep 17 00:00:00 2001
From: Jeff Squyres <jsquyres@cisco.com>
Date: Tue, 13 May 2008 16:10:34 +0000
Subject: [PATCH] Remove all retransmission code; the IBCM kernel module
 handles all of that for us.

This commit was SVN r18432.
---
 .../openib/connect/btl_openib_connect_ibcm.c  | 91 ++++++-------------
 1 file changed, 28 insertions(+), 63 deletions(-)

diff --git a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c
index c3a6ab03c8..4dd9639430 100644
--- a/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c
+++ b/ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c
@@ -12,10 +12,9 @@
  * TO-DO:
  *
  * - audit control values passed to req_send()
- * - Somehow handle retransmission of RTU; don't know how to do this 
- *   yet. :-( 
  * - More show_help() throughout 
- * - ...?
+ * - error handling in case of broken connection is not good; need to
+ *   notify btl module safely
  */
 
 /*
@@ -219,23 +218,15 @@
  *   start_connect() will have created them already), create all
  *   num_qps QPs.  Also post receive buffers on all QPs.
  *
- * The IBCM CPC handles the retransmission of request and reply
- * messages, but not RTUs.  If we get a REQ_ERROR or REP_ERROR, it
- * means that the IBCM system tried to retransmit a few times and
- * failed.  Honestly, we could probably just give up at this point,
- * but one of the point of the IBCM CPC is to work at large scale, so
- * there might be heavy-duty congestion such that the UD messages are
- * actually getting lost.  So we'll find the request / reply and
- * re-send it -- we'll do this indefinitely.  
+ * We wholly reply on the IBCM system for all retransmissions and
+ * duplicate filtering of IBCM requests, replies, and RTUs.  If IBCM
+ * reports a timeout error up to OMPI, we abort the connection.  Lists
+ * are maintained of pending IBCM requests and replies solely for
+ * error handling; request/reply timeouts are reported via CM ID.  We
+ * can cross-reference this CM ID to the endpoint that it was trying
+ * to connect via these lists.
  *
- * We wholly rely on the IBCM system to retransmit the RTU, however.
- * If the passive side doesn't receive the RTU, it'll automatically
- * retransmit the reply (or raise a REP_ERROR and have us retransmit
- * it).  If the active side had already sent the RTU, it'll recognize
- * the incoming reply as a duplicate and therefore retransmit the RTU
- * for us.  Nifty.
- *
- * However, there is a race condition: because UD is unordered, the
+ * Note that there is a race condition: because UD is unordered, the
  * first message may arrive on the QP before the RTU has arrived.
  * This will cause an IBV_EVENT_COMM_EST event to be raised, which
  * would then be picked up by the async event handler in the
@@ -387,9 +378,9 @@ typedef struct {
 static OBJ_CLASS_INSTANCE(ibcm_base_cm_id_t, opal_list_item_t, NULL, NULL);
 
 /*
- * Need to maintain a list of pending CM ID requests (in case we need
- * to retransmit).  Need to use the struct name here because it was
- * forward referenced, above.
+ * Need to maintain a list of pending CM ID requests (for error
+ * handling if the requests timeout).  Need to use the struct name
+ * here because it was forward referenced, above.
  */
 typedef struct ibcm_request_t {
     ibcm_base_cm_id_t super;
@@ -412,9 +403,9 @@ static OBJ_CLASS_INSTANCE(ibcm_request_t, ibcm_base_cm_id_t,
                           ibcm_request_cm_id_constructor, NULL);
 
 /*
- * Need to maintain a list of pending CM ID replies (in case we need
- * to retransmit).  Need to use a struct name here because it was
- * forward referenced, above.
+ * Need to maintain a list of pending CM ID replies (for error
+ * handling if the replies timeout).  Need to use a struct name here
+ * because it was forward referenced, above.
  */
 typedef struct ibcm_reply_t {
     ibcm_base_cm_id_t super;
@@ -1693,9 +1684,9 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
     opal_mutex_unlock(&ie->ie_lock);
 
     /* If logic above selected a rejection reason, reject this
-       request.  No need to cache this reject message for
-       retransmission; if the same request arrives again later, IBCM
-       will trigger a new event and we'll just reject it again. */
+       request.  Note that if the same request arrives again later,
+       IBCM will trigger a new event and we'll just reject it
+       again. */
     if (REJ_MAX != rej_reason) {
         opal_output(-1, "arbitrartion failed -- reject");
         goto reject;
@@ -1923,9 +1914,7 @@ static int reply_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
         ie->ie_recv_buffers_posted = true;
     }
 
-    /* Send the RTU -- note that we don't need to queue it up for
-       retransmission (see lengthy explanation at beginning of this
-       file). */
+    /* Send the RTU */
     rtu_data.irtud_reply = reply;
     rtu_data.irtud_qp_index = p->irepd_qp_index;
     if (0 != ib_cm_send_rtu(event->cm_id, &rtu_data, sizeof(rtu_data))) {
@@ -1933,8 +1922,8 @@ static int reply_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
         return OMPI_ERR_IN_ERRNO;
     }
 
-    /* Remove the pending request so that we don't try to retransmit
-       it */
+    /* Remove the pending request because we won't need to handle
+       errors for it */
     opal_output(-1, "reply received cm id %p -- original cached req %p",
                 (void*)cmh->listen_cm_id, (void*)request);
     opal_list_remove_item(&ibcm_pending_requests, &(request->super.super));
@@ -1967,21 +1956,18 @@ static int ready_to_use_received(ibcm_listen_cm_id_t *h,
     /* Move the QP to RTS */
     if (OMPI_SUCCESS != (rc = qp_to_rts(p->irtud_qp_index,
                                         event->cm_id, endpoint))) {
-        /* JMS */
         opal_output(-1, "ib cm rtu handler: failed move to RTS (index %d)",
                     p->irtud_qp_index);
         return rc;
     }
 
-    /* Remove the pending reply so that we don't try to retransmit
-       it */
+    /* Remove the pending reply because we won't need to handle errors
+       for it */
     opal_output(-1, "RTU received cm id %p -- original cached reply %p",
                 (void*)event->cm_id, (void*)reply);
     opal_list_remove_item(&ibcm_pending_replies, &(reply->super.super));
     OBJ_RELEASE(reply);
 
-    /* JMS Send a 0 byte message to the other side to ACK this RTU */
-
     /* Have all the QP's been connected?  If so, tell the main BTL
        that we're done. */
     if (0 == --(ie->ie_qps_to_connect)) {
@@ -2042,8 +2028,8 @@ static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
             opal_output(-1, "ibcm rej handler: WRONG_DIRECTION unexpected!");
         } else {
 
-            /* Remove from the global pending_requests list (because
-               it no longer needs to be retransmitted upon timeout) */
+            /* Remove from the global pending_requests list because we
+               no longer need to handle errors for it */
             opal_output(-1, "reply received cm id %p -- original cached req %p",
                         (void*)cmh->listen_cm_id, 
                         (void*)request);
@@ -2077,7 +2063,6 @@ static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
 
 static int request_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
 {
-    int rc;
     ibcm_request_t *req;
     opal_output(-1, "ibcm handler: request error!");
 
@@ -2100,23 +2085,13 @@ static int request_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
         return OMPI_ERR_NOT_FOUND;
     }
 
-    rc = ib_cm_send_req(event->cm_id, &(req->cm_req));
-    if (0 != rc) {
-        opal_show_help("help-mpi-btl-openib-cpc-ibcm.txt",
-                       "ib_cm function error", true,
-                       orte_process_info.nodename, 
-                       "ib_cm_send_req", rc, strerror(rc));
-        return OMPI_ERR_UNREACH;
-    }
-    opal_output(-1, "Retransmitted IBCM request (CM ID: %p)",
-                (void*)event->cm_id);
+    /* JMS need to barf this connection request appropriately */
     return OMPI_SUCCESS;
 }
 
 
 static int reply_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
 {
-    int rc;
     ibcm_reply_t *rep;
     opal_output(-1, "ibcm handler: reply error!");
 
@@ -2139,17 +2114,7 @@ static int reply_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
         return OMPI_ERR_NOT_FOUND;
     }
 
-    rc = ib_cm_send_rep(event->cm_id, &(rep->cm_rep));
-    if (0 != rc) {
-        opal_show_help("help-mpi-btl-openib-cpc-ibcm.txt",
-                       "ib_cm function error", true,
-                       orte_process_info.nodename, 
-                       "ib_cm_send_rep", rc, strerror(rc));
-        return OMPI_ERR_UNREACH;
-    }
-    opal_output(-1, "Retransmitted IBCM reply (CM ID: %p)",
-                (void*)event->cm_id);
-
+    /* JMS need to barf this connection request appropriately */
     return OMPI_SUCCESS;
 }