
Remove all retransmission code; the IBCM kernel module handles all of
that for us.

This commit was SVN r18432.
Jeff Squyres 2008-05-13 16:10:34 +00:00
parent 74bf1ae25f
commit d8e5608053


@@ -12,10 +12,9 @@
  * TO-DO:
  *
  * - audit control values passed to req_send()
- * - Somehow handle retransmission of RTU; don't know how to do this
- *   yet. :-(
  * - More show_help() throughout
- * - ...?
+ * - error handling in case of broken connection is not good; need to
+ *   notify btl module safely
  */

 /*
@@ -219,23 +218,15 @@
  * start_connect() will have created them already), create all
  * num_qps QPs. Also post receive buffers on all QPs.
  *
- * The IBCM CPC handles the retransmission of request and reply
- * messages, but not RTUs. If we get a REQ_ERROR or REP_ERROR, it
- * means that the IBCM system tried to retransmit a few times and
- * failed. Honestly, we could probably just give up at this point,
- * but one of the point of the IBCM CPC is to work at large scale, so
- * there might be heavy-duty congestion such that the UD messages are
- * actually getting lost. So we'll find the request / reply and
- * re-send it -- we'll do this indefinitely.
+ * We wholly rely on the IBCM system for all retransmissions and
+ * duplicate filtering of IBCM requests, replies, and RTUs. If IBCM
+ * reports a timeout error up to OMPI, we abort the connection. Lists
+ * are maintained of pending IBCM requests and replies solely for
+ * error handling; request/reply timeouts are reported via CM ID. We
+ * can cross-reference this CM ID to the endpoint that it was trying
+ * to connect via these lists.
  *
- * We wholly rely on the IBCM system to retransmit the RTU, however.
- * If the passive side doesn't receive the RTU, it'll automatically
- * retransmit the reply (or raise a REP_ERROR and have us retransmit
- * it). If the active side had already sent the RTU, it'll recognize
- * the incoming reply as a duplicate and therefore retransmit the RTU
- * for us. Nifty.
- *
- * However, there is a race condition: because UD is unordered, the
+ * Note that there is a race condition: because UD is unordered, the
  * first message may arrive on the QP before the RTU has arrived.
  * This will cause an IBV_EVENT_COMM_EST event to be raised, which
  * would then be picked up by the async event handler in the
@@ -387,9 +378,9 @@ typedef struct {
 static OBJ_CLASS_INSTANCE(ibcm_base_cm_id_t, opal_list_item_t, NULL, NULL);

 /*
- * Need to maintain a list of pending CM ID requests (in case we need
- * to retransmit). Need to use the struct name here because it was
- * forward referenced, above.
+ * Need to maintain a list of pending CM ID requests (for error
+ * handling if the requests time out). Need to use the struct name
+ * here because it was forward referenced, above.
  */
 typedef struct ibcm_request_t {
     ibcm_base_cm_id_t super;
@@ -412,9 +403,9 @@ static OBJ_CLASS_INSTANCE(ibcm_request_t, ibcm_base_cm_id_t,
                           ibcm_request_cm_id_constructor, NULL);

 /*
- * Need to maintain a list of pending CM ID replies (in case we need
- * to retransmit). Need to use a struct name here because it was
- * forward referenced, above.
+ * Need to maintain a list of pending CM ID replies (for error
+ * handling if the replies time out). Need to use a struct name here
+ * because it was forward referenced, above.
  */
 typedef struct ibcm_reply_t {
     ibcm_base_cm_id_t super;
@@ -1693,9 +1684,9 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
     opal_mutex_unlock(&ie->ie_lock);

     /* If logic above selected a rejection reason, reject this
-       request. No need to cache this reject message for
-       retransmission; if the same request arrives again later, IBCM
-       will trigger a new event and we'll just reject it again. */
+       request. Note that if the same request arrives again later,
+       IBCM will trigger a new event and we'll just reject it
+       again. */
     if (REJ_MAX != rej_reason) {
         opal_output(-1, "arbitration failed -- reject");
         goto reject;
@@ -1923,9 +1914,7 @@ static int reply_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
         ie->ie_recv_buffers_posted = true;
     }

-    /* Send the RTU -- note that we don't need to queue it up for
-       retransmission (see lengthy explanation at beginning of this
-       file). */
+    /* Send the RTU */
     rtu_data.irtud_reply = reply;
     rtu_data.irtud_qp_index = p->irepd_qp_index;
     if (0 != ib_cm_send_rtu(event->cm_id, &rtu_data, sizeof(rtu_data))) {
@@ -1933,8 +1922,8 @@ static int reply_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
         return OMPI_ERR_IN_ERRNO;
     }

-    /* Remove the pending request so that we don't try to retransmit
-       it */
+    /* Remove the pending request because we won't need to handle
+       errors for it */
     opal_output(-1, "reply received cm id %p -- original cached req %p",
                 (void*)cmh->listen_cm_id, (void*)request);
     opal_list_remove_item(&ibcm_pending_requests, &(request->super.super));
@@ -1967,21 +1956,18 @@ static int ready_to_use_received(ibcm_listen_cm_id_t *h,
     /* Move the QP to RTS */
     if (OMPI_SUCCESS != (rc = qp_to_rts(p->irtud_qp_index,
                                         event->cm_id, endpoint))) {
-        /* JMS */
         opal_output(-1, "ib cm rtu handler: failed move to RTS (index %d)",
                     p->irtud_qp_index);
         return rc;
     }

-    /* Remove the pending reply so that we don't try to retransmit
-       it */
+    /* Remove the pending reply because we won't need to handle errors
+       for it */
     opal_output(-1, "RTU received cm id %p -- original cached reply %p",
                 (void*)event->cm_id, (void*)reply);
     opal_list_remove_item(&ibcm_pending_replies, &(reply->super.super));
     OBJ_RELEASE(reply);

-    /* JMS Send a 0 byte message to the other side to ACK this RTU */
-
     /* Have all the QP's been connected? If so, tell the main BTL
        that we're done. */
     if (0 == --(ie->ie_qps_to_connect)) {
@@ -2042,8 +2028,8 @@ static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
             opal_output(-1, "ibcm rej handler: WRONG_DIRECTION unexpected!");
         } else {
-            /* Remove from the global pending_requests list (because
-               it no longer needs to be retransmitted upon timeout) */
+            /* Remove from the global pending_requests list because we
+               no longer need to handle errors for it */
             opal_output(-1, "reply received cm id %p -- original cached req %p",
                         (void*)cmh->listen_cm_id,
                         (void*)request);
@@ -2077,7 +2063,6 @@ static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)

 static int request_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
 {
-    int rc;
     ibcm_request_t *req;

     opal_output(-1, "ibcm handler: request error!");
@@ -2100,23 +2085,13 @@ static int request_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
         return OMPI_ERR_NOT_FOUND;
     }

-    rc = ib_cm_send_req(event->cm_id, &(req->cm_req));
-    if (0 != rc) {
-        opal_show_help("help-mpi-btl-openib-cpc-ibcm.txt",
-                       "ib_cm function error", true,
-                       orte_process_info.nodename,
-                       "ib_cm_send_req", rc, strerror(rc));
-        return OMPI_ERR_UNREACH;
-    }
-    opal_output(-1, "Retransmitted IBCM request (CM ID: %p)",
-                (void*)event->cm_id);
+    /* JMS need to barf this connection request appropriately */

     return OMPI_SUCCESS;
 }

 static int reply_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
 {
-    int rc;
     ibcm_reply_t *rep;

     opal_output(-1, "ibcm handler: reply error!");
@@ -2139,17 +2114,7 @@ static int reply_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
         return OMPI_ERR_NOT_FOUND;
     }

-    rc = ib_cm_send_rep(event->cm_id, &(rep->cm_rep));
-    if (0 != rc) {
-        opal_show_help("help-mpi-btl-openib-cpc-ibcm.txt",
-                       "ib_cm function error", true,
-                       orte_process_info.nodename,
-                       "ib_cm_send_rep", rc, strerror(rc));
-        return OMPI_ERR_UNREACH;
-    }
-    opal_output(-1, "Retransmitted IBCM reply (CM ID: %p)",
-                (void*)event->cm_id);
+    /* JMS need to barf this connection request appropriately */

     return OMPI_SUCCESS;
 }
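
For context on the "JMS need to barf this connection request appropriately"
placeholders left in both error handlers: a minimal sketch of such an abort
path, assuming this file's existing pending-request list and OPAL object
machinery, could look like the following. The helper ibcm_connect_failed()
and the super.endpoint field are invented illustration names, not actual
members of ibcm_base_cm_id_t or functions in this commit.

/* Hypothetical notification hook; the real CPC would need something
   like this to tell the openib BTL module that the endpoint is
   unreachable (see "error handling in case of broken connection" in
   the TO-DO list above). */
static void ibcm_connect_failed(mca_btl_openib_endpoint_t *endpoint);

/* Sketch only -- not part of r18432.  Called once the pending request
   matching the CM ID in the error event has been found. */
static int abort_pending_request(ibcm_request_t *req)
{
    /* IBCM has given up retransmitting this request, and we no longer
       retransmit it ourselves, so drop it from the error-handling list. */
    opal_list_remove_item(&ibcm_pending_requests, &(req->super.super));

    /* Notify the BTL that this endpoint's connection attempt failed so
       that queued sends can be completed in error (hypothetical helper
       and hypothetical endpoint field). */
    ibcm_connect_failed(req->super.endpoint);

    OBJ_RELEASE(req);
    return OMPI_SUCCESS;
}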