From e75b86d3ab1f265c528a5e5d6ecdebf67b84dd07 Mon Sep 17 00:00:00 2001
From: Rolf vandeVaart <rvandevaart@nvidia.com>
Date: Wed, 19 Jan 2011 16:09:17 +0000
Subject: [PATCH] Fix some issues from jsquyres review. 1. Use asprintf instead
 of snprintf 2. Return remote_proc where possible. 3. Remove dead code. 4. Fix
 two comment typos.

This commit was SVN r24265.
---
 ompi/mca/btl/openib/btl_openib_component.c |  8 ++++-
 ompi/mca/btl/openib/btl_openib_failover.c  | 39 ++++++++++------------
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c
index 2b14a7c3b5..08215b49b0 100644
--- a/ompi/mca/btl/openib/btl_openib_component.c
+++ b/ompi/mca/btl/openib/btl_openib_component.c
@@ -3240,6 +3240,12 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
                 while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                     btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
 #if OMPI_OPENIB_FAILOVER_ENABLED
+                    /* The check for the callback flag is only needed when running
+                     * with the failover case because there is a chance that a fragment
+                     * generated from a sendi call (which does not set the flag) gets
+                     * coalesced.  In normal operation, this cannot happen as the sendi
+                     * call will never queue up a fragment which could potentially become
+                     * a coalesced fragment.  It will revert to a regular send. */
                     if (to_base_frag(i)->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
 #endif
                         to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
@@ -3393,7 +3399,7 @@ error:
 #else
     if(openib_btl)
         openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
-                             NULL, NULL);
+                             remote_proc, NULL);
 #endif
 }
 
diff --git a/ompi/mca/btl/openib/btl_openib_failover.c b/ompi/mca/btl/openib/btl_openib_failover.c
index e8f65f4b99..a103e7fd58 100644
--- a/ompi/mca/btl/openib/btl_openib_failover.c
+++ b/ompi/mca/btl/openib/btl_openib_failover.c
@@ -60,7 +60,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
                                           ompi_proc_t* remote_proc,
                                           mca_btl_openib_endpoint_t* endpoint)
 {
-    char btlname[IBV_SYSFS_NAME_MAX];
+    char *btlname = NULL;
     int btl_ownership;
     /* Since this BTL supports failover, it will call the PML error handler
      * function with the NONFATAL flag.  If the PML is running with failover
@@ -76,12 +76,6 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
      * operation was done.  The important information needs to be read
      * from the fragment. */
 
-    /* Create a nice string to help with debug */
-    if (NULL != openib_btl) {
-        snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s",
-                 openib_btl->lid, openib_btl->device->ib_dev->name);
-    }
-
     /* Cannot issue callback to SRQ errors because the shared receive
      * queue is shared and is not specific to a connection.  There is no
      * way to figure out what type of message created the error because
@@ -98,6 +92,12 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
     }
     assert(NULL != remote_proc);
 
+    /* Create a nice string to help with debug */
+    if (NULL != openib_btl) {
+        asprintf(&btlname, "lid=%d:name=%s",
+                 openib_btl->lid, openib_btl->device->ib_dev->name);
+    }
+
     /* The next set of errors are associated with an endpoint, but not
      * with a PML descriptor.  They are not associated with a PML
      * descriptor because:
@@ -124,6 +124,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
                             "MCA_BTL_OPENIG_FRAG=%d, "
                             "dropping since connection is broken (des=%lx)",
                             openib_frag_type(des), (long unsigned int) des);
+        if (NULL != btlname) free(btlname);
         return;
     }
 
@@ -143,6 +144,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
      * the same remote_proc argument will not actually map anything out. */
     openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                          remote_proc, btlname);
+    if (NULL != btlname) free(btlname);
 
     /* Since we believe we have done a send, read or write, then the
      * des_src fields should have valid data. */
@@ -203,14 +205,6 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
         OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
     }
 
-#if 0
-    /* Since a QP has an error, let us go ahead and drain off the
-     * broken fragments.  This is not strictly necessary as we keep
-     * track of outstanding requests on any rendezvous requests.  But,
-     * I think it makes sense so we will keep it here. */
-    progress_one_device(openib_btl->device);
-#endif
-
     /* There are several queues associated with an endpoint that may
      * have some unsent fragments sitting in them.  Remove them and
      * call the callback functions with an error so the PML can send
@@ -240,14 +234,15 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
     if(mca_btl_openib_component.port_error_failover) {
         /* Since we are not specifying a specific connection to bring down,
          * the PML layer will may out the entire BTL for future communication. */
-        char btlname[IBV_SYSFS_NAME_MAX];
-        snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s",
+        char *btlname = NULL;
+        asprintf(&btlname, "lid=%d:name=%s",
                  openib_btl->lid, openib_btl->device->ib_dev->name);
         openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                              NULL, btlname);
+        if (NULL != btlname) free(btlname);
 
         /* Now send out messages to all endpoints that we are disconnecting.  
-         * Only do ths to endpoints that are connected.  Otherwise, the
+         * Only do this to endpoints that are connected.  Otherwise, the
          * remote side does not yet have the information on this endpoint.  */
         for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) {
             endpoint = (mca_btl_openib_endpoint_t*)
@@ -281,7 +276,7 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
  * one side of the connection actually sees the error.  This means we
  * can be left in a state where one side believes it has two BTLs, but
  * the other side believes it only has one.  This can cause problems.
- * In the case of the EAGER_RDMA_ERROR, we elsewhere in the code what
+ * In the case of the EAGER_RDMA_ERROR, see elsewhere in the code what
  * we are doing.
  * @param ctl_hdr Pointer control header that was received
  */
@@ -335,10 +330,10 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
                     if (MCA_BTL_IB_FAILED == newep->endpoint_state) {
                         return;
                     } else {
-                        char btlname[IBV_SYSFS_NAME_MAX];
+                        char *btlname = NULL;
                         ompi_proc_t* remote_proc = NULL;
 
-                        snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s",
+                        asprintf(&btlname, "lid=%d:name=%s",
                                  newbtl->lid, newbtl->device->ib_dev->name);
 
                         remote_proc = newep->endpoint_proc->proc_ompi;
@@ -353,6 +348,8 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
                                             newep->endpoint_state);
                         newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                                          remote_proc, btlname);
+                        if (NULL != btlname) free(btlname);
+
                         error_out_all_pending_frags(newep, &newbtl->super, true);
                         newep->endpoint_state = MCA_BTL_IB_FAILED;
                         return;