From e75b86d3ab1f265c528a5e5d6ecdebf67b84dd07 Mon Sep 17 00:00:00 2001 From: Rolf vandeVaart Date: Wed, 19 Jan 2011 16:09:17 +0000 Subject: [PATCH] Fix some issues from jsquyres review. 1. Use asprintf instead of snprintf 2. Return remote_proc where possible. 3. Remove dead code. 4. Fix two comment typos. This commit was SVN r24265. --- ompi/mca/btl/openib/btl_openib_component.c | 8 ++++- ompi/mca/btl/openib/btl_openib_failover.c | 39 ++++++++++------------ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index 2b14a7c3b5..08215b49b0 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -3240,6 +3240,12 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); #if OMPI_OPENIB_FAILOVER_ENABLED + /* The check for the callback flag is only needed when running + * with the failover case because there is a chance that a fragment + * generated from a sendi call (which does not set the flag) gets + * coalesced. In normal operation, this cannot happen as the sendi + * call will never queue up a fragment which could potentially become + * a coalesced fragment. It will revert to a regular send. */ if (to_base_frag(i)->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { #endif to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint, @@ -3393,7 +3399,7 @@ error: #else if(openib_btl) openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, - NULL, NULL); + remote_proc, NULL); #endif } diff --git a/ompi/mca/btl/openib/btl_openib_failover.c b/ompi/mca/btl/openib/btl_openib_failover.c index e8f65f4b99..a103e7fd58 100644 --- a/ompi/mca/btl/openib/btl_openib_failover.c +++ b/ompi/mca/btl/openib/btl_openib_failover.c @@ -60,7 +60,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, ompi_proc_t* remote_proc, mca_btl_openib_endpoint_t* endpoint) { - char btlname[IBV_SYSFS_NAME_MAX]; + char *btlname = NULL; int btl_ownership; /* Since this BTL supports failover, it will call the PML error handler * function with the NONFATAL flag. If the PML is running with failover @@ -76,12 +76,6 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, * operation was done. The important information needs to be read * from the fragment. */ - /* Create a nice string to help with debug */ - if (NULL != openib_btl) { - snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s", - openib_btl->lid, openib_btl->device->ib_dev->name); - } - /* Cannot issue callback to SRQ errors because the shared receive * queue is shared and is not specific to a connection. There is no * way to figure out what type of message created the error because @@ -98,6 +92,12 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, } assert(NULL != remote_proc); + /* Create a nice string to help with debug */ + if (NULL != openib_btl) { + asprintf(&btlname, "lid=%d:name=%s", + openib_btl->lid, openib_btl->device->ib_dev->name); + } + /* The next set of errors are associated with an endpoint, but not * with a PML descriptor. They are not associated with a PML * descriptor because: @@ -124,6 +124,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, "MCA_BTL_OPENIG_FRAG=%d, " "dropping since connection is broken (des=%lx)", openib_frag_type(des), (long unsigned int) des); + if (NULL != btlname) free(btlname); return; } @@ -143,6 +144,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, * the same remote_proc argument will not actually map anything out. */ openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, btlname); + if (NULL != btlname) free(btlname); /* Since we believe we have done a send, read or write, then the * des_src fields should have valid data. */ @@ -203,14 +205,6 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); } -#if 0 - /* Since a QP has an error, let us go ahead and drain off the - * broken fragments. This is not strictly necessary as we keep - * track of outstanding requests on any rendezvous requests. But, - * I think it makes sense so we will keep it here. */ - progress_one_device(openib_btl->device); -#endif - /* There are several queues associated with an endpoint that may * have some unsent fragments sitting in them. Remove them and * call the callback functions with an error so the PML can send @@ -240,14 +234,15 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) { if(mca_btl_openib_component.port_error_failover) { /* Since we are not specifying a specific connection to bring down, * the PML layer will may out the entire BTL for future communication. */ - char btlname[IBV_SYSFS_NAME_MAX]; - snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s", + char *btlname = NULL; + asprintf(&btlname, "lid=%d:name=%s", openib_btl->lid, openib_btl->device->ib_dev->name); openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, NULL, btlname); + if (NULL != btlname) free(btlname); /* Now send out messages to all endpoints that we are disconnecting. - * Only do ths to endpoints that are connected. Otherwise, the + * Only do this to endpoints that are connected. Otherwise, the * remote side does not yet have the information on this endpoint. */ for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) { endpoint = (mca_btl_openib_endpoint_t*) @@ -281,7 +276,7 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) { * one side of the connection actually sees the error. This means we * can be left in a state where one side believes it has two BTLs, but * the other side believes it only has one. This can cause problems. - * In the case of the EAGER_RDMA_ERROR, we elsewhere in the code what + * In the case of the EAGER_RDMA_ERROR, see elsewhere in the code what * we are doing. * @param ctl_hdr Pointer control header that was received */ @@ -335,10 +330,10 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t if (MCA_BTL_IB_FAILED == newep->endpoint_state) { return; } else { - char btlname[IBV_SYSFS_NAME_MAX]; + char *btlname = NULL; ompi_proc_t* remote_proc = NULL; - snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s", + asprintf(&btlname, "lid=%d:name=%s", newbtl->lid, newbtl->device->ib_dev->name); remote_proc = newep->endpoint_proc->proc_ompi; @@ -353,6 +348,8 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t newep->endpoint_state); newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, btlname); + if (NULL != btlname) free(btlname); + error_out_all_pending_frags(newep, &newbtl->super, true); newep->endpoint_state = MCA_BTL_IB_FAILED; return;