Fix some issues from jsquyres review.
1. Use asprintf instead of snprintf 2. Return remote_proc where possible. 3. Remove dead code. 4. Fix two comment typos. This commit was SVN r24265.
Этот коммит содержится в:
родитель
0e921bba7f
Коммит
e75b86d3ab
@ -3240,6 +3240,12 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
|||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||||
#if OMPI_OPENIB_FAILOVER_ENABLED
|
#if OMPI_OPENIB_FAILOVER_ENABLED
|
||||||
|
/* The check for the callback flag is only needed when running
|
||||||
|
* with the failover case because there is a chance that a fragment
|
||||||
|
* generated from a sendi call (which does not set the flag) gets
|
||||||
|
* coalesced. In normal operation, this cannot happen as the sendi
|
||||||
|
* call will never queue up a fragment which could potentially become
|
||||||
|
* a coalesced fragment. It will revert to a regular send. */
|
||||||
if (to_base_frag(i)->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
if (to_base_frag(i)->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
||||||
#endif
|
#endif
|
||||||
to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
|
to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
|
||||||
@ -3393,7 +3399,7 @@ error:
|
|||||||
#else
|
#else
|
||||||
if(openib_btl)
|
if(openib_btl)
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
|
||||||
NULL, NULL);
|
remote_proc, NULL);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -60,7 +60,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
ompi_proc_t* remote_proc,
|
ompi_proc_t* remote_proc,
|
||||||
mca_btl_openib_endpoint_t* endpoint)
|
mca_btl_openib_endpoint_t* endpoint)
|
||||||
{
|
{
|
||||||
char btlname[IBV_SYSFS_NAME_MAX];
|
char *btlname = NULL;
|
||||||
int btl_ownership;
|
int btl_ownership;
|
||||||
/* Since this BTL supports failover, it will call the PML error handler
|
/* Since this BTL supports failover, it will call the PML error handler
|
||||||
* function with the NONFATAL flag. If the PML is running with failover
|
* function with the NONFATAL flag. If the PML is running with failover
|
||||||
@ -76,12 +76,6 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
* operation was done. The important information needs to be read
|
* operation was done. The important information needs to be read
|
||||||
* from the fragment. */
|
* from the fragment. */
|
||||||
|
|
||||||
/* Create a nice string to help with debug */
|
|
||||||
if (NULL != openib_btl) {
|
|
||||||
snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s",
|
|
||||||
openib_btl->lid, openib_btl->device->ib_dev->name);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Cannot issue callback to SRQ errors because the shared receive
|
/* Cannot issue callback to SRQ errors because the shared receive
|
||||||
* queue is shared and is not specific to a connection. There is no
|
* queue is shared and is not specific to a connection. There is no
|
||||||
* way to figure out what type of message created the error because
|
* way to figure out what type of message created the error because
|
||||||
@ -98,6 +92,12 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
}
|
}
|
||||||
assert(NULL != remote_proc);
|
assert(NULL != remote_proc);
|
||||||
|
|
||||||
|
/* Create a nice string to help with debug */
|
||||||
|
if (NULL != openib_btl) {
|
||||||
|
asprintf(&btlname, "lid=%d:name=%s",
|
||||||
|
openib_btl->lid, openib_btl->device->ib_dev->name);
|
||||||
|
}
|
||||||
|
|
||||||
/* The next set of errors are associated with an endpoint, but not
|
/* The next set of errors are associated with an endpoint, but not
|
||||||
* with a PML descriptor. They are not associated with a PML
|
* with a PML descriptor. They are not associated with a PML
|
||||||
* descriptor because:
|
* descriptor because:
|
||||||
@ -124,6 +124,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
"MCA_BTL_OPENIG_FRAG=%d, "
|
"MCA_BTL_OPENIG_FRAG=%d, "
|
||||||
"dropping since connection is broken (des=%lx)",
|
"dropping since connection is broken (des=%lx)",
|
||||||
openib_frag_type(des), (long unsigned int) des);
|
openib_frag_type(des), (long unsigned int) des);
|
||||||
|
if (NULL != btlname) free(btlname);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,6 +144,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
* the same remote_proc argument will not actually map anything out. */
|
* the same remote_proc argument will not actually map anything out. */
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
||||||
remote_proc, btlname);
|
remote_proc, btlname);
|
||||||
|
if (NULL != btlname) free(btlname);
|
||||||
|
|
||||||
/* Since we believe we have done a send, read or write, then the
|
/* Since we believe we have done a send, read or write, then the
|
||||||
* des_src fields should have valid data. */
|
* des_src fields should have valid data. */
|
||||||
@ -203,14 +205,6 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
|
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
|
||||||
/* Since a QP has an error, let us go ahead and drain off the
|
|
||||||
* broken fragments. This is not strictly necessary as we keep
|
|
||||||
* track of outstanding requests on any rendezvous requests. But,
|
|
||||||
* I think it makes sense so we will keep it here. */
|
|
||||||
progress_one_device(openib_btl->device);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* There are several queues associated with an endpoint that may
|
/* There are several queues associated with an endpoint that may
|
||||||
* have some unsent fragments sitting in them. Remove them and
|
* have some unsent fragments sitting in them. Remove them and
|
||||||
* call the callback functions with an error so the PML can send
|
* call the callback functions with an error so the PML can send
|
||||||
@ -240,14 +234,15 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
|||||||
if(mca_btl_openib_component.port_error_failover) {
|
if(mca_btl_openib_component.port_error_failover) {
|
||||||
/* Since we are not specifying a specific connection to bring down,
|
/* Since we are not specifying a specific connection to bring down,
|
||||||
* the PML layer will may out the entire BTL for future communication. */
|
* the PML layer will may out the entire BTL for future communication. */
|
||||||
char btlname[IBV_SYSFS_NAME_MAX];
|
char *btlname = NULL;
|
||||||
snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s",
|
asprintf(&btlname, "lid=%d:name=%s",
|
||||||
openib_btl->lid, openib_btl->device->ib_dev->name);
|
openib_btl->lid, openib_btl->device->ib_dev->name);
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
||||||
NULL, btlname);
|
NULL, btlname);
|
||||||
|
if (NULL != btlname) free(btlname);
|
||||||
|
|
||||||
/* Now send out messages to all endpoints that we are disconnecting.
|
/* Now send out messages to all endpoints that we are disconnecting.
|
||||||
* Only do ths to endpoints that are connected. Otherwise, the
|
* Only do this to endpoints that are connected. Otherwise, the
|
||||||
* remote side does not yet have the information on this endpoint. */
|
* remote side does not yet have the information on this endpoint. */
|
||||||
for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) {
|
for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) {
|
||||||
endpoint = (mca_btl_openib_endpoint_t*)
|
endpoint = (mca_btl_openib_endpoint_t*)
|
||||||
@ -281,7 +276,7 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
|||||||
* one side of the connection actually sees the error. This means we
|
* one side of the connection actually sees the error. This means we
|
||||||
* can be left in a state where one side believes it has two BTLs, but
|
* can be left in a state where one side believes it has two BTLs, but
|
||||||
* the other side believes it only has one. This can cause problems.
|
* the other side believes it only has one. This can cause problems.
|
||||||
* In the case of the EAGER_RDMA_ERROR, we elsewhere in the code what
|
* In the case of the EAGER_RDMA_ERROR, see elsewhere in the code what
|
||||||
* we are doing.
|
* we are doing.
|
||||||
* @param ctl_hdr Pointer control header that was received
|
* @param ctl_hdr Pointer control header that was received
|
||||||
*/
|
*/
|
||||||
@ -335,10 +330,10 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
|||||||
if (MCA_BTL_IB_FAILED == newep->endpoint_state) {
|
if (MCA_BTL_IB_FAILED == newep->endpoint_state) {
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
char btlname[IBV_SYSFS_NAME_MAX];
|
char *btlname = NULL;
|
||||||
ompi_proc_t* remote_proc = NULL;
|
ompi_proc_t* remote_proc = NULL;
|
||||||
|
|
||||||
snprintf(btlname, IBV_SYSFS_NAME_MAX-1, "lid=%d:name=%s",
|
asprintf(&btlname, "lid=%d:name=%s",
|
||||||
newbtl->lid, newbtl->device->ib_dev->name);
|
newbtl->lid, newbtl->device->ib_dev->name);
|
||||||
|
|
||||||
remote_proc = newep->endpoint_proc->proc_ompi;
|
remote_proc = newep->endpoint_proc->proc_ompi;
|
||||||
@ -353,6 +348,8 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
|||||||
newep->endpoint_state);
|
newep->endpoint_state);
|
||||||
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
||||||
remote_proc, btlname);
|
remote_proc, btlname);
|
||||||
|
if (NULL != btlname) free(btlname);
|
||||||
|
|
||||||
error_out_all_pending_frags(newep, &newbtl->super, true);
|
error_out_all_pending_frags(newep, &newbtl->super, true);
|
||||||
newep->endpoint_state = MCA_BTL_IB_FAILED;
|
newep->endpoint_state = MCA_BTL_IB_FAILED;
|
||||||
return;
|
return;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user