1
1

Add a function to debug messages stuck in queues.

Change all tabs to spaces.

This commit was SVN r23974.
Этот коммит содержится в:
Rolf vandeVaart 2010-11-01 14:23:34 +00:00
родитель 6bd41cf5d8
Коммит c23b26a66f

Просмотреть файл

@ -28,13 +28,15 @@
#include "btl_openib_failover.h"
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
struct mca_btl_base_module_t* module);
struct mca_btl_base_module_t* module,
bool errout);
static void mca_btl_openib_endpoint_notify(mca_btl_openib_endpoint_t *endpoint,
uint8_t type, int index);
#if 0
/* debug functions that are normally not needed */
static void dump_all_local_rdma_frags(mca_btl_openib_device_t *device);
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);
void dump_all_internal_queues(bool errout);
#endif
/**
@ -117,7 +119,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
error_out_all_pending_frags(endpoint, &openib_btl->super);
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
}
opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
"MCA_BTL_OPENIG_FRAG=%d, "
@ -218,7 +220,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
* The first time through will remove the unsent fragments so
* subsequent calls are no-ops. */
if (endpoint) {
error_out_all_pending_frags(endpoint, &openib_btl->super);
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
}
}
@ -257,7 +259,7 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
error_out_all_pending_frags(endpoint, &openib_btl->super);
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
}
}
}
@ -352,7 +354,7 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
newep->endpoint_state);
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
remote_proc, btlname);
error_out_all_pending_frags(newep, &newbtl->super);
error_out_all_pending_frags(newep, &newbtl->super, true);
newep->endpoint_state = MCA_BTL_IB_FAILED;
return;
}
@ -394,12 +396,17 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
* each qp with each priority and looks for both no_credits_pending_frags
* and no_wqe_pending_frags. It then looks for any pending_lazy_frags,
* pending_put_frags, and pending_get_frags. This function is only
* called when running with failover support enabled.
* called when running with failover support enabled. Note that
* the errout parameter allows the function to also be used as a
* debugging tool to see if there are any fragments on any of the
* queues.
* @param ep Pointer to endpoint that had error
* @param module Pointer to module that had error
* @param errout Boolean which says whether to error them out or not
*/
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
struct mca_btl_base_module_t* module)
struct mca_btl_base_module_t* module,
bool errout)
{
int qp, pri, len, total, btl_ownership;
@ -419,6 +426,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
"IB: Checking for no_wqe_pending_frags qp=%d, "
"pri=%d, list size=%d",
qp, pri, len);
if (true == errout) {
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
no_wqe_pending_frags[pri]))) {
frag = (mca_btl_openib_com_frag_t *) item;
@ -444,7 +452,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
if( btl_ownership ) {
mca_btl_openib_free(module, des);
}
}
}
}
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
@ -455,6 +463,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
"IB: Checking for no_credits_pending_frags qp=%d, "
"pri=%d, list size=%d",
qp, pri, len);
if (true == errout) {
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
no_credits_pending_frags[pri]))) {
frag = (mca_btl_openib_com_frag_t *) item;
@ -481,7 +490,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
if( btl_ownership ) {
mca_btl_openib_free(module, des);
}
}
}
}
@ -493,6 +502,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
"IB: Checking for srq pending_frags qp=%d, pri=%d, "
"list size=%d",
qp, pri, len);
if (true == errout) {
while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
u.srq_qp.pending_frags[pri]))) {
frag = (mca_btl_openib_com_frag_t *) item;
@ -518,7 +528,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
if( btl_ownership ) {
mca_btl_openib_free(module, des);
}
}
}
}
}
@ -533,36 +543,42 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
total += len;
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
"IB: Checking for pending_lazy_frags, list size=%d", len);
if (true == errout) {
while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
frag = (mca_btl_openib_com_frag_t *) item;
des = (mca_btl_base_descriptor_t *)frag;
des->des_cbfunc(module, ep, des, OMPI_ERROR);
}
}
}
len = opal_list_get_size(&ep->pending_put_frags);
if (len > 0) {
total += len;
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
"IB: Checking for pending_put_frags, list size=%d", len);
if (true == errout) {
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
frag = (mca_btl_openib_com_frag_t *) item;
des = (mca_btl_base_descriptor_t *)frag;
des->des_cbfunc(module, ep, des, OMPI_ERROR);
}
}
}
len = opal_list_get_size(&ep->pending_get_frags);
if (len > 0) {
total += len;
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
"IB: Checking for pending_get_frags, list size=%d", len);
if (true == errout) {
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
frag = (mca_btl_openib_com_frag_t *) item;
des = (mca_btl_base_descriptor_t *)frag;
des->des_cbfunc(module, ep, des, OMPI_ERROR);
}
}
}
opal_output_verbose(40, mca_btl_openib_component.verbose_failover,
"IB: Finished checking for pending_frags, total moved=%d",
@ -676,7 +692,7 @@ static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, ui
return;
}
#if 0
#if 0 /* debugging functions */
/*
* Function used for debugging problems in eager rdma.
*/
@ -727,4 +743,37 @@ void dump_all_local_rdma_frags(mca_btl_openib_device_t *device) {
dump_local_rdma_frags(endpoint);
}
}
#endif
/**
 * Debugging aid for messages stuck in the internal queues.  If you
 * notice a hang, you can call this function from a debugger.  It
 * walks every openib BTL module known to the component and, for each
 * non-NULL endpoint in that module's device endpoint array, calls
 * error_out_all_pending_frags() with the supplied errout flag.
 *
 * @param errout  If true, any fragments found on the queues are
 *                errored out.  If false, the queues are left intact
 *                and only the sizes of the queues that hold data are
 *                reported (via the verbose_failover output stream).
 */
void dump_all_internal_queues(bool errout) {
    int i, j, num_eps;
    mca_btl_openib_module_t* btl;
    mca_btl_base_endpoint_t* ep;
    struct mca_btl_base_module_t* module;

    for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        btl = mca_btl_openib_component.openib_btls[i];
        module = &btl->super;
        num_eps = opal_pointer_array_get_size(btl->device->endpoints);

        /* Visit every endpoint slot on this module's device; slots
         * in the pointer array may be empty, so skip NULL entries. */
        for (j = 0; j < num_eps; j++) {
            ep = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(btl->device->endpoints, j);
            if (NULL == ep) {
                continue;
            }
            error_out_all_pending_frags(ep, module, errout);
        }
    }
}
#endif /* debugging functions */