Add a function to debug messages stuck in queues.
Change all tabs to spaces. This commit was SVN r23974.
Этот коммит содержится в:
родитель
6bd41cf5d8
Коммит
c23b26a66f
@ -28,13 +28,15 @@
|
||||
#include "btl_openib_failover.h"
|
||||
|
||||
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
struct mca_btl_base_module_t* module);
|
||||
struct mca_btl_base_module_t* module,
|
||||
bool errout);
|
||||
static void mca_btl_openib_endpoint_notify(mca_btl_openib_endpoint_t *endpoint,
|
||||
uint8_t type, int index);
|
||||
#if 0
|
||||
/* debug functions that are normally not needed */
|
||||
static void dump_all_local_rdma_frags(mca_btl_openib_device_t *device);
|
||||
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);
|
||||
void dump_all_internal_queues(bool errout);
|
||||
#endif
|
||||
|
||||
/**
|
||||
@ -117,7 +119,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
||||
if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
|
||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
||||
error_out_all_pending_frags(endpoint, &openib_btl->super);
|
||||
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
||||
}
|
||||
opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
|
||||
"MCA_BTL_OPENIG_FRAG=%d, "
|
||||
@ -218,7 +220,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
||||
* The first time through will remove the unsent fragments so
|
||||
* subsequent calls are no-ops. */
|
||||
if (endpoint) {
|
||||
error_out_all_pending_frags(endpoint, &openib_btl->super);
|
||||
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
||||
}
|
||||
}
|
||||
|
||||
@ -257,7 +259,7 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
||||
if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
|
||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
error_out_all_pending_frags(endpoint, &openib_btl->super);
|
||||
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -352,7 +354,7 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
||||
newep->endpoint_state);
|
||||
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
||||
remote_proc, btlname);
|
||||
error_out_all_pending_frags(newep, &newbtl->super);
|
||||
error_out_all_pending_frags(newep, &newbtl->super, true);
|
||||
newep->endpoint_state = MCA_BTL_IB_FAILED;
|
||||
return;
|
||||
}
|
||||
@ -394,12 +396,17 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
||||
* each qp with each priority and looks for both no_credits_pending_frags
|
||||
* and no_wqe_pending_frags. It then looks for any pending_lazy_frags,
|
||||
* pending_put_frags, and pending_get_frags. This function is only
|
||||
* called when running with failover support enabled.
|
||||
* called when running with failover support enabled. Note that
|
||||
* the errout parameter allows the function to also be used as a
|
||||
* debugging tool to see if there are any fragments on any of the
|
||||
* queues.
|
||||
* @param ep Pointer to endpoint that had error
|
||||
* @param module Pointer to module that had error
|
||||
* @param errout Boolean which says whether to error them out or not
|
||||
*/
|
||||
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
struct mca_btl_base_module_t* module)
|
||||
struct mca_btl_base_module_t* module,
|
||||
bool errout)
|
||||
{
|
||||
int qp, pri, len, total, btl_ownership;
|
||||
|
||||
@ -419,6 +426,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
"IB: Checking for no_wqe_pending_frags qp=%d, "
|
||||
"pri=%d, list size=%d",
|
||||
qp, pri, len);
|
||||
if (true == errout) {
|
||||
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
|
||||
no_wqe_pending_frags[pri]))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
@ -444,7 +452,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
if( btl_ownership ) {
|
||||
mca_btl_openib_free(module, des);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
@ -455,6 +463,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
"IB: Checking for no_credits_pending_frags qp=%d, "
|
||||
"pri=%d, list size=%d",
|
||||
qp, pri, len);
|
||||
if (true == errout) {
|
||||
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
|
||||
no_credits_pending_frags[pri]))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
@ -481,7 +490,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
if( btl_ownership ) {
|
||||
mca_btl_openib_free(module, des);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -493,6 +502,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
"IB: Checking for srq pending_frags qp=%d, pri=%d, "
|
||||
"list size=%d",
|
||||
qp, pri, len);
|
||||
if (true == errout) {
|
||||
while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
|
||||
u.srq_qp.pending_frags[pri]))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
@ -518,7 +528,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
if( btl_ownership ) {
|
||||
mca_btl_openib_free(module, des);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -533,36 +543,42 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||
total += len;
|
||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||
"IB: Checking for pending_lazy_frags, list size=%d", len);
|
||||
if (true == errout) {
|
||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
des = (mca_btl_base_descriptor_t *)frag;
|
||||
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
len = opal_list_get_size(&ep->pending_put_frags);
|
||||
if (len > 0) {
|
||||
total += len;
|
||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||
"IB: Checking for pending_put_frags, list size=%d", len);
|
||||
if (true == errout) {
|
||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
des = (mca_btl_base_descriptor_t *)frag;
|
||||
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
len = opal_list_get_size(&ep->pending_get_frags);
|
||||
if (len > 0) {
|
||||
total += len;
|
||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||
"IB: Checking for pending_get_frags, list size=%d", len);
|
||||
if (true == errout) {
|
||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
|
||||
frag = (mca_btl_openib_com_frag_t *) item;
|
||||
des = (mca_btl_base_descriptor_t *)frag;
|
||||
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
opal_output_verbose(40, mca_btl_openib_component.verbose_failover,
|
||||
"IB: Finished checking for pending_frags, total moved=%d",
|
||||
@ -676,7 +692,7 @@ static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, ui
|
||||
return;
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if 0 /* debugging functions */
|
||||
/*
|
||||
* Function used for debugging problems in eager rdma.
|
||||
*/
|
||||
@ -727,4 +743,37 @@ void dump_all_local_rdma_frags(mca_btl_openib_device_t *device) {
|
||||
dump_local_rdma_frags(endpoint);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* This function is a debugging tool. If you notify a hang, you can
|
||||
* call this function from a debugger and see if there are any
|
||||
* messages stuck in any of the queues. If you call it with
|
||||
* errout=true, then it will error them out. Otherwise, it will
|
||||
* just print out the size of the queues with data in them.
|
||||
*/
|
||||
void dump_all_internal_queues(bool errout) {
|
||||
int i, j, num_eps;
|
||||
mca_btl_openib_module_t* btl;
|
||||
int total;
|
||||
mca_btl_base_endpoint_t* ep;
|
||||
struct mca_btl_base_module_t* module;
|
||||
|
||||
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||
btl = mca_btl_openib_component.openib_btls[i];
|
||||
module = &btl->super;
|
||||
num_eps = opal_pointer_array_get_size(btl->device->endpoints);
|
||||
|
||||
/* Now, find the endpoint associated with it */
|
||||
for (j = 0; j < num_eps; j++) {
|
||||
ep = (mca_btl_openib_endpoint_t*)
|
||||
opal_pointer_array_get_item(btl->device->endpoints, j);
|
||||
if (NULL == ep) {
|
||||
continue;
|
||||
}
|
||||
|
||||
total = 0;
|
||||
error_out_all_pending_frags(ep, module, errout);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* debugging functions */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user