Add a function to debug messages stuck in queues.
Change all tabs to spaces. This commit was SVN r23974.
Этот коммит содержится в:
родитель
6bd41cf5d8
Коммит
c23b26a66f
@ -28,13 +28,15 @@
|
|||||||
#include "btl_openib_failover.h"
|
#include "btl_openib_failover.h"
|
||||||
|
|
||||||
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||||
struct mca_btl_base_module_t* module);
|
struct mca_btl_base_module_t* module,
|
||||||
|
bool errout);
|
||||||
static void mca_btl_openib_endpoint_notify(mca_btl_openib_endpoint_t *endpoint,
|
static void mca_btl_openib_endpoint_notify(mca_btl_openib_endpoint_t *endpoint,
|
||||||
uint8_t type, int index);
|
uint8_t type, int index);
|
||||||
#if 0
|
#if 0
|
||||||
/* debug functions that are normally not needed */
|
/* debug functions that are normally not needed */
|
||||||
static void dump_all_local_rdma_frags(mca_btl_openib_device_t *device);
|
static void dump_all_local_rdma_frags(mca_btl_openib_device_t *device);
|
||||||
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);
|
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);
|
||||||
|
void dump_all_internal_queues(bool errout);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -54,10 +56,10 @@ static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);
|
|||||||
* @param endpoint Pointer to endpoint that had the error
|
* @param endpoint Pointer to endpoint that had the error
|
||||||
*/
|
*/
|
||||||
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
||||||
mca_btl_base_descriptor_t *des,
|
mca_btl_base_descriptor_t *des,
|
||||||
int qp,
|
int qp,
|
||||||
ompi_proc_t* remote_proc,
|
ompi_proc_t* remote_proc,
|
||||||
mca_btl_openib_endpoint_t* endpoint)
|
mca_btl_openib_endpoint_t* endpoint)
|
||||||
{
|
{
|
||||||
char btlname[IBV_SYSFS_NAME_MAX];
|
char btlname[IBV_SYSFS_NAME_MAX];
|
||||||
int btl_ownership;
|
int btl_ownership;
|
||||||
@ -87,7 +89,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
* we need the information in the wc->imm_data field which does not
|
* we need the information in the wc->imm_data field which does not
|
||||||
* exist when we have an error. So, nothing to do here but return. */
|
* exist when we have an error. So, nothing to do here but return. */
|
||||||
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
||||||
!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
||||||
"SRQ RECV type=%d", openib_frag_type(des));
|
"SRQ RECV type=%d", openib_frag_type(des));
|
||||||
/* Need to think about returning any shared resources of the
|
/* Need to think about returning any shared resources of the
|
||||||
@ -105,23 +107,23 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
* Therefore, just drop the fragments and call up into the PML to
|
* Therefore, just drop the fragments and call up into the PML to
|
||||||
* disable this endpoint for future communication. */
|
* disable this endpoint for future communication. */
|
||||||
if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
||||||
(BTL_OPENIB_QP_TYPE_PP(qp))) ||
|
(BTL_OPENIB_QP_TYPE_PP(qp))) ||
|
||||||
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) ||
|
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) ||
|
||||||
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) {
|
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) {
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
||||||
remote_proc, btlname);
|
remote_proc, btlname);
|
||||||
/* Now that this connection has been mapped out at the PML layer,
|
/* Now that this connection has been mapped out at the PML layer,
|
||||||
* we change the state in the BTL layer. The change in the PML
|
* we change the state in the BTL layer. The change in the PML
|
||||||
* layer should prevent that we ever try to send on this BTL
|
* layer should prevent that we ever try to send on this BTL
|
||||||
* again. If we do, then this is an error case. */
|
* again. If we do, then this is an error case. */
|
||||||
if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
|
if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
|
||||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
||||||
error_out_all_pending_frags(endpoint, &openib_btl->super);
|
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
||||||
}
|
}
|
||||||
opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
|
||||||
"MCA_BTL_OPENIG_FRAG=%d, "
|
"MCA_BTL_OPENIG_FRAG=%d, "
|
||||||
"dropping since connection is broken (des=%lx)",
|
"dropping since connection is broken (des=%lx)",
|
||||||
openib_frag_type(des), (long unsigned int) des);
|
openib_frag_type(des), (long unsigned int) des);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -131,7 +133,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
|
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
||||||
"OPENIB_FRAG_RECV_USER fragment, "
|
"OPENIB_FRAG_RECV_USER fragment, "
|
||||||
"btl=%lx, continue with callbacks",
|
"btl=%lx, continue with callbacks",
|
||||||
(long unsigned int) &openib_btl->super);
|
(long unsigned int) &openib_btl->super);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -189,7 +191,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
* one actually made it successfully. */
|
* one actually made it successfully. */
|
||||||
if (0 != to_send_frag(des)->ftr) {
|
if (0 != to_send_frag(des)->ftr) {
|
||||||
mca_btl_openib_endpoint_notify(endpoint,
|
mca_btl_openib_endpoint_notify(endpoint,
|
||||||
MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
|
MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
|
||||||
(long)to_send_frag(des)->ftr - 1);
|
(long)to_send_frag(des)->ftr - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -218,7 +220,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|||||||
* The first time through will remove the unsent fragments so
|
* The first time through will remove the unsent fragments so
|
||||||
* subsequent calls are no-ops. */
|
* subsequent calls are no-ops. */
|
||||||
if (endpoint) {
|
if (endpoint) {
|
||||||
error_out_all_pending_frags(endpoint, &openib_btl->super);
|
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -257,7 +259,7 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
|||||||
if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
|
if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
|
||||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
||||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
||||||
error_out_all_pending_frags(endpoint, &openib_btl->super);
|
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -287,7 +289,7 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
|||||||
void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr)
|
void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr)
|
||||||
{
|
{
|
||||||
mca_btl_openib_broken_connection_header_t *bc_hdr =
|
mca_btl_openib_broken_connection_header_t *bc_hdr =
|
||||||
(mca_btl_openib_broken_connection_header_t*)ctl_hdr;
|
(mca_btl_openib_broken_connection_header_t*)ctl_hdr;
|
||||||
int i;
|
int i;
|
||||||
int found = false;
|
int found = false;
|
||||||
|
|
||||||
@ -317,15 +319,15 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
|||||||
(bc_hdr->vpid == newep->endpoint_proc->proc_guid.vpid)) {
|
(bc_hdr->vpid == newep->endpoint_proc->proc_guid.vpid)) {
|
||||||
opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Control message received from %d: "
|
"IB: Control message received from %d: "
|
||||||
"found match: lid=%d,"
|
"found match: lid=%d,"
|
||||||
"subnet=0x%" PRIx64 ",endpoint_state=%d",
|
"subnet=0x%" PRIx64 ",endpoint_state=%d",
|
||||||
newep->endpoint_proc->proc_guid.vpid,
|
newep->endpoint_proc->proc_guid.vpid,
|
||||||
newep->rem_info.rem_lid,
|
newep->rem_info.rem_lid,
|
||||||
newep->rem_info.rem_subnet_id,
|
newep->rem_info.rem_subnet_id,
|
||||||
newep->endpoint_state);
|
newep->endpoint_state);
|
||||||
found = true;
|
found = true;
|
||||||
/* At this point, we have found the endpoint. Now decode the
|
/* At this point, we have found the endpoint. Now decode the
|
||||||
* message type and do the appropriate action. */
|
* message type and do the appropriate action. */
|
||||||
if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) {
|
if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) {
|
||||||
/* Now that we found a match, check the state of the
|
/* Now that we found a match, check the state of the
|
||||||
* endpoint to see it is already in a failed state.
|
* endpoint to see it is already in a failed state.
|
||||||
@ -352,7 +354,7 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
|||||||
newep->endpoint_state);
|
newep->endpoint_state);
|
||||||
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
||||||
remote_proc, btlname);
|
remote_proc, btlname);
|
||||||
error_out_all_pending_frags(newep, &newbtl->super);
|
error_out_all_pending_frags(newep, &newbtl->super, true);
|
||||||
newep->endpoint_state = MCA_BTL_IB_FAILED;
|
newep->endpoint_state = MCA_BTL_IB_FAILED;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -365,26 +367,26 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
|||||||
MCA_BTL_OPENIB_RDMA_NEXT_INDEX(newep->eager_rdma_local.head);
|
MCA_BTL_OPENIB_RDMA_NEXT_INDEX(newep->eager_rdma_local.head);
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: rank=%d, control message (remote=%d), "
|
"IB: rank=%d, control message (remote=%d), "
|
||||||
"moved local head by one (new=%d)",
|
"moved local head by one (new=%d)",
|
||||||
ORTE_PROC_MY_NAME->vpid,
|
ORTE_PROC_MY_NAME->vpid,
|
||||||
newep->endpoint_proc->proc_guid.vpid,
|
newep->endpoint_proc->proc_guid.vpid,
|
||||||
newep->eager_rdma_local.head);
|
newep->eager_rdma_local.head);
|
||||||
} else {
|
} else {
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: rank=%d, control message (remote=%d), "
|
"IB: rank=%d, control message (remote=%d), "
|
||||||
"did not move local head by one (still=%d)",
|
"did not move local head by one (still=%d)",
|
||||||
ORTE_PROC_MY_NAME->vpid,
|
ORTE_PROC_MY_NAME->vpid,
|
||||||
newep->endpoint_proc->proc_guid.vpid,
|
newep->endpoint_proc->proc_guid.vpid,
|
||||||
newep->eager_rdma_local.head);
|
newep->eager_rdma_local.head);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break; /* since we found the endpoint */
|
break; /* since we found the endpoint */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (false == found) {
|
if (false == found) {
|
||||||
opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Control message: no match found");
|
"IB: Control message: no match found");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -394,12 +396,17 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
|||||||
* each qp with each priority and looks for both no_credits_pending_frags
|
* each qp with each priority and looks for both no_credits_pending_frags
|
||||||
* and no_wqe_pending_frags. It then looks for any pending_lazy_frags,
|
* and no_wqe_pending_frags. It then looks for any pending_lazy_frags,
|
||||||
* pending_put_frags, and pending_get_frags. This function is only
|
* pending_put_frags, and pending_get_frags. This function is only
|
||||||
* called when running with failover support enabled.
|
* called when running with failover support enabled. Note that
|
||||||
|
* the errout parameter allows the function to also be used as a
|
||||||
|
* debugging tool to see if there are any fragments on any of the
|
||||||
|
* queues.
|
||||||
* @param ep Pointer to endpoint that had error
|
* @param ep Pointer to endpoint that had error
|
||||||
* @param module Pointer to module that had error
|
* @param module Pointer to module that had error
|
||||||
|
* @param errout Boolean which says whether to error them out or not
|
||||||
*/
|
*/
|
||||||
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
||||||
struct mca_btl_base_module_t* module)
|
struct mca_btl_base_module_t* module,
|
||||||
|
bool errout)
|
||||||
{
|
{
|
||||||
int qp, pri, len, total, btl_ownership;
|
int qp, pri, len, total, btl_ownership;
|
||||||
|
|
||||||
@ -417,46 +424,11 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|||||||
total += len;
|
total += len;
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Checking for no_wqe_pending_frags qp=%d, "
|
"IB: Checking for no_wqe_pending_frags qp=%d, "
|
||||||
"pri=%d, list size=%d",
|
"pri=%d, list size=%d",
|
||||||
qp, pri, len);
|
qp, pri, len);
|
||||||
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
|
if (true == errout) {
|
||||||
no_wqe_pending_frags[pri]))) {
|
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
|
||||||
|
|
||||||
/* Error out any coalesced frags if they exist */
|
|
||||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
|
||||||
opal_list_item_t *i;
|
|
||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Found coalesced frag in no_wqe_pending_frags");
|
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags &
|
|
||||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
to_base_frag(i)->base.des_cbfunc(module, ep,
|
|
||||||
&to_base_frag(i)->base, OMPI_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, &to_base_frag(i)->base);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, des);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
|
|
||||||
len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
|
|
||||||
if (len > 0) {
|
|
||||||
total += len;
|
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Checking for no_credits_pending_frags qp=%d, "
|
|
||||||
"pri=%d, list size=%d",
|
|
||||||
qp, pri, len);
|
|
||||||
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
|
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
|
||||||
no_credits_pending_frags[pri]))) {
|
no_wqe_pending_frags[pri]))) {
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
frag = (mca_btl_openib_com_frag_t *) item;
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
des = (mca_btl_base_descriptor_t *)frag;
|
||||||
|
|
||||||
@ -465,8 +437,7 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|||||||
opal_list_item_t *i;
|
opal_list_item_t *i;
|
||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Found coalesced frag in "
|
"IB: Found coalesced frag in no_wqe_pending_frags");
|
||||||
"no_credits_pending_frags");
|
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags &
|
btl_ownership = (to_base_frag(i)->base.des_flags &
|
||||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||||
to_base_frag(i)->base.des_cbfunc(module, ep,
|
to_base_frag(i)->base.des_cbfunc(module, ep,
|
||||||
@ -481,7 +452,45 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|||||||
if( btl_ownership ) {
|
if( btl_ownership ) {
|
||||||
mca_btl_openib_free(module, des);
|
mca_btl_openib_free(module, des);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||||
|
len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
|
||||||
|
if (len > 0) {
|
||||||
|
total += len;
|
||||||
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
|
"IB: Checking for no_credits_pending_frags qp=%d, "
|
||||||
|
"pri=%d, list size=%d",
|
||||||
|
qp, pri, len);
|
||||||
|
if (true == errout) {
|
||||||
|
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
|
||||||
|
no_credits_pending_frags[pri]))) {
|
||||||
|
frag = (mca_btl_openib_com_frag_t *) item;
|
||||||
|
des = (mca_btl_base_descriptor_t *)frag;
|
||||||
|
|
||||||
|
/* Error out any coalesced frags if they exist */
|
||||||
|
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
||||||
|
opal_list_item_t *i;
|
||||||
|
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
||||||
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
|
"IB: Found coalesced frag in "
|
||||||
|
"no_credits_pending_frags");
|
||||||
|
btl_ownership = (to_base_frag(i)->base.des_flags &
|
||||||
|
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||||
|
to_base_frag(i)->base.des_cbfunc(module, ep,
|
||||||
|
&to_base_frag(i)->base, OMPI_ERROR);
|
||||||
|
if( btl_ownership ) {
|
||||||
|
mca_btl_openib_free(module, &to_base_frag(i)->base);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||||
|
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
||||||
|
if( btl_ownership ) {
|
||||||
|
mca_btl_openib_free(module, des);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -491,34 +500,35 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|||||||
total += len;
|
total += len;
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Checking for srq pending_frags qp=%d, pri=%d, "
|
"IB: Checking for srq pending_frags qp=%d, pri=%d, "
|
||||||
"list size=%d",
|
"list size=%d",
|
||||||
qp, pri, len);
|
qp, pri, len);
|
||||||
while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
|
if (true == errout) {
|
||||||
u.srq_qp.pending_frags[pri]))) {
|
while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
u.srq_qp.pending_frags[pri]))) {
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
frag = (mca_btl_openib_com_frag_t *) item;
|
||||||
|
des = (mca_btl_base_descriptor_t *)frag;
|
||||||
|
|
||||||
/* Error out any coalesced frags if they exist */
|
/* Error out any coalesced frags if they exist */
|
||||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
||||||
opal_list_item_t *i;
|
opal_list_item_t *i;
|
||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Found coalesced frag in SRQ pending_frags");
|
"IB: Found coalesced frag in SRQ pending_frags");
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags &
|
btl_ownership = (to_base_frag(i)->base.des_flags &
|
||||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||||
to_base_frag(i)->base.des_cbfunc(module, ep,
|
to_base_frag(i)->base.des_cbfunc(module, ep,
|
||||||
&to_base_frag(i)->base, OMPI_ERROR);
|
&to_base_frag(i)->base, OMPI_ERROR);
|
||||||
if( btl_ownership ) {
|
if( btl_ownership ) {
|
||||||
mca_btl_openib_free(module, &to_base_frag(i)->base);
|
mca_btl_openib_free(module, &to_base_frag(i)->base);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||||
|
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
||||||
|
if( btl_ownership ) {
|
||||||
|
mca_btl_openib_free(module, des);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, des);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -533,10 +543,12 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|||||||
total += len;
|
total += len;
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Checking for pending_lazy_frags, list size=%d", len);
|
"IB: Checking for pending_lazy_frags, list size=%d", len);
|
||||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
|
if (true == errout) {
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
frag = (mca_btl_openib_com_frag_t *) item;
|
||||||
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
des = (mca_btl_base_descriptor_t *)frag;
|
||||||
|
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -545,10 +557,12 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|||||||
total += len;
|
total += len;
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Checking for pending_put_frags, list size=%d", len);
|
"IB: Checking for pending_put_frags, list size=%d", len);
|
||||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
|
if (true == errout) {
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
frag = (mca_btl_openib_com_frag_t *) item;
|
||||||
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
des = (mca_btl_base_descriptor_t *)frag;
|
||||||
|
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -557,10 +571,12 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|||||||
total += len;
|
total += len;
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
||||||
"IB: Checking for pending_get_frags, list size=%d", len);
|
"IB: Checking for pending_get_frags, list size=%d", len);
|
||||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
|
if (true == errout) {
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
frag = (mca_btl_openib_com_frag_t *) item;
|
||||||
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
des = (mca_btl_base_descriptor_t *)frag;
|
||||||
|
des->des_cbfunc(module, ep, des, OMPI_ERROR);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -571,9 +587,9 @@ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|||||||
|
|
||||||
/* local callback function for completion of a failover control message */
|
/* local callback function for completion of a failover control message */
|
||||||
static void mca_btl_openib_endpoint_notify_cb(mca_btl_base_module_t* btl,
|
static void mca_btl_openib_endpoint_notify_cb(mca_btl_base_module_t* btl,
|
||||||
struct mca_btl_base_endpoint_t* endpoint,
|
struct mca_btl_base_endpoint_t* endpoint,
|
||||||
struct mca_btl_base_descriptor_t* descriptor,
|
struct mca_btl_base_descriptor_t* descriptor,
|
||||||
int status)
|
int status)
|
||||||
{
|
{
|
||||||
MCA_BTL_IB_FRAG_RETURN(descriptor);
|
MCA_BTL_IB_FRAG_RETURN(descriptor);
|
||||||
}
|
}
|
||||||
@ -676,7 +692,7 @@ static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, ui
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0 /* debugging functions */
|
||||||
/*
|
/*
|
||||||
* Function used for debugging problems in eager rdma.
|
* Function used for debugging problems in eager rdma.
|
||||||
*/
|
*/
|
||||||
@ -689,22 +705,22 @@ void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint) {
|
|||||||
opal_output(0, "Head = %d", endpoint->eager_rdma_local.head);
|
opal_output(0, "Head = %d", endpoint->eager_rdma_local.head);
|
||||||
|
|
||||||
for (i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
|
for (i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
|
||||||
frag = &headers_buf[i];
|
frag = &headers_buf[i];
|
||||||
size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);
|
size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);
|
||||||
|
|
||||||
frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
|
frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
|
||||||
size + sizeof(mca_btl_openib_footer_t));
|
size + sizeof(mca_btl_openib_footer_t));
|
||||||
to_base_frag(frag)->segment.seg_addr.pval =
|
to_base_frag(frag)->segment.seg_addr.pval =
|
||||||
((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
|
((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
|
||||||
|
|
||||||
chdr = to_base_frag(frag)->segment.seg_addr.pval;
|
chdr = to_base_frag(frag)->segment.seg_addr.pval;
|
||||||
if ((MCA_BTL_TAG_BTL == frag->hdr->tag) &&
|
if ((MCA_BTL_TAG_BTL == frag->hdr->tag) &&
|
||||||
(MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type)) {
|
(MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type)) {
|
||||||
opal_output(0, "tag[%d] is credit message", i);
|
opal_output(0, "tag[%d] is credit message", i);
|
||||||
} else {
|
} else {
|
||||||
opal_output(0, "frag[%d] size=%d,tag=%d,ftr->u.buf=%d", i, size, frag->hdr->tag,
|
opal_output(0, "frag[%d] size=%d,tag=%d,ftr->u.buf=%d", i, size, frag->hdr->tag,
|
||||||
frag->ftr->u.buf[3]);
|
frag->ftr->u.buf[3]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -724,7 +740,40 @@ void dump_all_local_rdma_frags(mca_btl_openib_device_t *device) {
|
|||||||
if(!endpoint)
|
if(!endpoint)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
dump_local_rdma_frags(endpoint);
|
dump_local_rdma_frags(endpoint);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
/**
|
||||||
|
* This function is a debugging tool. If you notify a hang, you can
|
||||||
|
* call this function from a debugger and see if there are any
|
||||||
|
* messages stuck in any of the queues. If you call it with
|
||||||
|
* errout=true, then it will error them out. Otherwise, it will
|
||||||
|
* just print out the size of the queues with data in them.
|
||||||
|
*/
|
||||||
|
void dump_all_internal_queues(bool errout) {
|
||||||
|
int i, j, num_eps;
|
||||||
|
mca_btl_openib_module_t* btl;
|
||||||
|
int total;
|
||||||
|
mca_btl_base_endpoint_t* ep;
|
||||||
|
struct mca_btl_base_module_t* module;
|
||||||
|
|
||||||
|
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||||
|
btl = mca_btl_openib_component.openib_btls[i];
|
||||||
|
module = &btl->super;
|
||||||
|
num_eps = opal_pointer_array_get_size(btl->device->endpoints);
|
||||||
|
|
||||||
|
/* Now, find the endpoint associated with it */
|
||||||
|
for (j = 0; j < num_eps; j++) {
|
||||||
|
ep = (mca_btl_openib_endpoint_t*)
|
||||||
|
opal_pointer_array_get_item(btl->device->endpoints, j);
|
||||||
|
if (NULL == ep) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
total = 0;
|
||||||
|
error_out_all_pending_frags(ep, module, errout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif /* debugging functions */
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user