From c82e468ede9f95779b6b2189390007c084c86e24 Mon Sep 17 00:00:00 2001 From: Rolf vandeVaart Date: Wed, 5 Aug 2009 22:23:26 +0000 Subject: [PATCH] Undo revision r21767 - sorry folks This commit was SVN r21769. The following SVN revision numbers were found above: r21767 --> open-mpi/ompi@41f38110ff232530f4284f5d15d490f84c2a2bc8 --- ompi/mca/bml/base/bml_base_open.c | 10 - ompi/mca/bml/bml.h | 5 - ompi/mca/bml/r2/bml_r2.c | 11 +- ompi/mca/btl/btl.h | 6 +- ompi/mca/btl/elan/btl_elan.c | 1 - ompi/mca/btl/gm/btl_gm.c | 1 - ompi/mca/btl/mx/btl_mx.c | 1 - ompi/mca/btl/ofud/btl_ofud.c | 1 - ompi/mca/btl/openib/btl_openib.c | 8 +- ompi/mca/btl/openib/btl_openib.h | 3 - ompi/mca/btl/openib/btl_openib_component.c | 404 +-------------------- ompi/mca/btl/openib/btl_openib_endpoint.c | 2 +- ompi/mca/btl/openib/btl_openib_endpoint.h | 57 +-- ompi/mca/btl/openib/btl_openib_frag.h | 1 - ompi/mca/btl/openib/btl_openib_mca.c | 5 - ompi/mca/btl/pcie/btl_pcie.c | 1 - ompi/mca/btl/portals/btl_portals.c | 1 - ompi/mca/btl/sctp/btl_sctp.c | 1 - ompi/mca/btl/self/btl_self.c | 1 - ompi/mca/btl/sm/btl_sm.c | 1 - ompi/mca/btl/tcp/btl_tcp.c | 1 - ompi/mca/btl/template/btl_template.c | 1 - ompi/mca/btl/udapl/btl_udapl.c | 1 - ompi/mca/pml/ob1/pml_ob1.c | 72 +--- ompi/mca/pml/ob1/pml_ob1.h | 1 - ompi/mca/pml/ob1/pml_ob1_component.c | 6 - ompi/mca/pml/ob1/pml_ob1_recvfrag.c | 15 +- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 1 - ompi/mca/pml/ob1/pml_ob1_sendreq.c | 3 - 29 files changed, 20 insertions(+), 602 deletions(-) diff --git a/ompi/mca/bml/base/bml_base_open.c b/ompi/mca/bml/base/bml_base_open.c index 31cf7928a2..0bbd3abbd6 100644 --- a/ompi/mca/bml/base/bml_base_open.c +++ b/ompi/mca/bml/base/bml_base_open.c @@ -39,7 +39,6 @@ int mca_bml_base_error_count; int mca_bml_base_open(void) { - int value; /* See if we've already been here */ if (++mca_bml_base_already_opened > 1) { return OMPI_SUCCESS; @@ -52,15 +51,6 @@ int mca_bml_base_open(void) return OMPI_ERROR; } - mca_base_param_reg_int_name("bml", - "base_verbose", - "Verbosity level of the BML framework", - false, false, - 0, - &value); - mca_bml_base_output = opal_output_open(NULL); - opal_output_set_verbosity(mca_bml_base_output, value); - #if OPAL_ENABLE_DEBUG_RELIABILITY do { int param, value; diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index fee3faa254..8c9556eada 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -175,7 +175,6 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_index(mca_bml_base_ * * @param index (OUT) */ -extern int mca_bml_base_output; static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_btl_array_t* array) { #if OPAL_ENABLE_DEBUG @@ -185,8 +184,6 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_b } #endif if( 1 == array->arr_size ) { - opal_output_verbose(20, mca_bml_base_output, - "%s btl selected", array->bml_btls[0].btl->btl_ifname); return &array->bml_btls[0]; /* force the return to avoid a jump */ } else { size_t current_position = array->arr_index; /* force to always start from zero */ @@ -195,8 +192,6 @@ static inline mca_bml_base_btl_t* mca_bml_base_btl_array_get_next(mca_bml_base_b } else { array->arr_index = current_position + 1; /* continue */ } - opal_output_verbose(20, mca_bml_base_output, - "%s btl selected", array->bml_btls[current_position].btl->btl_ifname); return &array->bml_btls[current_position]; } } diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index fa01dd2c60..6e0ab0200b 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -525,15 +525,12 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl mca_btl_base_module_t* ep_btl; double total_bandwidth = 0; size_t b; - int rc = 0; if(NULL == ep) return OMPI_SUCCESS; /* remove btl from eager list */ - if (mca_bml_base_btl_array_remove(&ep->btl_eager, btl)) { - rc++; - } + mca_bml_base_btl_array_remove(&ep->btl_eager, btl); /* remove btl from send list */ if(mca_bml_base_btl_array_remove(&ep->btl_send, btl)) { @@ -541,7 +538,6 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl /* compute total_bandwidth and reset max_send_size to the min of all btl's */ total_bandwidth = 0; - rc++; for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) { bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b); ep_btl = bml_btl->btl; @@ -567,10 +563,9 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl /* remove btl from RDMA list */ if(mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) { - + /* computer total bandwidth */ total_bandwidth = 0; - rc++; for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) { bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b); ep_btl = bml_btl->btl; @@ -598,7 +593,7 @@ static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl } } - return rc; + return OMPI_SUCCESS; } int mca_bml_r2_finalize( void ) diff --git a/ompi/mca/btl/btl.h b/ompi/mca/btl/btl.h index 9973b0229a..0b7b49a025 100644 --- a/ompi/mca/btl/btl.h +++ b/ompi/mca/btl/btl.h @@ -197,7 +197,6 @@ typedef uint8_t mca_btl_base_tag_t; /* error callback flags */ #define MCA_BTL_ERROR_FLAGS_FATAL 0x1 -#define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2 /** * Asynchronous callback function on completion of an operation. @@ -510,9 +509,7 @@ typedef int (*mca_btl_base_module_register_fn_t)( typedef void (*mca_btl_base_module_error_cb_fn_t)( struct mca_btl_base_module_t* btl, - int32_t flags, - struct ompi_proc_t* ompi_proc, - struct mca_btl_base_endpoint_t** newep + int32_t flags ); @@ -760,7 +757,6 @@ struct mca_btl_base_module_t { /* BTL common attributes */ mca_btl_base_component_t* btl_component; /**< pointer back to the BTL component structure */ - char btl_ifname[8]; /**< name of interface associated with btl */ size_t btl_eager_limit; /**< maximum size of first fragment -- eager send */ size_t btl_rndv_eager_limit; /**< the size of a data sent in a first fragment of rendezvous protocol */ size_t btl_max_send_size; /**< maximum send fragment size supported by the BTL */ diff --git a/ompi/mca/btl/elan/btl_elan.c b/ompi/mca/btl/elan/btl_elan.c index c22ed7874c..f0c00ae539 100644 --- a/ompi/mca/btl/elan/btl_elan.c +++ b/ompi/mca/btl/elan/btl_elan.c @@ -631,7 +631,6 @@ mca_btl_elan_register_error( struct mca_btl_base_module_t* btl, mca_btl_elan_module_t mca_btl_elan_module = { { &mca_btl_elan_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* min send fragment size */ 0, /* max send fragment size */ diff --git a/ompi/mca/btl/gm/btl_gm.c b/ompi/mca/btl/gm/btl_gm.c index 103e74a9f5..1799348c33 100644 --- a/ompi/mca/btl/gm/btl_gm.c +++ b/ompi/mca/btl/gm/btl_gm.c @@ -57,7 +57,6 @@ static int mca_btl_gm_put_nl( mca_btl_gm_module_t mca_btl_gm_module = { { &mca_btl_gm_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* min send fragment size */ 0, /* max send fragment size */ diff --git a/ompi/mca/btl/mx/btl_mx.c b/ompi/mca/btl/mx/btl_mx.c index 1b0d77dba1..186a37bed8 100644 --- a/ompi/mca/btl/mx/btl_mx.c +++ b/ompi/mca/btl/mx/btl_mx.c @@ -672,7 +672,6 @@ int mca_btl_mx_ft_event(int state) { mca_btl_mx_module_t mca_btl_mx_module = { { &mca_btl_mx_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* min send fragment size */ 0, /* max send fragment size */ diff --git a/ompi/mca/btl/ofud/btl_ofud.c b/ompi/mca/btl/ofud/btl_ofud.c index 6d4bf0ea9f..1cc0089519 100644 --- a/ompi/mca/btl/ofud/btl_ofud.c +++ b/ompi/mca/btl/ofud/btl_ofud.c @@ -42,7 +42,6 @@ mca_btl_ud_module_t mca_btl_ofud_module = { { &mca_btl_ofud_component.super, - "unknown", 0, /* eager_limit */ 0, /* min_send_size */ 0, /* max_send_size */ diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index ed56d5d3ce..36f2df0af2 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -67,7 +67,6 @@ mca_btl_openib_module_t mca_btl_openib_module = { { &mca_btl_openib_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* min send fragment size */ 0, /* max send fragment size */ @@ -639,7 +638,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc( assert(qp != MCA_BTL_NO_ORDER); if(mca_btl_openib_component.use_message_coalescing && - (flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) && !(flags & MCA_BTL_IB_NO_COALESCE)) { + (flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY); sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio], &ep->qps[qp].qp->lock, ep, size); @@ -1190,8 +1189,6 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl, if(!ib_rc) { OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return OMPI_SUCCESS; - } else { - opal_output(0, "Error from ibv_post_send()"); } /* Failed to send, do clean up all allocated resources */ @@ -1222,9 +1219,6 @@ cant_send: OPAL_THREAD_UNLOCK(&ep->endpoint_lock); /* We can not send the data directly, so we just return descriptor */ *descriptor = mca_btl_openib_alloc(btl, ep, order, size, flags); -#if 0 - opal_output(0, "Failed to send during sendi, send frag=%d back up", *descriptor); -#endif return OMPI_ERR_RESOURCE_BUSY; } /* diff --git a/ompi/mca/btl/openib/btl_openib.h b/ompi/mca/btl/openib/btl_openib.h index 03a9003ed0..de05938172 100644 --- a/ompi/mca/btl/openib/btl_openib.h +++ b/ompi/mca/btl/openib/btl_openib.h @@ -54,7 +54,6 @@ BEGIN_C_DECLS #define MCA_BTL_IB_LEAVE_PINNED 1 #define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll #define MCA_BTL_IB_PKEY_MASK 0x7fff -#define MCA_BTL_IB_NO_COALESCE 0x4000 /*--------------------------------------------------------------------*/ @@ -255,8 +254,6 @@ struct mca_btl_openib_component_t { ompi_free_list_t recv_user_free; /**< frags for coalesced massages */ ompi_free_list_t send_free_coalesced; - /** < whether to enable HCA failover mechanism */ - bool enable_hca_failover; }; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t; OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component; diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index db4eac971f..6c6f3e6b9c 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -66,7 +66,6 @@ const char *ibv_get_sysfs_path(void); #include "orte/runtime/orte_globals.h" #include "orte/mca/notifier/notifier.h" -#include "ompi/mca/pml/ob1/pml_ob1_hdr.h" /* For debugging only */ #include "ompi/proc/proc.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/mpool/base/base.h" @@ -647,7 +646,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl; openib_btl->device = device; - strncpy(openib_btl->super.btl_ifname, ibv_get_device_name(device->ib_dev), 7); openib_btl->port_num = (uint8_t) port_num; openib_btl->pkey_index = pkey_index; openib_btl->lid = lid; @@ -2812,250 +2810,6 @@ static void progress_pending_frags_srq(mca_btl_openib_module_t* openib_btl, } } -/** - * Take an existing frag and move it to another endpoint. We first - * allocate a new fragment from the new btl. We then copy over various - * fields from the old fragment to the new one. Then we copy the - * actually data that is to be transferred. This includes the openib - * header, the PML header, and all the data. - */ -static void mca_btl_openib_move_frag(mca_btl_openib_endpoint_t* ep, - mca_btl_openib_com_frag_t* oldfrag) -{ - mca_btl_openib_com_frag_t* frag; - mca_btl_base_descriptor_t* olddes; - mca_btl_base_descriptor_t* des; - int coalesced_len, retval; - - if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - OPAL_OUTPUT((-1, "INFO: Reposting to unconnected endpoint")); - } - - olddes = (mca_btl_base_descriptor_t *)oldfrag; - - /* Check to see if this was a coalesced fragment. If so, then - * first walk through each coalesced fragment, turn it into a send - * fragment, and repost. */ - coalesced_len = opal_list_get_size(&to_send_frag(olddes)->coalesced_frags); - if (coalesced_len > 0) { - mca_btl_openib_control_header_t *ctrl_hdr; - mca_btl_openib_header_coalesced_t *clsc_hdr; - opal_list_item_t *i; - mca_btl_base_descriptor_t* coalesced_des; - OPAL_OUTPUT((-1, "INFO: Reposting coalesced fragments")); - while((i = opal_list_remove_first(&to_send_frag(olddes)->coalesced_frags))) { - - frag = (mca_btl_openib_com_frag_t *) - mca_btl_openib_alloc((mca_btl_base_module_t *)ep->endpoint_btl, - ep, to_base_frag(i)->base.order, - to_base_frag(i)->segment.seg_len, - to_base_frag(i)->base.des_flags | MCA_BTL_IB_NO_COALESCE); - - coalesced_des = (mca_btl_base_descriptor_t *)i; - - /* First adjust the values in the descriptor portion of the fragment */ - des = (mca_btl_base_descriptor_t*)frag; - des->des_cbfunc = coalesced_des->des_cbfunc; - des->des_cbdata = coalesced_des->des_cbdata; - - /* Now adjust fragment specific information */ - frag->endpoint = ep; - - /* Finally copy over the data that is actually being transmitted */ - memcpy(to_base_frag(frag)->segment.seg_addr.pval, to_base_frag(i)->segment.seg_addr.pval, - to_base_frag(i)->segment.seg_len); - to_base_frag(frag)->segment.seg_len = to_base_frag(i)->segment.seg_len; - - /* Restore the PML fragment type header used for callbacks */ - clsc_hdr = (mca_btl_openib_header_coalesced_t *) to_coalesced_frag(i)->hdr; - to_send_frag(frag)->hdr->tag = clsc_hdr->tag; - - OPAL_OUTPUT((0, "Tag pulled from old coalesced frag: tag=%d", clsc_hdr->tag)); - - /* Set to zero just to be safe */ - to_send_frag(frag)->hdr->cm_seen = 0; - to_send_frag(frag)->hdr->credits = 0; - - /* This function will either post the send or queue it up if the resource - * is busy. The resource could be busy if it is out of credits or out of - * wqe's. If we get something other then resource busy or success, then - * we will error out entirely as an unrecoverable error. */ - retval = mca_btl_openib_endpoint_send(ep, (mca_btl_openib_send_frag_t*)frag); - if ((OMPI_SUCCESS != retval) && (OMPI_ERR_RESOURCE_BUSY != retval)) { - ep->endpoint_btl->error_cb(&ep->endpoint_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); - } - } - } - - /* Now move the actual frag that caused the error */ - frag = (mca_btl_openib_com_frag_t *) - mca_btl_openib_alloc((mca_btl_base_module_t *)ep->endpoint_btl, - ep, to_base_frag(oldfrag)->base.order, - to_base_frag(oldfrag)->segment.seg_len, - to_base_frag(oldfrag)->base.des_flags | MCA_BTL_IB_NO_COALESCE); - - OPAL_OUTPUT((-1, "Changing frag=%lx,btl=%s to frag=%lx,btl=%s, copying %d bytes\n", - oldfrag, oldfrag->endpoint->endpoint_btl->super.btl_ifname, - frag, ep->endpoint_btl->super.btl_ifname, - to_base_frag(oldfrag)->segment.seg_len)); - - /* First adjust the values in the descriptor portion of the fragment. - * Note that I do not currently set the des_context value. This field - * is often set to the bml endpoint when the frag is created. Not sure - * if I will ultimately need that. */ - des = (mca_btl_base_descriptor_t*)frag; - des->des_cbfunc = olddes->des_cbfunc; - des->des_cbdata = olddes->des_cbdata; - - /* Now adjust fragment specific information */ - frag->endpoint = ep; - - /* Finally copy over the data that is actually being transmitted */ - memcpy(to_base_frag(frag)->segment.seg_addr.pval, to_base_frag(oldfrag)->segment.seg_addr.pval, - to_base_frag(oldfrag)->segment.seg_len); - to_base_frag(frag)->segment.seg_len = to_base_frag(oldfrag)->segment.seg_len; - - /* Set the fields in the mca_btl_openib_header_t. The fields consist of: - * mca_btl_base_tag_t tag - * uint8_t cm_seen; - * uint16_t credits; - * The tag field gets the tag from the old fragment. The other two fields - * are set to zero. */ - - if (coalesced_len > 0) { - /* A coalesced fragment has the tag field in a different location */ - mca_btl_openib_control_header_t *ctrl_hdr; - mca_btl_openib_header_coalesced_t *clsc_hdr; - /* Peel off the old PML tag from the header information. Need to work past - * the openib_header and control_header to get to coalesce_header */ - ctrl_hdr = (mca_btl_openib_control_header_t*)(to_send_frag(oldfrag)->hdr + 1); - clsc_hdr = (mca_btl_openib_header_coalesced_t*)(ctrl_hdr + 1); - to_send_frag(frag)->hdr->tag = clsc_hdr->tag; - } else { - /* For normal send headers, copy over the tag. */ - to_send_frag(frag)->hdr->tag = to_send_frag(oldfrag)->hdr->tag; - } - to_send_frag(frag)->hdr->cm_seen = 0; - to_send_frag(frag)->hdr->credits = 0; - - /* This function will either post the send or queue it up if the resource - * is busy. The resource could be busy if it is out of credits or out of - * wqe's. If we get something other then resource busy or success, then - * we will error out entirely as an unrecoverable error. */ - retval = mca_btl_openib_endpoint_send(ep, (mca_btl_openib_send_frag_t*)frag); - if ((OMPI_SUCCESS != retval) && (OMPI_ERR_RESOURCE_BUSY != retval)) { - ep->endpoint_btl->error_cb(&ep->endpoint_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); - } - - /* Some extra debugging tool. Should be removed eventually. This prints - * out the PML header that is in the newly created fragment. */ - { - mca_pml_ob1_common_hdr_t* hdr; - mca_pml_ob1_match_hdr_t* mhdr; - mca_pml_ob1_frag_hdr_t* fhdr; - uint8_t type; - - hdr = (mca_pml_ob1_common_hdr_t*)des->des_src->seg_addr.pval; - type = hdr->hdr_type; - switch (type) { - case MCA_PML_OB1_HDR_TYPE_MATCH: - mhdr = (mca_pml_ob1_match_hdr_t*)hdr; - OPAL_OUTPUT((-1, "MATCH,frag=%d,tag=%d,src=%d,seq=%d", - frag, mhdr->hdr_tag, mhdr->hdr_src, mhdr->hdr_seq)); - break; - case MCA_PML_OB1_HDR_TYPE_FRAG: - fhdr = (mca_pml_ob1_frag_hdr_t*)hdr; - OPAL_OUTPUT((-1, "FRAG,frag=%lx,rreq=%lx,len=%d,offset=%d", - frag, fhdr->hdr_dst_req.pval, to_base_frag(frag)->segment.seg_len, - fhdr->hdr_frag_offset)); - break; - case MCA_PML_OB1_HDR_TYPE_RNDV: - OPAL_OUTPUT((-1, "RNDV,frag=%lx", frag)); - break; - case MCA_PML_OB1_HDR_TYPE_ACK: - OPAL_OUTPUT((-1, "ACK,frag=%lx", frag)); - break; - default: - OPAL_OUTPUT((-1, "OTHER,frag=%lx", frag)); - } - } -} - -/** - * This function will move all the pending fragments from one endpoint - * to another. It walks through each qp with each priority and looks - * for both no_credits_pending_frags and no_wqe_pending_frags and - * moves any it finds. This is called when we detect an error on a - * btl and we are trying to recover. - */ -static void move_all_pending_frags(mca_btl_base_endpoint_t *old_ep, - mca_btl_base_endpoint_t *new_ep) -{ - int qp, pri, rc, len, total; - opal_list_item_t *item; - mca_btl_openib_com_frag_t* frag; - - total = 0; - /* Traverse all QPs and all priorities and move to other endpoint */ - for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { - for (pri = 0; pri < 2; ++pri) { - /* All types of qp's have a no_wqe_pending_frags list */ - len = opal_list_get_size(&old_ep->qps[qp].no_wqe_pending_frags[pri]); - if (len > 0) { - total += len; - opal_output(0, "Checking for no_wqe_pending_frags qp=%d, pri=%d, list size=%d", - qp, pri, len); - while (NULL != (item = opal_list_remove_first(&old_ep->qps[qp]. - no_wqe_pending_frags[pri]))) { - frag = (mca_btl_openib_com_frag_t *) item; - mca_btl_openib_move_frag(new_ep, frag); - } - } - if (BTL_OPENIB_QP_TYPE_PP(qp)) { - len = opal_list_get_size(&old_ep->qps[qp].no_credits_pending_frags[pri]); - if (len > 0) { - total += len; - opal_output(0, "Checking for no_credits_pending_frags qp=%d, pri=%d, list size=%d", - qp, pri, len); - while (NULL != (item = opal_list_remove_first(&old_ep->qps[qp]. - no_credits_pending_frags[pri]))) { - frag = (mca_btl_openib_com_frag_t *) item; - mca_btl_openib_move_frag(new_ep, frag); - } - } - - } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) { - len = opal_list_get_size(&old_ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]); - if (len > 0) { - total += len; - opal_output(0, "Checking for srq pending_frags qp=%d, pri=%d, list size=%d", - qp, pri, len); - while (NULL != (item = opal_list_remove_first(&old_ep->endpoint_btl->qps[qp]. - u.srq_qp.pending_frags[pri]))) { - frag = (mca_btl_openib_com_frag_t *) item; - mca_btl_openib_move_frag(new_ep, frag); - } - } - } - } - } - - /* Check for any frags from a connection that was never made. Not sure if this - * can actually happen. */ - len = opal_list_get_size(&old_ep->pending_lazy_frags); - if (len > 0) { - total += len; - opal_output(0, "Checking for pending_lazy_frags, list size=%d", len); - while (NULL != (item = opal_list_remove_first(&(old_ep->pending_lazy_frags)))) { - frag = (mca_btl_openib_com_frag_t *) item; - mca_btl_openib_move_frag(new_ep, frag); - } - } - - OPAL_OUTPUT((-1, "Finished checking for pending_frags, total moved=%d", - total)); -} - static char *cq_name[] = {"HP CQ", "LP CQ"}; static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, struct ibv_wc *wc) @@ -3064,11 +2818,9 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, mca_btl_openib_com_frag_t* frag; mca_btl_base_descriptor_t *des; mca_btl_openib_endpoint_t* endpoint; - mca_btl_openib_endpoint_t* newep; mca_btl_openib_module_t *openib_btl = NULL; ompi_proc_t* remote_proc = NULL; int qp, btl_ownership; - int holdon = 1; des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id; frag = to_com_frag(des); @@ -3082,32 +2834,6 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, if(endpoint) openib_btl = endpoint->endpoint_btl; - /* These are the three types of fragments we have seen so far */ - if ((openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) && - (openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_SEND) && - (openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_CONTROL)) { - OPAL_OUTPUT((0, "Fragment is type %d, size=%d", openib_frag_type(des), (int)wc->byte_len)); - } - - /* Quiet some of the receive frag errors */ - if (openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) { - OPAL_OUTPUT((-1, "Fragment is type %d, size=%d", openib_frag_type(des), (int)wc->byte_len)); - OPAL_OUTPUT((-1, "\nCQ btl=%s: status=%s(%d),wr_id=%d,opcode=%d", - openib_btl->super.btl_ifname, - btl_openib_component_status_to_string(wc->status), - wc->status, (void *)(uintptr_t)wc->wr_id, wc->opcode)); - if (des->des_src) { - mca_pml_ob1_frag_hdr_t* hdr = (mca_pml_ob1_frag_hdr_t*)des->des_src->seg_addr.pval; - if (MCA_PML_OB1_HDR_TYPE_FRAG == hdr->hdr_common.hdr_type) { - OPAL_OUTPUT((-1, "frag=TYPE_FRAG,offset=%d", hdr->hdr_frag_offset)); - } else if (MCA_PML_OB1_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type) { - OPAL_OUTPUT((-1, "frag=TYPE_RNDV")); - } else { - OPAL_OUTPUT((-1, "frag=OTHER")); - } - } - } - if(wc->status != IBV_WC_SUCCESS) { OPAL_OUTPUT((-1, "Got WC: ERROR")); goto error; @@ -3173,7 +2899,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, /* Process a RECV */ if(btl_openib_handle_incoming(openib_btl, endpoint, to_recv_frag(frag), wc->byte_len) != OMPI_SUCCESS) { - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); + openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL); break; } @@ -3190,7 +2916,7 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, default: BTL_ERROR(("Unhandled work completion opcode is %d", wc->opcode)); if(openib_btl) - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); + openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL); break; } @@ -3216,7 +2942,6 @@ error: } #endif -#if 0 if(IBV_WC_WR_FLUSH_ERR != wc->status || !flush_err_printed[cq]++) { BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s " "status number %d for wr_id %" PRIx64 " opcode %d vendor error %d qp_idx %d", @@ -3232,13 +2957,7 @@ error: wc->status, wc->wr_id, wc->opcode, wc->vendor_err, qp); } - if (openib_frag_type(des) != MCA_BTL_OPENIB_FRAG_RECV) { - OPAL_OUTPUT((0, "Error on btl=%s: wc->status=%s(%d), wc->wr_id=%d", - openib_btl->super.btl_ifname, - btl_openib_component_status_to_string(wc->status), - wc->status, (void *)(uintptr_t)wc->wr_id)); - } - + if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status || IBV_WC_RETRY_EXC_ERR == wc->status) { char *peer_hostname = @@ -3274,97 +2993,9 @@ error: device_name, peer_hostname); } } -#endif - /* If failover is not enabled, just error out like we always did */ - if(!mca_btl_openib_component.enable_hca_failover) { - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); - } - /* Here is where we figure out what to do with the unsent fragment. To keep - * things clear, I handle each one differently. - * Note: In the wc struct, these are the only valid fields with an error: - * wc->wr_id, wc->status, wc->vendor_err, wc->qp_num. - * This means we cannot key off of the wc->opcode to see what operation we did. - - /* Drop any errors receiving on a PP connection. There is nothing else to do */ - if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && (BTL_OPENIB_QP_TYPE_PP(qp))) { - OPAL_OUTPUT((-1, "RECV or CONTROL, dropping since connection is broken (des=%d)", des)); - return; - } - - /* Drop any CONTROL messages as they are only valid on this connection. */ - if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) { - OPAL_OUTPUT((-1, "RECV or CONTROL, dropping since connection is broken (des=%d)", des)); - return; - } - - /* MCA_BTL_OPENIB_FRAG_EAGER_RDMA is a openib specific control message - * used to set up eager RDMA on a connection. Since the connection - * is broken, just drop it. */ - if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA) { - OPAL_OUTPUT((-1, "OPENIB_FRAG_EAGER_RDMA, dropping since connection is broken (des=%d)", des)); - } - - if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && !BTL_OPENIB_QP_TYPE_PP(qp)) { - OPAL_OUTPUT((0, "SRQ RECV type=%d, size=%d", openib_frag_type(des), (int)wc->byte_len)); - return; -#if 0 - while (holdon) { - holdon++; - opal_output(0, "SRQ RECV DETECTED - ATTACH DEBUGGER"); - sleep(5); - } -#endif - } - -#if 0 - /* If we get an error on a receive then just map out the interface - * for any future sends. There is nothin to retransmit. - * NOTE: Not sure what to do with this yet */ - if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && (BTL_OPENIB_QP_TYPE_PP(qp))) || - (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL)) { - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, &newep); - return; - } -#endif - -#if 0 - /* For shared receive queues, we need to return the fragments and - * repost the receives since they are a shared resource. For - * peer-to-peer queues, we do nothing. - * NOTE: Not sure what to do here yet. I cannot get the btl or the endpoint - * from the fragment that is returned. Usually, the endpoint is retrieved via - * the immediate data, but obviously the immediate data is non-existant on an - * error. All I really need is the btl but I am not sure where I get that - * from. I have observed that I am not getting many errors on the receive - * so I will not worry now about reposting them. */ - if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && !BTL_OPENIB_QP_TYPE_PP(qp)) { - OPAL_OUTPUT((0, "SRQ RECV type=%d, size=%d", openib_frag_type(des), (int)wc->byte_len)); - MCA_BTL_IB_FRAG_RETURN(frag); - mca_btl_openib_module_t *btl = endpoint->endpoint_btl; - OPAL_THREAD_ADD32(&btl->qps[qp].u.srq_qp.rd_posted, -1); - mca_btl_openib_post_srr(btl, qp); - return; - } -#endif - - /* Need to keep calling this to get the alternative endpoint back. - * However, subsequent calls will not actually map anything out. - * Note that we do not call this on a SRQ receive error or any - * type of receive error. */ - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, &newep); - - /* Move all the pending frags to the new endpoint as they can no - * longer go out the broken endpoint. OPTIMIZATION: Like the PML - * callback, this really only needs to be called once. However, it - * does not hurt anything to keep calling it. Subsequent calls will - * just have nothing to move over. */ - move_all_pending_frags(endpoint, newep); - - /* Now move the fragment that triggered the error over to the - * other endpoint */ - mca_btl_openib_move_frag(newep, frag); - + if(openib_btl) + openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL); } static int poll_device(mca_btl_openib_device_t* device, int count) @@ -3398,7 +3029,6 @@ static int poll_device(mca_btl_openib_device_t* device, int count) device->hp_cq_polls--; } - OPAL_OUTPUT((-1, "ibv_poll_cq found CQ event on %s", device->ib_dev->name)); handle_wc(device, cq, &wc); } @@ -3495,7 +3125,7 @@ static int progress_one_device(mca_btl_openib_device_t *device) ret = btl_openib_handle_incoming(btl, to_com_frag(frag)->endpoint, frag, size - sizeof(mca_btl_openib_footer_t)); if (ret != MPI_SUCCESS) { - btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); + btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL); return 0; } @@ -3514,26 +3144,6 @@ static int progress_one_device(mca_btl_openib_device_t *device) return count; } -void btl_dump_pending_lists() { - int i,j; - mca_btl_openib_endpoint_t* endpoint; - - for(i = 0; i < mca_btl_openib_component.devices_count; i++) { - mca_btl_openib_device_t *device = - opal_pointer_array_get_item(&mca_btl_openib_component.devices, i); - for (j = 0; j < 10; j++) { - endpoint = (mca_btl_openib_endpoint_t*) - opal_pointer_array_get_item(device->endpoints, j); - if (endpoint != NULL) { - opal_output(0, "pending_lazy_frags size = %d", - endpoint->pending_lazy_frags.opal_list_length); - } - } - } -} - - - /* * IB component progress. */ @@ -3566,7 +3176,7 @@ error: mca_btl_openib_module_t* openib_btl = mca_btl_openib_component.openib_btls[i]; if(openib_btl->device->got_fatal_event) { - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); + openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL); } } return count; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index 3e4ade364f..83e6952ed8 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -1018,7 +1018,7 @@ void *mca_btl_openib_endpoint_invoke_error(void *context) } /* Invoke the callback to the upper layer */ - btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL, NULL, NULL); + btl->error_cb(&(btl->super), MCA_BTL_ERROR_FLAGS_FATAL); /* Will likely never get here */ return NULL; diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.h b/ompi/mca/btl/openib/btl_openib_endpoint.h index 8fae6e4a1b..b2e46d1e28 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.h +++ b/ompi/mca/btl/openib/btl_openib_endpoint.h @@ -24,7 +24,6 @@ #ifndef MCA_BTL_IB_ENDPOINT_H #define MCA_BTL_IB_ENDPOINT_H -#include #include "opal/class/opal_list.h" #include "opal/event/event.h" #include "opal/util/output.h" @@ -36,7 +35,6 @@ #include #include "ompi/mca/btl/base/btl_base_error.h" #include "connect/base.h" -#include "ompi/mca/pml/ob1/pml_ob1_hdr.h" /* For debugging only */ BEGIN_C_DECLS @@ -422,7 +420,6 @@ static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep, mca_btl_base_descriptor_t *des, opal_list_t *pending_list) { int rc = OMPI_ERR_RESOURCE_BUSY; - int holdon = 1; switch(ep->endpoint_state) { case MCA_BTL_IB_CLOSED: @@ -440,13 +437,6 @@ static inline int check_endpoint_state(mca_btl_openib_endpoint_t *ep, /* fall through */ default: opal_list_append(pending_list, (opal_list_item_t *)des); -#if 0 - while (holdon) { - holdon++; - opal_output(0, "STARTING CONNECTION on %d - ATTACH DEBUGGER", getpid()); - sleep(5); - } -#endif break; case MCA_BTL_IB_FAILED: rc = OMPI_ERR_UNREACH; @@ -486,40 +476,6 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep, struct ibv_send_wr *sr_desc = &to_out_frag(frag)->sr_desc; struct ibv_send_wr *bad_wr; int qp = to_base_frag(frag)->base.order; - static int printstuff = 0; - - if (printstuff == 1) { - /* Some extra debugging tool. Should be removed eventually. This prints - * out the PML header that is in the newly created fragment. */ - mca_pml_ob1_common_hdr_t* hdr; - mca_pml_ob1_match_hdr_t* mhdr; - mca_pml_ob1_frag_hdr_t* fhdr; - uint8_t type; - - hdr = (mca_pml_ob1_common_hdr_t*)seg->seg_addr.pval; - type = hdr->hdr_type; - switch (type) { - case MCA_PML_OB1_HDR_TYPE_MATCH: - mhdr = (mca_pml_ob1_match_hdr_t*)hdr; - OPAL_OUTPUT((-1, "MATCH,frag=%d,tag=%d,src=%d,seq=%d", - frag, mhdr->hdr_tag, mhdr->hdr_src, mhdr->hdr_seq)); - break; - case MCA_PML_OB1_HDR_TYPE_FRAG: - fhdr = (mca_pml_ob1_frag_hdr_t*)hdr; - OPAL_OUTPUT((-1, "FRAG,frag=%lx,rreq=%lx,len=%d,offset=%d", - frag, fhdr->hdr_dst_req.pval, to_base_frag(frag)->segment.seg_len, - fhdr->hdr_frag_offset)); - break; - case MCA_PML_OB1_HDR_TYPE_RNDV: - OPAL_OUTPUT((-1, "RNDV,frag=%lx", frag)); - break; - case MCA_PML_OB1_HDR_TYPE_ACK: - OPAL_OUTPUT((-1, "ACK,frag=%lx", frag)); - break; - default: - OPAL_OUTPUT((-1, "OTHER,frag=%lx", frag)); - } - } sg->length = seg->seg_len + sizeof(mca_btl_openib_header_t) + (rdma ? sizeof(mca_btl_openib_footer_t) : 0) + frag->coalesced_length; @@ -571,18 +527,7 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep, #endif assert(sg->addr == (uint64_t)(uintptr_t)frag->hdr); - { - int retval; - retval = ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr); - if (0 == retval) { - OPAL_OUTPUT((-1, "SUCCESS: Posted %d frag on %s\n", frag, - ep->endpoint_btl->super.btl_ifname)); - } else { - OPAL_OUTPUT((0, "FAILURE: Did not posted %d frag on %s\n", frag, - ep->endpoint_btl->super.btl_ifname)); - } - return retval; - } + return ibv_post_send(ep->qps[qp].qp->lcl_qp, sr_desc, &bad_wr); } END_C_DECLS diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index 12ddeb7d65..817ad87ea4 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -166,7 +166,6 @@ do { \ } while (0) enum mca_btl_openib_frag_type_t { - MCA_BTL_OPENIB_FRAG_UNUSED, /* For debugging: Makes FRAG_RECV=1 */ MCA_BTL_OPENIB_FRAG_RECV, MCA_BTL_OPENIB_FRAG_RECV_USER, MCA_BTL_OPENIB_FRAG_SEND, diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index 3892d38fc8..70f6d9ab87 100644 --- a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -555,11 +555,6 @@ int btl_openib_register_mca_params(void) NULL, &mca_btl_openib_component.ipaddr_exclude, 0)); - CHECK(reg_int("enable_hca_failover", NULL, - "Enable failover from one HCA to another", 1, &ival, 0)); - mca_btl_openib_component.enable_hca_failover = (0 != ival); - - /* Register any MCA params for the connect pseudo-components */ if (OMPI_SUCCESS == ret) { ret = ompi_btl_openib_connect_base_register(); diff --git a/ompi/mca/btl/pcie/btl_pcie.c b/ompi/mca/btl/pcie/btl_pcie.c index b440c4a668..098f40167d 100644 --- a/ompi/mca/btl/pcie/btl_pcie.c +++ b/ompi/mca/btl/pcie/btl_pcie.c @@ -36,7 +36,6 @@ mca_btl_pcie_module_t mca_btl_pcie_module = { { &mca_btl_pcie_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* Threshold below which BTL should not fragment */ 0, /* max send fragment size */ diff --git a/ompi/mca/btl/portals/btl_portals.c b/ompi/mca/btl/portals/btl_portals.c index dc58148079..1ef111e54f 100644 --- a/ompi/mca/btl/portals/btl_portals.c +++ b/ompi/mca/btl/portals/btl_portals.c @@ -38,7 +38,6 @@ mca_btl_portals_module_t mca_btl_portals_module = { { &mca_btl_portals_component.super, - "unknown", /* NOTE: All these default values are set in component_open() */ diff --git a/ompi/mca/btl/sctp/btl_sctp.c b/ompi/mca/btl/sctp/btl_sctp.c index 3e143c7225..f0276e6373 100644 --- a/ompi/mca/btl/sctp/btl_sctp.c +++ b/ompi/mca/btl/sctp/btl_sctp.c @@ -34,7 +34,6 @@ mca_btl_sctp_module_t mca_btl_sctp_module = { { &mca_btl_sctp_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* min send fragment size */ 0, /* max send fragment size */ diff --git a/ompi/mca/btl/self/btl_self.c b/ompi/mca/btl/self/btl_self.c index af466df0c7..d721033ee6 100644 --- a/ompi/mca/btl/self/btl_self.c +++ b/ompi/mca/btl/self/btl_self.c @@ -36,7 +36,6 @@ mca_btl_base_module_t mca_btl_self = { &mca_btl_self_component.super, - "unknown", 0, /* btl_eager_limit */ 0, /* btl_rndv_eager_limit */ 0, /* btl_max_send_size */ diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index 83c25633a2..4c94b786de 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -56,7 +56,6 @@ mca_btl_sm_t mca_btl_sm = { { &mca_btl_sm_component.super, - "sm", 0, /* btl_eager_limit */ 0, /* btl_rndv_eager_limit */ 0, /* btl_max_send_size */ diff --git a/ompi/mca/btl/tcp/btl_tcp.c b/ompi/mca/btl/tcp/btl_tcp.c index a35e56990b..4f21bbd39a 100644 --- a/ompi/mca/btl/tcp/btl_tcp.c +++ b/ompi/mca/btl/tcp/btl_tcp.c @@ -36,7 +36,6 @@ mca_btl_tcp_module_t mca_btl_tcp_module = { { &mca_btl_tcp_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* min send fragment size */ 0, /* max send fragment size */ diff --git a/ompi/mca/btl/template/btl_template.c b/ompi/mca/btl/template/btl_template.c index b2ecc7f99b..07f2b29135 100644 --- a/ompi/mca/btl/template/btl_template.c +++ b/ompi/mca/btl/template/btl_template.c @@ -33,7 +33,6 @@ mca_btl_template_module_t mca_btl_template_module = { { &mca_btl_template_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* min send fragment size */ 0, /* max send fragment size */ diff --git a/ompi/mca/btl/udapl/btl_udapl.c b/ompi/mca/btl/udapl/btl_udapl.c index 37cd4c9e72..61d78c876c 100644 --- a/ompi/mca/btl/udapl/btl_udapl.c +++ b/ompi/mca/btl/udapl/btl_udapl.c @@ -50,7 +50,6 @@ static int mca_btl_udapl_assign_netmask(mca_btl_udapl_module_t* udapl_btl); mca_btl_udapl_module_t mca_btl_udapl_module = { { &mca_btl_udapl_component.super, - "unknown", 0, /* max size of first fragment */ 0, /* min send fragment size */ 0, /* max send fragment size */ diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index 3fdec6f349..6a283d671c 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -41,7 +41,6 @@ #include "ompi/mca/bml/base/base.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/grpcomm/grpcomm.h" -#include "orte/mca/notifier/notifier.h" #include "ompi/runtime/ompi_cr.h" @@ -71,8 +70,7 @@ mca_pml_ob1_t mca_pml_ob1 = { void mca_pml_ob1_error_handler( struct mca_btl_base_module_t* btl, - int32_t flags, ompi_proc_t* ompi_proc, - struct mca_btl_base_endpoint_t** btl_endpoint); + int32_t flags ); int mca_pml_ob1_enable(bool enable) { @@ -567,70 +565,10 @@ void mca_pml_ob1_process_pending_rdma(void) } -void mca_pml_ob1_error_handler( struct mca_btl_base_module_t* btl, - int32_t flags, ompi_proc_t *errproc, - struct mca_btl_base_endpoint_t** btl_endpoint) -{ - ompi_proc_t** procs; - size_t p, num_procs; - mca_bml_base_endpoint_t* ep; - - if (flags & MCA_BTL_ERROR_FLAGS_FATAL) { - orte_errmgr.abort(-1, NULL); - } - - /** - * Just remove the offending bml_btl corresponding to the btl with the - * error. Let the other errors remove the other ones. - */ - procs = ompi_proc_all(&num_procs); - if(NULL != procs) { - if (0 < mca_bml.bml_del_proc_btl(errproc, btl)) { - opal_output(0, "PML error handler: rank=%d mapping out btl:name=%s,if=%s to rank=%d on node=%s", - ORTE_PROC_MY_NAME->vpid, - btl->btl_component->btl_version.mca_component_name, - btl->btl_ifname, - errproc->proc_name.vpid, - errproc->proc_hostname); - - } - -#if 0 - for( p = 0; p < num_procs; p++ ) { - ompi_proc_t* proc = procs[p]; - ep = (mca_bml_base_endpoint_t*)proc->proc_bml; - opal_output(0, "p=%d, eager=%d, send=%d, rdma=%d, proc=%s", - p, - ep->btl_eager.arr_size, - ep->btl_send.arr_size, - ep->btl_rdma.arr_size, - proc->proc_hostname); - } -#endif - - ep = (mca_bml_base_endpoint_t*)errproc->proc_bml; - - if ((ep->btl_eager.arr_size == 0) && - (ep->btl_send.arr_size == 0) && - (ep->btl_rdma.arr_size == 0)) { - opal_output(0, "NO MORE INTERFACES - BYE BYE"); - orte_errmgr.abort(-1, NULL); - } - } - - /** - * Now return the first one in the list. Odds are there were only - * two to start with and now we are down to one. - */ - if (NULL != btl_endpoint) { - *btl_endpoint = errproc->proc_bml->btl_send.bml_btls[0].btl_endpoint; - } - - orte_notifier.log(ORTE_NOTIFIER_INFRA, ORTE_ERR_COMM_FAILURE, - "Mapping out btl component %s with interface %s", - btl->btl_component->btl_version.mca_component_name, - btl->btl_ifname); - +void mca_pml_ob1_error_handler( + struct mca_btl_base_module_t* btl, + int32_t flags) { + orte_errmgr.abort(-1, NULL); } #if OPAL_ENABLE_FT == 0 diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h index dbedb3e36a..f0d3a97cbc 100644 --- a/ompi/mca/pml/ob1/pml_ob1.h +++ b/ompi/mca/pml/ob1/pml_ob1.h @@ -81,7 +81,6 @@ struct mca_pml_ob1_t { typedef struct mca_pml_ob1_t mca_pml_ob1_t; extern mca_pml_ob1_t mca_pml_ob1; -extern int mca_pml_ob1_output; /* * PML interface functions. diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index 8ac48dc58c..c21680748c 100644 --- a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -47,7 +47,6 @@ static mca_pml_base_module_t* mca_pml_ob1_component_init( int* priority, bool enable_progress_threads, bool enable_mpi_threads ); static int mca_pml_ob1_component_fini(void); -int mca_pml_ob1_output = 0; mca_pml_base_component_2_0_0_t mca_pml_ob1_component = { @@ -94,11 +93,6 @@ static inline int mca_pml_ob1_param_register_int( static int mca_pml_ob1_component_open(void) { mca_allocator_base_component_t* allocator_component; - int value; - - value = mca_pml_ob1_param_register_int("verbose", 0); - mca_pml_ob1_output = opal_output_open(NULL); - opal_output_set_verbosity(mca_pml_ob1_output, value); mca_pml_ob1.free_list_num = mca_pml_ob1_param_register_int("free_list_num", 4); diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 2689b95b41..53414ab381 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -317,7 +317,6 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { - int triperr = 1; mca_btl_base_segment_t* segments = des->des_dst; mca_pml_ob1_hdr_t* hdr = (mca_pml_ob1_hdr_t*)segments->seg_addr.pval; mca_pml_ob1_recv_request_t* recvreq; @@ -327,15 +326,6 @@ void mca_pml_ob1_recv_frag_callback_frag(mca_btl_base_module_t* btl, } ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_FRAG); recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval; - if (recvreq->req_recv.req_base.req_ompi.req_state == OMPI_REQUEST_INVALID) { - while (triperr) { - triperr++; - opal_output(0, "ERROR DETECTED - ATTACH DEBUGGER"); - sleep(5); - } - return; - } - mca_pml_ob1_recv_request_progress_frag(recvreq,btl,segments,des->des_dst_cnt); return; @@ -602,9 +592,6 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl, /* get sequence number of next message that can be processed */ next_msg_seq_expected = (uint16_t)proc->expected_sequence; - opal_output_verbose(20, mca_pml_ob1_output, - "frag_msg_seq=%d, next_msg_seq_expected=%d", - frag_msg_seq, next_msg_seq_expected); if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected)) goto wrong_seq; @@ -684,5 +671,5 @@ wrong_seq: num_segments, NULL); OPAL_THREAD_UNLOCK(&comm->matching_lock); return OMPI_SUCCESS; - } + diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 89b1a8470d..0adb0a1b14 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -422,7 +422,6 @@ void mca_pml_ob1_recv_request_progress_frag( mca_pml_ob1_recv_request_t* recvreq 0, bytes_received ); bytes_received -= sizeof(mca_pml_ob1_frag_hdr_t); data_offset = hdr->hdr_frag.hdr_frag_offset; - OPAL_OUTPUT((-1, " Received SEND_FRAG, offset=%d", data_offset)); /* * Make user buffer accessable(defined) before unpacking. */ diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 45e45eafb6..515bce3871 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -36,7 +36,6 @@ OBJ_CLASS_INSTANCE(mca_pml_ob1_send_range_t, ompi_free_list_item_t, NULL, NULL); - void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl) { int i, s = opal_list_get_size(&mca_pml_ob1.send_pending); @@ -545,11 +544,9 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, switch(rc) { case OMPI_ERR_RESOURCE_BUSY: /* No more resources. Allow the upper level to queue the send */ - opal_output(0, "OMPI_ERR_RESOURCE_BUSY returned from mca_pml_ob1_send_request_start_copy"); rc = OMPI_ERR_OUT_OF_RESOURCE; break; default: - opal_output(0, "ERROR ERROR ERROR ERROR ERROR in start_copy"); mca_bml_base_free(bml_btl, des); break; }