diff --git a/opal/mca/btl/openib/Makefile.am b/opal/mca/btl/openib/Makefile.am index 612f8e96b5..aeb9da07e0 100644 --- a/opal/mca/btl/openib/Makefile.am +++ b/opal/mca/btl/openib/Makefile.am @@ -68,13 +68,6 @@ sources = \ connect/btl_openib_connect_empty.h \ connect/connect.h -# If we have failover support, build that file -if MCA_btl_openib_enable_failover -sources += \ - btl_openib_failover.c \ - btl_openib_failover.h -endif - # If we have rdmacm support, build that CPC if MCA_btl_openib_have_rdmacm sources += \ diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 0f021ce304..20585e62f5 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -1850,23 +1850,13 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl, assert(max_data == payload_size); } -#if BTL_OPENIB_FAILOVER_ENABLED - send_signaled = 1; -#else send_signaled = qp_need_signal(ep, qp, payload_size + header_size, do_rdma); -#endif ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled); if (!ib_rc) { if (0 == send_signaled) { MCA_BTL_IB_FRAG_RETURN(frag); } -#if BTL_OPENIB_FAILOVER_ENABLED - else { - /* Return up in case needed for failover */ - *descriptor = (struct mca_btl_base_descriptor_t *) frag; - } -#endif OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return OPAL_SUCCESS; diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index a8566a640c..33e4db113f 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -241,9 +241,6 @@ struct mca_btl_openib_component_t { opal_event_base_t *async_evbase; /**< Async event base */ bool use_async_event_thread; /**< Use the async event handler */ mca_btl_openib_srq_manager_t srq_manager; /**< Hash table for all BTL SRQs */ -#if BTL_OPENIB_FAILOVER_ENABLED - bool port_error_failover; /**< Report port errors to speed up failover */ -#endif /* declare as an int instead of btl_openib_device_type_t since there is no guarantee about the size of an enum. this value will be registered as an integer with the MCA variable system */ @@ -310,9 +307,6 @@ struct mca_btl_openib_component_t { int memory_registration_verbose_level; int memory_registration_verbose; int ignore_locality; -#if BTL_OPENIB_FAILOVER_ENABLED - int verbose_failover; -#endif #if OPAL_CUDA_SUPPORT bool cuda_async_send; bool cuda_async_recv; diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index cb741816ce..d6c119f6ec 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -84,9 +84,6 @@ #include "btl_openib_ini.h" #include "btl_openib_mca.h" #include "btl_openib_xrc.h" -#if BTL_OPENIB_FAILOVER_ENABLED -#include "btl_openib_failover.h" -#endif #include "btl_openib_async.h" #include "connect/base.h" #include "btl_openib_ip.h" @@ -504,12 +501,6 @@ static void btl_openib_control(mca_btl_base_module_t* btl, mca_btl_openib_endpoint_connected(ep); } break; -#if BTL_OPENIB_FAILOVER_ENABLED - case MCA_BTL_OPENIB_CONTROL_EP_BROKEN: - case MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR: - btl_openib_handle_failover_control_messages(ctl_hdr, ep); - break; -#endif default: BTL_ERROR(("Unknown message type received by BTL")); break; @@ -3452,20 +3443,8 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); -#if BTL_OPENIB_FAILOVER_ENABLED - /* The check for the callback flag is only needed when running - * with the failover case because there is a chance that a fragment - * generated from a sendi call (which does not set the flag) gets - * coalesced. In normal operation, this cannot happen as the sendi - * call will never queue up a fragment which could potentially become - * a coalesced fragment. It will revert to a regular send. */ - if (to_base_frag(i)->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { -#endif to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint, &to_base_frag(i)->base, OPAL_SUCCESS); -#if BTL_OPENIB_FAILOVER_ENABLED - } -#endif if( btl_ownership ) { mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base); } @@ -3590,14 +3569,9 @@ error: } } -#if BTL_OPENIB_FAILOVER_ENABLED - mca_btl_openib_handle_endpoint_error(openib_btl, des, qp, - remote_proc, endpoint); -#else if(openib_btl) openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL, (struct opal_proc_t*)remote_proc, NULL); -#endif } static int poll_device(mca_btl_openib_device_t* device, int count) @@ -3808,9 +3782,6 @@ error: if(openib_btl->device->got_port_event) { /* These are non-fatal so just ignore it. */ openib_btl->device->got_port_event = false; -#if BTL_OPENIB_FAILOVER_ENABLED - mca_btl_openib_handle_btl_error(openib_btl); -#endif } } return count; diff --git a/opal/mca/btl/openib/btl_openib_endpoint.h b/opal/mca/btl/openib/btl_openib_endpoint.h index d6846da957..7c92aa7fd2 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.h +++ b/opal/mca/btl/openib/btl_openib_endpoint.h @@ -584,13 +584,6 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep, BTL_OPENIB_FOOTER_HTON(*ftr); sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey; -#if BTL_OPENIB_FAILOVER_ENABLED - /* frag->ftr is unused on the sending fragment, so use it - * to indicate it is an eager fragment. A non-zero value - * indicates it is eager, and the value indicates the - * location in the eager RDMA array that it lives. */ - frag->ftr = (mca_btl_openib_footer_t*)(long)(1 + head); -#endif sr_desc->wr.rdma.remote_addr = ep->eager_rdma_remote.base.lval + head * openib_btl->eager_rdma_frag_size + diff --git a/opal/mca/btl/openib/btl_openib_failover.c b/opal/mca/btl/openib/btl_openib_failover.c deleted file mode 100644 index eba83a0f3a..0000000000 --- a/opal/mca/btl/openib/btl_openib_failover.c +++ /dev/null @@ -1,790 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * Functions specific to implementing failover support. - * - * This file is conditionally copiled into the BTL when one configures - * it in with --enable-openib-failover. When this file is compiled - * in, the multi-BTL configurations can handle errors. The - * requirement is that there needs to be more than one openib BTL in - * use so that all the traffic can move to the other BTL. This does - * not support failing over to a different BTL like TCP. - */ - -#include "opal_config.h" -#include "opal_stdint.h" - -#include "btl_openib.h" -#include "btl_openib_endpoint.h" -#include "btl_openib_proc.h" -#include "btl_openib_failover.h" - -static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep, - struct mca_btl_base_module_t* module, - bool errout); -static void mca_btl_openib_endpoint_notify(mca_btl_openib_endpoint_t *endpoint, - uint8_t type, int index); - -/* debug functions that are normally not needed */ -void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device); -void mca_btl_openib_dump_all_internal_queues(bool errout); -static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint); - -/** - * This function is called when we get an error on the completion - * event of a fragment. We check to see what type of fragment it is - * and act accordingly. In most cases, we first call up into the PML - * and have it map out this connection for any future communication. - * In addition, this function will possibly send some control messages - * over the other openib BTL. The first control message will tell the - * remote side to also map out this connection. The second control - * message makes sure the eager RDMA connection remains in a sane - * state. See that function for more details. - * @param openib_btl Pointer to BTL that had the error - * @param des Pointer to descriptor that had the error - * @param qp Queue pair that had the error - * @param remote_proc Pointer to process that had the error - * @param endpoint Pointer to endpoint that had the error - */ -void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, - mca_btl_base_descriptor_t *des, - int qp, - opal_proc_t* remote_proc, - mca_btl_openib_endpoint_t* endpoint) -{ - char *btlname = NULL; - int btl_ownership; - /* Since this BTL supports failover, it will call the PML error handler - * function with the NONFATAL flag. If the PML is running with failover - * support, then it will map out the endpoint for further communication - * and return control here. If the PML does not have failover support, - * it will abort the job and control will not return here. */ - - /* Note: At this point, what needs to be done is based on the type - * of openib fragment that got the error. Also note that in the wc - * struct, when wc->status != IBV_WC_SUCCESS, these are the only - * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num. - * This means that one cannot key off of the wc->opcode to see what - * operation was done. The important information needs to be read - * from the fragment. */ - - /* Cannot issue callback to SRQ errors because the shared receive - * queue is shared and is not specific to a connection. There is no - * way to figure out what type of message created the error because - * we need the information in the wc->imm_data field which does not - * exist when we have an error. So, nothing to do here but return. */ - if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && - !BTL_OPENIB_QP_TYPE_PP(qp)) { - opal_output_verbose(20, mca_btl_openib_component.verbose_failover, - "SRQ RECV type=%d", openib_frag_type(des)); - /* Need to think about returning any shared resources of the - * SRQ. For now, we do nothing as we rarely see an error on - * the SRQ. */ - return; - } - assert(NULL != remote_proc); - - /* Create a nice string to help with debug */ - if (NULL != openib_btl) { - asprintf(&btlname, "lid=%d:name=%s", - openib_btl->lid, openib_btl->device->ib_dev->name); - } - - /* The next set of errors are associated with an endpoint, but not - * with a PML descriptor. They are not associated with a PML - * descriptor because: - * A. It was a receive - * B. It was some type of openib specific control message. - * Therefore, just drop the fragments and call up into the PML to - * disable this endpoint for future communication. */ - if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && - (BTL_OPENIB_QP_TYPE_PP(qp))) || - (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) || - (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) { - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, - remote_proc, btlname); - /* Now that this connection has been mapped out at the PML layer, - * we change the state in the BTL layer. The change in the PML - * layer should prevent that we ever try to send on this BTL - * again. If we do, then this is an error case. */ - if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) { - endpoint->endpoint_state = MCA_BTL_IB_FAILED; - mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0); - error_out_all_pending_frags(endpoint, &openib_btl->super, true); - } - opal_output_verbose(60, mca_btl_openib_component.verbose_failover, - "MCA_BTL_OPENIG_FRAG=%d, " - "dropping since connection is broken (des=%lx)", - openib_frag_type(des), (long unsigned int) des); - if (NULL != btlname) free(btlname); - return; - } - - /* These are RDMA read type fragments. Just continue with processing */ - if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV_USER) { - OPAL_THREAD_ADD32(&endpoint->get_tokens, 1); - opal_output_verbose(20, mca_btl_openib_component.verbose_failover, - "OPENIB_FRAG_RECV_USER fragment, " - "btl=%lx, continue with callbacks", - (long unsigned int) &openib_btl->super); - } - - /* If we are at this point, we have completed a send, RDMA read or - * RDMA write. Call the PML callback function to map out this - * btl for further sending. We just call this every time we get an - * error even though it is not necessary. Subsequent calls with - * the same remote_proc argument will not actually map anything out. */ - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, - remote_proc, btlname); - if (NULL != btlname) free(btlname); - - /* Since we believe we have done a send, read or write, then the - * des_segments fields should have valid data. */ - assert(des->des_segments != NULL); - - /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then - * change the status. Since this connection was mapped out in the - * PML layer, no more attempts should be made to send on it. In - * addition, send a message to other end of the connection letting - * it know that this side is now broken. This is needed in the case - * of a spurious error which may not cause the remote side to detect - * the error. */ - if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) { - endpoint->endpoint_state = MCA_BTL_IB_FAILED; - mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0); - } - - /* Now, call the callback function associated with the fragment. - * In case the fragments were coalesced we need to pull them apart - * and call the callback function for each one. */ - if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { - opal_list_item_t *i; - while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { - btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint, - &to_base_frag(i)->base, OPAL_ERROR); - if( btl_ownership ) { - mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base); - } - } - } - - /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER - * or MCA_BTL_OPENIB_FRAG_RECV_USER. */ - btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR); - if( btl_ownership ) { - mca_btl_openib_free(&openib_btl->super, des); - } - - /* Here we send another control message to notify the remote side - * we had an error on a eager fragment. A non-zero value for the - * ftr variable indicates that this was an eager RDMA fragment. - * We need to do this in case the eager RDMA fragment after this - * one actually made it successfully. */ - if (0 != to_send_frag(des)->ftr) { - mca_btl_openib_endpoint_notify(endpoint, - MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR, - (long)to_send_frag(des)->ftr - 1); - } - - /* We know we have completed a send so return some resources even - * though connection is broken. With SRQ, the resources are shared - * so if we do not return the credits we may not be allowed to send - * anymore. */ - qp_put_wqe(endpoint, qp); - if((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) && !BTL_OPENIB_QP_TYPE_PP(qp)) { - OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); - } - - /* There are several queues associated with an endpoint that may - * have some unsent fragments sitting in them. Remove them and - * call the callback functions with an error so the PML can send - * them down a different path. This really only needs to be called - * once on an endpoint, but for now, just call it a bunch of times. - * The first time through will remove the unsent fragments so - * subsequent calls are no-ops. */ - if (endpoint) { - error_out_all_pending_frags(endpoint, &openib_btl->super, true); - } -} - -/** - * This functions allows an error to map out the entire BTL. First a - * call is made up to the PML to map out all connections from this BTL. - * Then a message is sent to all the endpoints connected to this BTL. - * This function is enabled by the btl_openib_port_error_failover - * MCA parameter. If that parameter is not set, then this function - * does not do anything. - * @param openib_btl Pointer to BTL that had the error - */ -void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) { - mca_btl_base_endpoint_t* endpoint; - int i; - - /* Check to see that the flag is set for the entire map out. */ - if(mca_btl_openib_component.port_error_failover) { - /* Since we are not specifying a specific connection to bring down, - * the PML layer will may out the entire BTL for future communication. */ - char *btlname = NULL; - asprintf(&btlname, "lid=%d:name=%s", - openib_btl->lid, openib_btl->device->ib_dev->name); - openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, - NULL, btlname); - if (NULL != btlname) free(btlname); - - /* Now send out messages to all endpoints that we are disconnecting. - * Only do this to endpoints that are connected. Otherwise, the - * remote side does not yet have the information on this endpoint. */ - for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) { - endpoint = (mca_btl_openib_endpoint_t*) - opal_pointer_array_get_item(openib_btl->device->endpoints, i); - if (NULL == endpoint) { - continue; - } - if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) { - mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0); - endpoint->endpoint_state = MCA_BTL_IB_FAILED; - error_out_all_pending_frags(endpoint, &openib_btl->super, true); - } - } - } -} - -/** - * This function gets called when a control message is received that - * is one of the following types: - * MCA_BTL_OPENIB_CONTROL_EP_BROKEN - * MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message - * Note that we are using the working connection to send information - * about the broken connection. That is why we have to look at the - * various information in the control message to figure out which - * endpoint is broken. It is (obviously) not the one the message was - * received on, because we would not have received the message in that - * case. In the case of the BROKEN message, that means the remote - * side is notifying us that it has brought down its half of the - * connection. Therefore, we need to bring out half down. This is - * done because it has been observed that there are cases where only - * one side of the connection actually sees the error. This means we - * can be left in a state where one side believes it has two BTLs, but - * the other side believes it only has one. This can cause problems. - * In the case of the EAGER_RDMA_ERROR, see elsewhere in the code what - * we are doing. - * @param ctl_hdr Pointer control header that was received - */ -void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr, - mca_btl_openib_endpoint_t* ep) -{ - mca_btl_openib_broken_connection_header_t *bc_hdr = - (mca_btl_openib_broken_connection_header_t*)ctl_hdr; - int i; - int found = false; - - if(ep->nbo) { - BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH((*bc_hdr)); - } - - opal_output_verbose(30, mca_btl_openib_component.verbose_failover, - "IB: Control message received from %d: lid=%d,subnet=0x%" PRIx64 "", - bc_hdr->vpid, bc_hdr->lid, bc_hdr->subnet_id); - - /* Now we walk through all the endpoints on all the BTLs to - * find out which one to map out. */ - for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { - mca_btl_openib_module_t* newbtl; - int j; - - newbtl = mca_btl_openib_component.openib_btls[i]; - /* Now, find the endpoint associated with it */ - for (j = 0; j < opal_pointer_array_get_size(newbtl->device->endpoints); j++) { - mca_btl_base_endpoint_t* newep; - newep = (mca_btl_openib_endpoint_t*) - opal_pointer_array_get_item(newbtl->device->endpoints, j); - if (NULL == newep) { - continue; - } - /* Now compare the LID, subnet ID, and the vpid we received - * from the remote side and try to match it to an endpoint. */ - if ((bc_hdr->lid == newep->rem_info.rem_lid) && - (bc_hdr->subnet_id == newep->rem_info.rem_subnet_id) && - (bc_hdr->vpid == newep->endpoint_proc->proc_opal->proc_name.vpid)) { - opal_output_verbose(30, mca_btl_openib_component.verbose_failover, - "IB: Control message received from %d: " - "found match: lid=%d," - "subnet=0x%" PRIx64 ",endpoint_state=%d", - newep->endpoint_proc->proc_opal->proc_name.vpid, - newep->rem_info.rem_lid, - newep->rem_info.rem_subnet_id, - newep->endpoint_state); - found = true; - /* At this point, we have found the endpoint. Now decode the - * message type and do the appropriate action. */ - if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) { - /* Now that we found a match, check the state of the - * endpoint to see it is already in a failed state. - * If not, then notify the upper layer and error out - * any pending fragments. */ - if (MCA_BTL_IB_FAILED == newep->endpoint_state) { - return; - } else { - char *btlname = NULL; - opal_proc_t* remote_proc = NULL; - - asprintf(&btlname, "lid=%d:name=%s", - newbtl->lid, newbtl->device->ib_dev->name); - - remote_proc = newep->endpoint_proc->proc_opal; - - opal_output_verbose(10, mca_btl_openib_component.verbose_failover, - "IB: Control message received from %d: " - "bringing down connection,lid=%d," - "subnet=0x%" PRIx64 ",endpoint_state=%d", - newep->endpoint_proc->proc_opal->proc_name.vpid, - newep->rem_info.rem_lid, - newep->rem_info.rem_subnet_id, - newep->endpoint_state); - newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, - remote_proc, btlname); - if (NULL != btlname) free(btlname); - - error_out_all_pending_frags(newep, &newbtl->super, true); - newep->endpoint_state = MCA_BTL_IB_FAILED; - return; - } - } else { /* MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message */ - /* If we are still pointing at the location where - * we detected an error on the remote side, then - * bump the index by one. */ - if (newep->eager_rdma_local.head == (uint16_t)bc_hdr->index) { - /* Adjust the local head by one just in case */ - MCA_BTL_OPENIB_RDMA_NEXT_INDEX(newep->eager_rdma_local.head); - opal_output_verbose(20, mca_btl_openib_component.verbose_failover, - "IB: rank=%d, control message (remote=%d), " - "moved local head by one (new=%d)", - OPAL_PROC_MY_NAME.vpid, - newep->endpoint_proc->proc_opal->proc_name.vpid, - newep->eager_rdma_local.head); - } else { - opal_output_verbose(20, mca_btl_openib_component.verbose_failover, - "IB: rank=%d, control message (remote=%d), " - "did not move local head by one (still=%d)", - OPAL_PROC_MY_NAME.vpid, - newep->endpoint_proc->proc_opal->proc_name.vpid, - newep->eager_rdma_local.head); - } - } - break; /* since we found the endpoint */ - } - } - } - if (false == found) { - opal_output_verbose(30, mca_btl_openib_component.verbose_failover, - "IB: Control message: no match found"); - } -} - -/** - * This function will find all the pending fragments on an endpoint - * and call the callback function with OPAL_ERROR. It walks through - * each qp with each priority and looks for both no_credits_pending_frags - * and no_wqe_pending_frags. It then looks for any pending_lazy_frags, - * pending_put_frags, and pending_get_frags. This function is only - * called when running with failover support enabled. Note that - * the errout parameter allows the function to also be used as a - * debugging tool to see if there are any fragments on any of the - * queues. - * @param ep Pointer to endpoint that had error - * @param module Pointer to module that had error - * @param errout Boolean which says whether to error them out or not - */ -static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep, - struct mca_btl_base_module_t* module, - bool errout) -{ - int qp, pri, len, total, btl_ownership; - - opal_list_item_t *item; - mca_btl_openib_com_frag_t* frag; - mca_btl_base_descriptor_t *des; - int verbose = 10; /* Verbosity level unless debugging */ - - /* If debugging, drop verbosity level so we can see the output - * regardless of the level the program was run with. */ - if (false == errout) { - verbose = 0; - } - - total = 0; - /* Traverse all QPs and all priorities and move to other endpoint */ - for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { - for (pri = 0; pri < 2; ++pri) { - /* All types of qp's have a no_wqe_pending_frags list */ - len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]); - if (len > 0) { - total += len; - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Checking for no_wqe_pending_frags qp=%d, " - "pri=%d, list size=%d", - qp, pri, len); - if (true == errout) { - while (NULL != (item = opal_list_remove_first(&ep->qps[qp]. - no_wqe_pending_frags[pri]))) { - frag = (mca_btl_openib_com_frag_t *) item; - des = (mca_btl_base_descriptor_t *)frag; - - /* Error out any coalesced frags if they exist */ - if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { - opal_list_item_t *i; - while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Found coalesced frag in no_wqe_pending_frags"); - btl_ownership = (to_base_frag(i)->base.des_flags & - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - to_base_frag(i)->base.des_cbfunc(module, ep, - &to_base_frag(i)->base, OPAL_ERROR); - if( btl_ownership ) { - mca_btl_openib_free(module, &to_base_frag(i)->base); - } - } - } - btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - des->des_cbfunc(module, ep, des, OPAL_ERROR); - if( btl_ownership ) { - mca_btl_openib_free(module, des); - } - } - } - } - if (BTL_OPENIB_QP_TYPE_PP(qp)) { - len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]); - if (len > 0) { - total += len; - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Checking for no_credits_pending_frags qp=%d, " - "pri=%d, list size=%d", - qp, pri, len); - if (true == errout) { - while (NULL != (item = opal_list_remove_first(&ep->qps[qp]. - no_credits_pending_frags[pri]))) { - frag = (mca_btl_openib_com_frag_t *) item; - des = (mca_btl_base_descriptor_t *)frag; - - /* Error out any coalesced frags if they exist */ - if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { - opal_list_item_t *i; - while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Found coalesced frag in " - "no_credits_pending_frags"); - btl_ownership = (to_base_frag(i)->base.des_flags & - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - to_base_frag(i)->base.des_cbfunc(module, ep, - &to_base_frag(i)->base, OPAL_ERROR); - if( btl_ownership ) { - mca_btl_openib_free(module, &to_base_frag(i)->base); - } - } - } - btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - des->des_cbfunc(module, ep, des, OPAL_ERROR); - if( btl_ownership ) { - mca_btl_openib_free(module, des); - } - } - } - } - - } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) { - len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]); - if (len > 0) { - total += len; - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Checking for srq pending_frags qp=%d, pri=%d, " - "list size=%d", - qp, pri, len); - if (true == errout) { - while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp]. - u.srq_qp.pending_frags[pri]))) { - frag = (mca_btl_openib_com_frag_t *) item; - des = (mca_btl_base_descriptor_t *)frag; - - /* Error out any coalesced frags if they exist */ - if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { - opal_list_item_t *i; - while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Found coalesced frag in SRQ pending_frags"); - btl_ownership = (to_base_frag(i)->base.des_flags & - MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - to_base_frag(i)->base.des_cbfunc(module, ep, - &to_base_frag(i)->base, OPAL_ERROR); - if( btl_ownership ) { - mca_btl_openib_free(module, &to_base_frag(i)->base); - } - } - } - btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - des->des_cbfunc(module, ep, des, OPAL_ERROR); - if( btl_ownership ) { - mca_btl_openib_free(module, des); - } - } - } - } - } - } - } - - /* Check for any frags from a connection that was never made. Not sure if this - * can actually happen. */ - len = opal_list_get_size(&ep->pending_lazy_frags); - - if (len > 0) { - total += len; - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Checking for pending_lazy_frags, list size=%d", len); - if (true == errout) { - while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) { - frag = (mca_btl_openib_com_frag_t *) item; - des = (mca_btl_base_descriptor_t *)frag; - des->des_cbfunc(module, ep, des, OPAL_ERROR); - } - } - } - - len = opal_list_get_size(&ep->pending_put_frags); - if (len > 0) { - total += len; - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Checking for pending_put_frags, list size=%d", len); - if (true == errout) { - while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) { - frag = (mca_btl_openib_com_frag_t *) item; - des = (mca_btl_base_descriptor_t *)frag; - des->des_cbfunc(module, ep, des, OPAL_ERROR); - } - } - } - - len = opal_list_get_size(&ep->pending_get_frags); - if (len > 0) { - total += len; - opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, - "IB: Checking for pending_get_frags, list size=%d", len); - if (true == errout) { - while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) { - frag = (mca_btl_openib_com_frag_t *) item; - des = (mca_btl_base_descriptor_t *)frag; - des->des_cbfunc(module, ep, des, OPAL_ERROR); - } - } - } - - opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover, - "IB: Finished checking for pending_frags, total moved=%d", - total); -} - -/* local callback function for completion of a failover control message */ -static void mca_btl_openib_endpoint_notify_cb(mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* descriptor, - int status) -{ - MCA_BTL_IB_FRAG_RETURN(descriptor); -} - -/** - * This function is used to send a message to the remote side - * indicating the endpoint is broken and telling the remote side to - * brings its endpoint down as well. This is needed because there are - * cases where only one side of the connection determines that the - * there was a problem. - * @param endpoint Pointer to endpoint with error - * @param type Type of message to be sent, can be one of two types - * @param index When sending RDMA error message, index is non zero - */ -static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index) -{ - mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; - mca_btl_openib_module_t* newbtl = NULL; - bool found = false; - mca_btl_openib_broken_connection_header_t *bc_hdr; - mca_btl_openib_send_control_frag_t* frag; - mca_btl_base_endpoint_t* newep; - int i, rc; - opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal; - - /* First, find a different BTL than this one that got the - * error to send the message over. */ - for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { - if (mca_btl_openib_component.openib_btls[i] != openib_btl) { - newbtl = mca_btl_openib_component.openib_btls[i]; - break; - } - } - if (NULL == newbtl) { - opal_output_verbose(20, mca_btl_openib_component.verbose_failover, - "IB: Endpoint Notify: No BTL found"); - /* If we cannot find one, then just return. */ - return; - } - - /* Now, find the endpoint associated with it. The device - * associated with the BTL has the list of all the - * endpoints. */ - for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) { - newep = (mca_btl_openib_endpoint_t*) - opal_pointer_array_get_item(newbtl->device->endpoints, i); - if (NULL == newep) { - continue; - } - if (newep->endpoint_proc->proc_opal == remote_proc) { - found = true; - break; - } - } - if (false == found) { - opal_output_verbose(20, mca_btl_openib_component.verbose_failover, - "IB: Endpoint Notify: No endpoint found"); - /* If we cannot find a match, then just return. */ - return; - } - - frag = alloc_control_frag(newbtl); - if(NULL == frag) { - opal_output_verbose(20, mca_btl_openib_component.verbose_failover, - "IB: Endpoint Notify: No frag space"); - /* If no frag available, then just return. */ - return; - } - - to_base_frag(frag)->base.des_cbfunc = - mca_btl_openib_endpoint_notify_cb; - to_base_frag(frag)->base.des_cbdata = NULL; - to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp; - to_base_frag(frag)->segment.seg_len = - sizeof(mca_btl_openib_broken_connection_header_t); - to_com_frag(frag)->endpoint = newep; - - frag->hdr->tag = MCA_BTL_TAG_IB; - bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(frag)->segment.seg_addr.pval; - bc_hdr->control.type = type; - bc_hdr->lid = endpoint->endpoint_btl->port_info.lid; - bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id; - bc_hdr->vpid = OPAL_PROC_MY_NAME.vpid; - bc_hdr->index = index; - - if(newep->nbo) { - BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr)); - } - rc = mca_btl_openib_endpoint_send(newep, frag); - if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) { - return; - } - - MCA_BTL_IB_FRAG_RETURN(frag); - BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(errno))); - return; -} - -/* - * Function used for debugging problems in eager rdma. - */ -static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint) { - mca_btl_openib_recv_frag_t *headers_buf = endpoint->eager_rdma_local.frags; - mca_btl_openib_recv_frag_t * frag; - mca_btl_openib_control_header_t* chdr; - int i, size; - - opal_output(0, "Head = %d", endpoint->eager_rdma_local.head); - - for (i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) { - frag = &headers_buf[i]; - size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr); - - frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) - - size + sizeof(mca_btl_openib_footer_t)); - to_base_frag(frag)->segment.seg_addr.pval = - ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); - - chdr = to_base_frag(frag)->segment.seg_addr.pval; - if ((MCA_BTL_TAG_IB == frag->hdr->tag) && - (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type)) { - opal_output(0, "tag[%d] is credit message", i); - } else { - opal_output(0, "frag[%d] size=%d,tag=%d,ftr->u.buf=%d", i, size, frag->hdr->tag, - frag->ftr->u.buf[3]); - } - } -} - -/* - * Function used for debugging problems in eager rdma. - */ -void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device) { - int i, c; - mca_btl_openib_endpoint_t* endpoint; - - c = device->eager_rdma_buffers_count; - opal_output(0, "rank=%d, device=%s", OPAL_PROC_MY_NAME.vpid, device->ib_dev->name); - - for(i = 0; i < c; i++) { - endpoint = device->eager_rdma_buffers[i]; - - if(!endpoint) - continue; - - dump_local_rdma_frags(endpoint); - } -} - -/** - * This function is a debugging tool. If you notify a hang, you can - * call this function from a debugger and see if there are any - * messages stuck in any of the queues. If you call it with - * errout=true, then it will error them out. Otherwise, it will - * just print out the size of the queues with data in them. - */ -void mca_btl_openib_dump_all_internal_queues(bool errout) { - int i, j, num_eps; - mca_btl_openib_module_t* btl; - int total; - mca_btl_base_endpoint_t* ep; - struct mca_btl_base_module_t* module; - - for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { - btl = mca_btl_openib_component.openib_btls[i]; - module = &btl->super; - num_eps = opal_pointer_array_get_size(btl->device->endpoints); - - /* Now, find the endpoint associated with it */ - for (j = 0; j < num_eps; j++) { - ep = (mca_btl_openib_endpoint_t*) - opal_pointer_array_get_item(btl->device->endpoints, j); - if (NULL == ep) { - continue; - } - - total = 0; - error_out_all_pending_frags(ep, module, errout); - } - } -} - diff --git a/opal/mca/btl/openib/btl_openib_failover.h b/opal/mca/btl/openib/btl_openib_failover.h deleted file mode 100644 index afb77a42b9..0000000000 --- a/opal/mca/btl/openib/btl_openib_failover.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * Functions called by BTL to handle error events - */ - -#ifndef MCA_BTL_IB_FAILOVER_H -#define MCA_BTL_IB_FAILOVER_H - -BEGIN_C_DECLS - -void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, - mca_btl_base_descriptor_t *des, - int qp, - opal_proc_t* remote_proc, - mca_btl_openib_endpoint_t* endpoint); -void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl); -void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr, - mca_btl_openib_endpoint_t* ep); - -END_C_DECLS - -#endif diff --git a/opal/mca/btl/openib/btl_openib_frag.h b/opal/mca/btl/openib/btl_openib_frag.h index 7ca3714242..d140fe4a8a 100644 --- a/opal/mca/btl/openib/btl_openib_frag.h +++ b/opal/mca/btl/openib/btl_openib_frag.h @@ -190,10 +190,6 @@ typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t; #define MCA_BTL_OPENIB_CONTROL_RDMA 1 #define MCA_BTL_OPENIB_CONTROL_COALESCED 2 #define MCA_BTL_OPENIB_CONTROL_CTS 3 -#if BTL_OPENIB_FAILOVER_ENABLED -#define MCA_BTL_OPENIB_CONTROL_EP_BROKEN 4 -#define MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR 5 -#endif struct mca_btl_openib_control_header_t { uint8_t type; @@ -243,32 +239,6 @@ do { \ (h).rdma_credits = ntohs((h).rdma_credits); \ } while (0) -#if BTL_OPENIB_FAILOVER_ENABLED -struct mca_btl_openib_broken_connection_header_t { - mca_btl_openib_control_header_t control; - uint32_t lid; - uint64_t subnet_id; - uint32_t vpid; - uint32_t index; /* for eager RDMA only */ -}; -typedef struct mca_btl_openib_broken_connection_header_t mca_btl_openib_broken_connection_header_t; - -#define BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON(h) \ - do { \ - (h).lid = htonl((h).lid); \ - (h).subnet_id = hton64((h).subnet_id); \ - (h).vpid = htonl((h).vpid); \ - (h).index = htonl((h).index); \ - } while (0) - -#define BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH(h) \ - do { \ - (h).lid = ntohl((h).lid); \ - (h).subnet_id = ntoh64((h).subnet_id); \ - (h).vpid = ntohl((h).vpid); \ - (h).index = ntohl((h).index); \ - } while (0) -#endif enum mca_btl_openib_frag_type_t { MCA_BTL_OPENIB_FRAG_RECV, MCA_BTL_OPENIB_FRAG_RECV_USER, diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 700ccb2763..9327ca347f 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -89,11 +89,6 @@ static mca_base_var_enum_value_t device_type_values[] = { static int btl_openib_cq_size; static bool btl_openib_have_fork_support = OPAL_HAVE_IBV_FORK_INIT; -#if BTL_OPENIB_FAILOVER_ENABLED -static int btl_openib_verbose_failover; -static bool btl_openib_failover_enabled = true; -#endif - /* * utility routine for string parameter registration */ @@ -473,30 +468,6 @@ int btl_openib_register_mca_params(void) "If nonzero, use the thread that will handle InfiniBand asynchronous events", true, &mca_btl_openib_component.use_async_event_thread)); -#if BTL_OPENIB_FAILOVER_ENABLED - /* failover specific output */ - CHECK(reg_int("verbose_failover", NULL, - "Output some verbose OpenIB BTL failover information " - "(0 = no output, nonzero = output)", 0, &btl_openib_verbose_failover, 0)); - mca_btl_openib_component.verbose_failover = opal_output_open(NULL); - opal_output_set_verbosity(mca_btl_openib_component.verbose_failover, btl_openib_verbose_failover); - - CHECK(reg_bool("port_error_failover", NULL, - "If nonzero, asynchronous port errors will trigger failover", - 0, &mca_btl_openib_component.port_error_failover)); - - /* Make non writeable parameter that indicates failover is configured in. */ - tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, - "failover_enabled", - "openib failover is configured: run with bfo PML to support failover between openib BTLs", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, - MCA_BASE_VAR_FLAG_DEFAULT_ONLY, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_CONSTANT, - &btl_openib_failover_enabled); - if (0 > tmp) ret = tmp; -#endif - CHECK(reg_bool("enable_srq_resize", NULL, "Enable/Disable on demand SRQ resize. " "(0 = without resizing, nonzero = with resizing)", 1, @@ -570,10 +541,6 @@ int btl_openib_register_mca_params(void) mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA | MCA_BTL_FLAGS_SEND; -#if BTL_OPENIB_FAILOVER_ENABLED - mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT; -#endif - #if HAVE_DECL_IBV_ATOMIC_HCA mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS; mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP; diff --git a/opal/mca/btl/openib/configure.m4 b/opal/mca/btl/openib/configure.m4 index c11dee47cb..d91c8edd78 100644 --- a/opal/mca/btl/openib/configure.m4 +++ b/opal/mca/btl/openib/configure.m4 @@ -104,22 +104,6 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[ AC_MSG_CHECKING([which openib btl cpcs will be built]) AC_MSG_RESULT([$cpcs])]) - # Enable openib device failover. It is disabled by default. - AC_MSG_CHECKING([whether openib failover is enabled]) - AC_ARG_ENABLE([btl-openib-failover], - [AC_HELP_STRING([--enable-btl-openib-failover], - [enable openib BTL failover (default: disabled)])]) - if test "$enable_btl_openib_failover" = "yes"; then - AC_MSG_RESULT([yes]) - btl_openib_failover_enabled=1 - else - AC_MSG_RESULT([no]) - btl_openib_failover_enabled=0 - fi - AC_DEFINE_UNQUOTED([BTL_OPENIB_FAILOVER_ENABLED], [$btl_openib_failover_enabled], - [enable openib BTL failover]) - AM_CONDITIONAL([MCA_btl_openib_enable_failover], [test "x$btl_openib_failover_enabled" = "x1"]) - # make sure that CUDA-aware checks have been done AC_REQUIRE([OPAL_CHECK_CUDA])