Merge pull request #2336 from larrystevenwise/master
openib btl: remove BTL_OPENIB_FAILOVER_ENABLED code
Этот коммит содержится в:
Коммит
27fcd2d6ba
@ -68,13 +68,6 @@ sources = \
|
|||||||
connect/btl_openib_connect_empty.h \
|
connect/btl_openib_connect_empty.h \
|
||||||
connect/connect.h
|
connect/connect.h
|
||||||
|
|
||||||
# If we have failover support, build that file
|
|
||||||
if MCA_btl_openib_enable_failover
|
|
||||||
sources += \
|
|
||||||
btl_openib_failover.c \
|
|
||||||
btl_openib_failover.h
|
|
||||||
endif
|
|
||||||
|
|
||||||
# If we have rdmacm support, build that CPC
|
# If we have rdmacm support, build that CPC
|
||||||
if MCA_btl_openib_have_rdmacm
|
if MCA_btl_openib_have_rdmacm
|
||||||
sources += \
|
sources += \
|
||||||
|
@ -1850,23 +1850,13 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
|||||||
assert(max_data == payload_size);
|
assert(max_data == payload_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
send_signaled = 1;
|
|
||||||
#else
|
|
||||||
send_signaled = qp_need_signal(ep, qp, payload_size + header_size, do_rdma);
|
send_signaled = qp_need_signal(ep, qp, payload_size + header_size, do_rdma);
|
||||||
#endif
|
|
||||||
ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled);
|
ib_rc = post_send(ep, to_send_frag(item), do_rdma, send_signaled);
|
||||||
|
|
||||||
if (!ib_rc) {
|
if (!ib_rc) {
|
||||||
if (0 == send_signaled) {
|
if (0 == send_signaled) {
|
||||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
MCA_BTL_IB_FRAG_RETURN(frag);
|
||||||
}
|
}
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
else {
|
|
||||||
/* Return up in case needed for failover */
|
|
||||||
*descriptor = (struct mca_btl_base_descriptor_t *) frag;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
|
@ -241,9 +241,6 @@ struct mca_btl_openib_component_t {
|
|||||||
opal_event_base_t *async_evbase; /**< Async event base */
|
opal_event_base_t *async_evbase; /**< Async event base */
|
||||||
bool use_async_event_thread; /**< Use the async event handler */
|
bool use_async_event_thread; /**< Use the async event handler */
|
||||||
mca_btl_openib_srq_manager_t srq_manager; /**< Hash table for all BTL SRQs */
|
mca_btl_openib_srq_manager_t srq_manager; /**< Hash table for all BTL SRQs */
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
bool port_error_failover; /**< Report port errors to speed up failover */
|
|
||||||
#endif
|
|
||||||
/* declare as an int instead of btl_openib_device_type_t since there is no
|
/* declare as an int instead of btl_openib_device_type_t since there is no
|
||||||
guarantee about the size of an enum. this value will be registered as an
|
guarantee about the size of an enum. this value will be registered as an
|
||||||
integer with the MCA variable system */
|
integer with the MCA variable system */
|
||||||
@ -310,9 +307,6 @@ struct mca_btl_openib_component_t {
|
|||||||
int memory_registration_verbose_level;
|
int memory_registration_verbose_level;
|
||||||
int memory_registration_verbose;
|
int memory_registration_verbose;
|
||||||
int ignore_locality;
|
int ignore_locality;
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
int verbose_failover;
|
|
||||||
#endif
|
|
||||||
#if OPAL_CUDA_SUPPORT
|
#if OPAL_CUDA_SUPPORT
|
||||||
bool cuda_async_send;
|
bool cuda_async_send;
|
||||||
bool cuda_async_recv;
|
bool cuda_async_recv;
|
||||||
|
@ -84,9 +84,6 @@
|
|||||||
#include "btl_openib_ini.h"
|
#include "btl_openib_ini.h"
|
||||||
#include "btl_openib_mca.h"
|
#include "btl_openib_mca.h"
|
||||||
#include "btl_openib_xrc.h"
|
#include "btl_openib_xrc.h"
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
#include "btl_openib_failover.h"
|
|
||||||
#endif
|
|
||||||
#include "btl_openib_async.h"
|
#include "btl_openib_async.h"
|
||||||
#include "connect/base.h"
|
#include "connect/base.h"
|
||||||
#include "btl_openib_ip.h"
|
#include "btl_openib_ip.h"
|
||||||
@ -504,12 +501,6 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
|
|||||||
mca_btl_openib_endpoint_connected(ep);
|
mca_btl_openib_endpoint_connected(ep);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
case MCA_BTL_OPENIB_CONTROL_EP_BROKEN:
|
|
||||||
case MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR:
|
|
||||||
btl_openib_handle_failover_control_messages(ctl_hdr, ep);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
default:
|
default:
|
||||||
BTL_ERROR(("Unknown message type received by BTL"));
|
BTL_ERROR(("Unknown message type received by BTL"));
|
||||||
break;
|
break;
|
||||||
@ -3452,20 +3443,8 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq,
|
|||||||
opal_list_item_t *i;
|
opal_list_item_t *i;
|
||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
/* The check for the callback flag is only needed when running
|
|
||||||
* with the failover case because there is a chance that a fragment
|
|
||||||
* generated from a sendi call (which does not set the flag) gets
|
|
||||||
* coalesced. In normal operation, this cannot happen as the sendi
|
|
||||||
* call will never queue up a fragment which could potentially become
|
|
||||||
* a coalesced fragment. It will revert to a regular send. */
|
|
||||||
if (to_base_frag(i)->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
|
|
||||||
#endif
|
|
||||||
to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
|
to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
|
||||||
&to_base_frag(i)->base, OPAL_SUCCESS);
|
&to_base_frag(i)->base, OPAL_SUCCESS);
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if( btl_ownership ) {
|
if( btl_ownership ) {
|
||||||
mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
|
mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
|
||||||
}
|
}
|
||||||
@ -3590,14 +3569,9 @@ error:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
mca_btl_openib_handle_endpoint_error(openib_btl, des, qp,
|
|
||||||
remote_proc, endpoint);
|
|
||||||
#else
|
|
||||||
if(openib_btl)
|
if(openib_btl)
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
|
||||||
(struct opal_proc_t*)remote_proc, NULL);
|
(struct opal_proc_t*)remote_proc, NULL);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int poll_device(mca_btl_openib_device_t* device, int count)
|
static int poll_device(mca_btl_openib_device_t* device, int count)
|
||||||
@ -3808,9 +3782,6 @@ error:
|
|||||||
if(openib_btl->device->got_port_event) {
|
if(openib_btl->device->got_port_event) {
|
||||||
/* These are non-fatal so just ignore it. */
|
/* These are non-fatal so just ignore it. */
|
||||||
openib_btl->device->got_port_event = false;
|
openib_btl->device->got_port_event = false;
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
mca_btl_openib_handle_btl_error(openib_btl);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return count;
|
return count;
|
||||||
|
@ -584,13 +584,6 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
|
|||||||
BTL_OPENIB_FOOTER_HTON(*ftr);
|
BTL_OPENIB_FOOTER_HTON(*ftr);
|
||||||
|
|
||||||
sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey;
|
sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey;
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
/* frag->ftr is unused on the sending fragment, so use it
|
|
||||||
* to indicate it is an eager fragment. A non-zero value
|
|
||||||
* indicates it is eager, and the value indicates the
|
|
||||||
* location in the eager RDMA array that it lives. */
|
|
||||||
frag->ftr = (mca_btl_openib_footer_t*)(long)(1 + head);
|
|
||||||
#endif
|
|
||||||
sr_desc->wr.rdma.remote_addr =
|
sr_desc->wr.rdma.remote_addr =
|
||||||
ep->eager_rdma_remote.base.lval +
|
ep->eager_rdma_remote.base.lval +
|
||||||
head * openib_btl->eager_rdma_frag_size +
|
head * openib_btl->eager_rdma_frag_size +
|
||||||
|
@ -1,790 +0,0 @@
|
|||||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
||||||
/*
|
|
||||||
* Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved.
|
|
||||||
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
|
|
||||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
|
||||||
* reserved.
|
|
||||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
|
||||||
* Copyright (c) 2014 Research Organization for Information Science
|
|
||||||
* and Technology (RIST). All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @file
|
|
||||||
* Functions specific to implementing failover support.
|
|
||||||
*
|
|
||||||
* This file is conditionally compiled into the BTL when one configures
|
|
||||||
* it in with --enable-openib-failover. When this file is compiled
|
|
||||||
* in, the multi-BTL configurations can handle errors. The
|
|
||||||
* requirement is that there needs to be more than one openib BTL in
|
|
||||||
* use so that all the traffic can move to the other BTL. This does
|
|
||||||
* not support failing over to a different BTL like TCP.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "opal_config.h"
|
|
||||||
#include "opal_stdint.h"
|
|
||||||
|
|
||||||
#include "btl_openib.h"
|
|
||||||
#include "btl_openib_endpoint.h"
|
|
||||||
#include "btl_openib_proc.h"
|
|
||||||
#include "btl_openib_failover.h"
|
|
||||||
|
|
||||||
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|
||||||
struct mca_btl_base_module_t* module,
|
|
||||||
bool errout);
|
|
||||||
static void mca_btl_openib_endpoint_notify(mca_btl_openib_endpoint_t *endpoint,
|
|
||||||
uint8_t type, int index);
|
|
||||||
|
|
||||||
/* debug functions that are normally not needed */
|
|
||||||
void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device);
|
|
||||||
void mca_btl_openib_dump_all_internal_queues(bool errout);
|
|
||||||
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function is called when we get an error on the completion
|
|
||||||
* event of a fragment. We check to see what type of fragment it is
|
|
||||||
* and act accordingly. In most cases, we first call up into the PML
|
|
||||||
* and have it map out this connection for any future communication.
|
|
||||||
* In addition, this function will possibly send some control messages
|
|
||||||
* over the other openib BTL. The first control message will tell the
|
|
||||||
* remote side to also map out this connection. The second control
|
|
||||||
* message makes sure the eager RDMA connection remains in a sane
|
|
||||||
* state. See that function for more details.
|
|
||||||
* @param openib_btl Pointer to BTL that had the error
|
|
||||||
* @param des Pointer to descriptor that had the error
|
|
||||||
* @param qp Queue pair that had the error
|
|
||||||
* @param remote_proc Pointer to process that had the error
|
|
||||||
* @param endpoint Pointer to endpoint that had the error
|
|
||||||
*/
|
|
||||||
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|
||||||
mca_btl_base_descriptor_t *des,
|
|
||||||
int qp,
|
|
||||||
opal_proc_t* remote_proc,
|
|
||||||
mca_btl_openib_endpoint_t* endpoint)
|
|
||||||
{
|
|
||||||
char *btlname = NULL;
|
|
||||||
int btl_ownership;
|
|
||||||
/* Since this BTL supports failover, it will call the PML error handler
|
|
||||||
* function with the NONFATAL flag. If the PML is running with failover
|
|
||||||
* support, then it will map out the endpoint for further communication
|
|
||||||
* and return control here. If the PML does not have failover support,
|
|
||||||
* it will abort the job and control will not return here. */
|
|
||||||
|
|
||||||
/* Note: At this point, what needs to be done is based on the type
|
|
||||||
* of openib fragment that got the error. Also note that in the wc
|
|
||||||
* struct, when wc->status != IBV_WC_SUCCESS, these are the only
|
|
||||||
* valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
|
|
||||||
* This means that one cannot key off of the wc->opcode to see what
|
|
||||||
* operation was done. The important information needs to be read
|
|
||||||
* from the fragment. */
|
|
||||||
|
|
||||||
/* Cannot issue callback to SRQ errors because the shared receive
|
|
||||||
* queue is shared and is not specific to a connection. There is no
|
|
||||||
* way to figure out what type of message created the error because
|
|
||||||
* we need the information in the wc->imm_data field which does not
|
|
||||||
* exist when we have an error. So, nothing to do here but return. */
|
|
||||||
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
|
||||||
!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
|
||||||
"SRQ RECV type=%d", openib_frag_type(des));
|
|
||||||
/* Need to think about returning any shared resources of the
|
|
||||||
* SRQ. For now, we do nothing as we rarely see an error on
|
|
||||||
* the SRQ. */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
assert(NULL != remote_proc);
|
|
||||||
|
|
||||||
/* Create a nice string to help with debug */
|
|
||||||
if (NULL != openib_btl) {
|
|
||||||
asprintf(&btlname, "lid=%d:name=%s",
|
|
||||||
openib_btl->lid, openib_btl->device->ib_dev->name);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* The next set of errors are associated with an endpoint, but not
|
|
||||||
* with a PML descriptor. They are not associated with a PML
|
|
||||||
* descriptor because:
|
|
||||||
* A. It was a receive
|
|
||||||
* B. It was some type of openib specific control message.
|
|
||||||
* Therefore, just drop the fragments and call up into the PML to
|
|
||||||
* disable this endpoint for future communication. */
|
|
||||||
if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
|
||||||
(BTL_OPENIB_QP_TYPE_PP(qp))) ||
|
|
||||||
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) ||
|
|
||||||
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) {
|
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
|
||||||
remote_proc, btlname);
|
|
||||||
/* Now that this connection has been mapped out at the PML layer,
|
|
||||||
* we change the state in the BTL layer. The change in the PML
|
|
||||||
* layer should prevent that we ever try to send on this BTL
|
|
||||||
* again. If we do, then this is an error case. */
|
|
||||||
if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
|
|
||||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
|
||||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
|
||||||
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
|
||||||
}
|
|
||||||
opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
|
|
||||||
"MCA_BTL_OPENIG_FRAG=%d, "
|
|
||||||
"dropping since connection is broken (des=%lx)",
|
|
||||||
openib_frag_type(des), (long unsigned int) des);
|
|
||||||
if (NULL != btlname) free(btlname);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* These are RDMA read type fragments. Just continue with processing */
|
|
||||||
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV_USER) {
|
|
||||||
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
|
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
|
||||||
"OPENIB_FRAG_RECV_USER fragment, "
|
|
||||||
"btl=%lx, continue with callbacks",
|
|
||||||
(long unsigned int) &openib_btl->super);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If we are at this point, we have completed a send, RDMA read or
|
|
||||||
* RDMA write. Call the PML callback function to map out this
|
|
||||||
* btl for further sending. We just call this every time we get an
|
|
||||||
* error even though it is not necessary. Subsequent calls with
|
|
||||||
* the same remote_proc argument will not actually map anything out. */
|
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
|
||||||
remote_proc, btlname);
|
|
||||||
if (NULL != btlname) free(btlname);
|
|
||||||
|
|
||||||
/* Since we believe we have done a send, read or write, then the
|
|
||||||
* des_segments fields should have valid data. */
|
|
||||||
assert(des->des_segments != NULL);
|
|
||||||
|
|
||||||
/* If the endpoint is not yet in the MCA_BTL_IB_FAILED state, then
|
|
||||||
* change the status. Since this connection was mapped out in the
|
|
||||||
* PML layer, no more attempts should be made to send on it. In
|
|
||||||
* addition, send a message to other end of the connection letting
|
|
||||||
* it know that this side is now broken. This is needed in the case
|
|
||||||
* of a spurious error which may not cause the remote side to detect
|
|
||||||
* the error. */
|
|
||||||
if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
|
|
||||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
|
||||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Now, call the callback function associated with the fragment.
|
|
||||||
* In case the fragments were coalesced we need to pull them apart
|
|
||||||
* and call the callback function for each one. */
|
|
||||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
|
||||||
opal_list_item_t *i;
|
|
||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
|
|
||||||
&to_base_frag(i)->base, OPAL_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER
|
|
||||||
* or MCA_BTL_OPENIB_FRAG_RECV_USER. */
|
|
||||||
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(&openib_btl->super, des);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Here we send another control message to notify the remote side
|
|
||||||
* we had an error on an eager fragment. A non-zero value for the
|
|
||||||
* ftr variable indicates that this was an eager RDMA fragment.
|
|
||||||
* We need to do this in case the eager RDMA fragment after this
|
|
||||||
* one actually made it successfully. */
|
|
||||||
if (0 != to_send_frag(des)->ftr) {
|
|
||||||
mca_btl_openib_endpoint_notify(endpoint,
|
|
||||||
MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
|
|
||||||
(long)to_send_frag(des)->ftr - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* We know we have completed a send so return some resources even
|
|
||||||
* though connection is broken. With SRQ, the resources are shared
|
|
||||||
* so if we do not return the credits we may not be allowed to send
|
|
||||||
* anymore. */
|
|
||||||
qp_put_wqe(endpoint, qp);
|
|
||||||
if((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
|
|
||||||
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* There are several queues associated with an endpoint that may
|
|
||||||
* have some unsent fragments sitting in them. Remove them and
|
|
||||||
* call the callback functions with an error so the PML can send
|
|
||||||
* them down a different path. This really only needs to be called
|
|
||||||
* once on an endpoint, but for now, just call it a bunch of times.
|
|
||||||
* The first time through will remove the unsent fragments so
|
|
||||||
* subsequent calls are no-ops. */
|
|
||||||
if (endpoint) {
|
|
||||||
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function allows an error to map out the entire BTL. First a
|
|
||||||
* call is made up to the PML to map out all connections from this BTL.
|
|
||||||
* Then a message is sent to all the endpoints connected to this BTL.
|
|
||||||
* This function is enabled by the btl_openib_port_error_failover
|
|
||||||
* MCA parameter. If that parameter is not set, then this function
|
|
||||||
* does not do anything.
|
|
||||||
* @param openib_btl Pointer to BTL that had the error
|
|
||||||
*/
|
|
||||||
void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
|
||||||
mca_btl_base_endpoint_t* endpoint;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
/* Check to see that the flag is set for the entire map out. */
|
|
||||||
if(mca_btl_openib_component.port_error_failover) {
|
|
||||||
/* Since we are not specifying a specific connection to bring down,
|
|
||||||
* the PML layer will map out the entire BTL for future communication. */
|
|
||||||
char *btlname = NULL;
|
|
||||||
asprintf(&btlname, "lid=%d:name=%s",
|
|
||||||
openib_btl->lid, openib_btl->device->ib_dev->name);
|
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
|
||||||
NULL, btlname);
|
|
||||||
if (NULL != btlname) free(btlname);
|
|
||||||
|
|
||||||
/* Now send out messages to all endpoints that we are disconnecting.
|
|
||||||
* Only do this to endpoints that are connected. Otherwise, the
|
|
||||||
* remote side does not yet have the information on this endpoint. */
|
|
||||||
for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) {
|
|
||||||
endpoint = (mca_btl_openib_endpoint_t*)
|
|
||||||
opal_pointer_array_get_item(openib_btl->device->endpoints, i);
|
|
||||||
if (NULL == endpoint) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
|
|
||||||
mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
|
|
||||||
endpoint->endpoint_state = MCA_BTL_IB_FAILED;
|
|
||||||
error_out_all_pending_frags(endpoint, &openib_btl->super, true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function gets called when a control message is received that
|
|
||||||
* is one of the following types:
|
|
||||||
* MCA_BTL_OPENIB_CONTROL_EP_BROKEN
|
|
||||||
* MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message
|
|
||||||
* Note that we are using the working connection to send information
|
|
||||||
* about the broken connection. That is why we have to look at the
|
|
||||||
* various information in the control message to figure out which
|
|
||||||
* endpoint is broken. It is (obviously) not the one the message was
|
|
||||||
* received on, because we would not have received the message in that
|
|
||||||
* case. In the case of the BROKEN message, that means the remote
|
|
||||||
* side is notifying us that it has brought down its half of the
|
|
||||||
* connection. Therefore, we need to bring our half down. This is
|
|
||||||
* done because it has been observed that there are cases where only
|
|
||||||
* one side of the connection actually sees the error. This means we
|
|
||||||
* can be left in a state where one side believes it has two BTLs, but
|
|
||||||
* the other side believes it only has one. This can cause problems.
|
|
||||||
* In the case of the EAGER_RDMA_ERROR, see elsewhere in the code what
|
|
||||||
* we are doing.
|
|
||||||
* @param ctl_hdr Pointer to control header that was received
|
|
||||||
*/
|
|
||||||
void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr,
|
|
||||||
mca_btl_openib_endpoint_t* ep)
|
|
||||||
{
|
|
||||||
mca_btl_openib_broken_connection_header_t *bc_hdr =
|
|
||||||
(mca_btl_openib_broken_connection_header_t*)ctl_hdr;
|
|
||||||
int i;
|
|
||||||
int found = false;
|
|
||||||
|
|
||||||
if(ep->nbo) {
|
|
||||||
BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH((*bc_hdr));
|
|
||||||
}
|
|
||||||
|
|
||||||
opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Control message received from %d: lid=%d,subnet=0x%" PRIx64 "",
|
|
||||||
bc_hdr->vpid, bc_hdr->lid, bc_hdr->subnet_id);
|
|
||||||
|
|
||||||
/* Now we walk through all the endpoints on all the BTLs to
|
|
||||||
* find out which one to map out. */
|
|
||||||
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
|
||||||
mca_btl_openib_module_t* newbtl;
|
|
||||||
int j;
|
|
||||||
|
|
||||||
newbtl = mca_btl_openib_component.openib_btls[i];
|
|
||||||
/* Now, find the endpoint associated with it */
|
|
||||||
for (j = 0; j < opal_pointer_array_get_size(newbtl->device->endpoints); j++) {
|
|
||||||
mca_btl_base_endpoint_t* newep;
|
|
||||||
newep = (mca_btl_openib_endpoint_t*)
|
|
||||||
opal_pointer_array_get_item(newbtl->device->endpoints, j);
|
|
||||||
if (NULL == newep) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
/* Now compare the LID, subnet ID, and the vpid we received
|
|
||||||
* from the remote side and try to match it to an endpoint. */
|
|
||||||
if ((bc_hdr->lid == newep->rem_info.rem_lid) &&
|
|
||||||
(bc_hdr->subnet_id == newep->rem_info.rem_subnet_id) &&
|
|
||||||
(bc_hdr->vpid == newep->endpoint_proc->proc_opal->proc_name.vpid)) {
|
|
||||||
opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Control message received from %d: "
|
|
||||||
"found match: lid=%d,"
|
|
||||||
"subnet=0x%" PRIx64 ",endpoint_state=%d",
|
|
||||||
newep->endpoint_proc->proc_opal->proc_name.vpid,
|
|
||||||
newep->rem_info.rem_lid,
|
|
||||||
newep->rem_info.rem_subnet_id,
|
|
||||||
newep->endpoint_state);
|
|
||||||
found = true;
|
|
||||||
/* At this point, we have found the endpoint. Now decode the
|
|
||||||
* message type and do the appropriate action. */
|
|
||||||
if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) {
|
|
||||||
/* Now that we found a match, check the state of the
|
|
||||||
* endpoint to see if it is already in a failed state.
|
|
||||||
* If not, then notify the upper layer and error out
|
|
||||||
* any pending fragments. */
|
|
||||||
if (MCA_BTL_IB_FAILED == newep->endpoint_state) {
|
|
||||||
return;
|
|
||||||
} else {
|
|
||||||
char *btlname = NULL;
|
|
||||||
opal_proc_t* remote_proc = NULL;
|
|
||||||
|
|
||||||
asprintf(&btlname, "lid=%d:name=%s",
|
|
||||||
newbtl->lid, newbtl->device->ib_dev->name);
|
|
||||||
|
|
||||||
remote_proc = newep->endpoint_proc->proc_opal;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Control message received from %d: "
|
|
||||||
"bringing down connection,lid=%d,"
|
|
||||||
"subnet=0x%" PRIx64 ",endpoint_state=%d",
|
|
||||||
newep->endpoint_proc->proc_opal->proc_name.vpid,
|
|
||||||
newep->rem_info.rem_lid,
|
|
||||||
newep->rem_info.rem_subnet_id,
|
|
||||||
newep->endpoint_state);
|
|
||||||
newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
|
|
||||||
remote_proc, btlname);
|
|
||||||
if (NULL != btlname) free(btlname);
|
|
||||||
|
|
||||||
error_out_all_pending_frags(newep, &newbtl->super, true);
|
|
||||||
newep->endpoint_state = MCA_BTL_IB_FAILED;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
} else { /* MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message */
|
|
||||||
/* If we are still pointing at the location where
|
|
||||||
* we detected an error on the remote side, then
|
|
||||||
* bump the index by one. */
|
|
||||||
if (newep->eager_rdma_local.head == (uint16_t)bc_hdr->index) {
|
|
||||||
/* Adjust the local head by one just in case */
|
|
||||||
MCA_BTL_OPENIB_RDMA_NEXT_INDEX(newep->eager_rdma_local.head);
|
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: rank=%d, control message (remote=%d), "
|
|
||||||
"moved local head by one (new=%d)",
|
|
||||||
OPAL_PROC_MY_NAME.vpid,
|
|
||||||
newep->endpoint_proc->proc_opal->proc_name.vpid,
|
|
||||||
newep->eager_rdma_local.head);
|
|
||||||
} else {
|
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: rank=%d, control message (remote=%d), "
|
|
||||||
"did not move local head by one (still=%d)",
|
|
||||||
OPAL_PROC_MY_NAME.vpid,
|
|
||||||
newep->endpoint_proc->proc_opal->proc_name.vpid,
|
|
||||||
newep->eager_rdma_local.head);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break; /* since we found the endpoint */
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (false == found) {
|
|
||||||
opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Control message: no match found");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function will find all the pending fragments on an endpoint
|
|
||||||
* and call the callback function with OPAL_ERROR. It walks through
|
|
||||||
* each qp with each priority and looks for both no_credits_pending_frags
|
|
||||||
* and no_wqe_pending_frags. It then looks for any pending_lazy_frags,
|
|
||||||
* pending_put_frags, and pending_get_frags. This function is only
|
|
||||||
* called when running with failover support enabled. Note that
|
|
||||||
* the errout parameter allows the function to also be used as a
|
|
||||||
* debugging tool to see if there are any fragments on any of the
|
|
||||||
* queues.
|
|
||||||
* @param ep Pointer to endpoint that had error
|
|
||||||
* @param module Pointer to module that had error
|
|
||||||
* @param errout Boolean which says whether to error them out or not
|
|
||||||
*/
|
|
||||||
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|
||||||
struct mca_btl_base_module_t* module,
|
|
||||||
bool errout)
|
|
||||||
{
|
|
||||||
int qp, pri, len, total, btl_ownership;
|
|
||||||
|
|
||||||
opal_list_item_t *item;
|
|
||||||
mca_btl_openib_com_frag_t* frag;
|
|
||||||
mca_btl_base_descriptor_t *des;
|
|
||||||
int verbose = 10; /* Verbosity level unless debugging */
|
|
||||||
|
|
||||||
/* If debugging, drop verbosity level so we can see the output
|
|
||||||
* regardless of the level the program was run with. */
|
|
||||||
if (false == errout) {
|
|
||||||
verbose = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
total = 0;
|
|
||||||
/* Traverse all QPs and all priorities and move to other endpoint */
|
|
||||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
|
|
||||||
for (pri = 0; pri < 2; ++pri) {
|
|
||||||
/* All types of qp's have a no_wqe_pending_frags list */
|
|
||||||
len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]);
|
|
||||||
if (len > 0) {
|
|
||||||
total += len;
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Checking for no_wqe_pending_frags qp=%d, "
|
|
||||||
"pri=%d, list size=%d",
|
|
||||||
qp, pri, len);
|
|
||||||
if (true == errout) {
|
|
||||||
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
|
|
||||||
no_wqe_pending_frags[pri]))) {
|
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
|
||||||
|
|
||||||
/* Error out any coalesced frags if they exist */
|
|
||||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
|
||||||
opal_list_item_t *i;
|
|
||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Found coalesced frag in no_wqe_pending_frags");
|
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags &
|
|
||||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
to_base_frag(i)->base.des_cbfunc(module, ep,
|
|
||||||
&to_base_frag(i)->base, OPAL_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, &to_base_frag(i)->base);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
des->des_cbfunc(module, ep, des, OPAL_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, des);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
|
|
||||||
len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
|
|
||||||
if (len > 0) {
|
|
||||||
total += len;
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Checking for no_credits_pending_frags qp=%d, "
|
|
||||||
"pri=%d, list size=%d",
|
|
||||||
qp, pri, len);
|
|
||||||
if (true == errout) {
|
|
||||||
while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
|
|
||||||
no_credits_pending_frags[pri]))) {
|
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
|
||||||
|
|
||||||
/* Error out any coalesced frags if they exist */
|
|
||||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
|
||||||
opal_list_item_t *i;
|
|
||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Found coalesced frag in "
|
|
||||||
"no_credits_pending_frags");
|
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags &
|
|
||||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
to_base_frag(i)->base.des_cbfunc(module, ep,
|
|
||||||
&to_base_frag(i)->base, OPAL_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, &to_base_frag(i)->base);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
des->des_cbfunc(module, ep, des, OPAL_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, des);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
|
|
||||||
len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]);
|
|
||||||
if (len > 0) {
|
|
||||||
total += len;
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Checking for srq pending_frags qp=%d, pri=%d, "
|
|
||||||
"list size=%d",
|
|
||||||
qp, pri, len);
|
|
||||||
if (true == errout) {
|
|
||||||
while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
|
|
||||||
u.srq_qp.pending_frags[pri]))) {
|
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
|
||||||
|
|
||||||
/* Error out any coalesced frags if they exist */
|
|
||||||
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
|
||||||
opal_list_item_t *i;
|
|
||||||
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Found coalesced frag in SRQ pending_frags");
|
|
||||||
btl_ownership = (to_base_frag(i)->base.des_flags &
|
|
||||||
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
to_base_frag(i)->base.des_cbfunc(module, ep,
|
|
||||||
&to_base_frag(i)->base, OPAL_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, &to_base_frag(i)->base);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
||||||
des->des_cbfunc(module, ep, des, OPAL_ERROR);
|
|
||||||
if( btl_ownership ) {
|
|
||||||
mca_btl_openib_free(module, des);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Check for any frags from a connection that was never made. Not sure if this
|
|
||||||
* can actually happen. */
|
|
||||||
len = opal_list_get_size(&ep->pending_lazy_frags);
|
|
||||||
|
|
||||||
if (len > 0) {
|
|
||||||
total += len;
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Checking for pending_lazy_frags, list size=%d", len);
|
|
||||||
if (true == errout) {
|
|
||||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
|
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
|
||||||
des->des_cbfunc(module, ep, des, OPAL_ERROR);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
len = opal_list_get_size(&ep->pending_put_frags);
|
|
||||||
if (len > 0) {
|
|
||||||
total += len;
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Checking for pending_put_frags, list size=%d", len);
|
|
||||||
if (true == errout) {
|
|
||||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
|
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
|
||||||
des->des_cbfunc(module, ep, des, OPAL_ERROR);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
len = opal_list_get_size(&ep->pending_get_frags);
|
|
||||||
if (len > 0) {
|
|
||||||
total += len;
|
|
||||||
opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Checking for pending_get_frags, list size=%d", len);
|
|
||||||
if (true == errout) {
|
|
||||||
while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
|
|
||||||
frag = (mca_btl_openib_com_frag_t *) item;
|
|
||||||
des = (mca_btl_base_descriptor_t *)frag;
|
|
||||||
des->des_cbfunc(module, ep, des, OPAL_ERROR);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Finished checking for pending_frags, total moved=%d",
|
|
||||||
total);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* local callback function for completion of a failover control message */
|
|
||||||
static void mca_btl_openib_endpoint_notify_cb(mca_btl_base_module_t* btl,
|
|
||||||
struct mca_btl_base_endpoint_t* endpoint,
|
|
||||||
struct mca_btl_base_descriptor_t* descriptor,
|
|
||||||
int status)
|
|
||||||
{
|
|
||||||
MCA_BTL_IB_FRAG_RETURN(descriptor);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function is used to send a message to the remote side
|
|
||||||
* indicating the endpoint is broken and telling the remote side to
|
|
||||||
* brings its endpoint down as well. This is needed because there are
|
|
||||||
* cases where only one side of the connection determines that the
|
|
||||||
* there was a problem.
|
|
||||||
* @param endpoint Pointer to endpoint with error
|
|
||||||
* @param type Type of message to be sent, can be one of two types
|
|
||||||
* @param index When sending RDMA error message, index is non zero
|
|
||||||
*/
|
|
||||||
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
|
|
||||||
{
|
|
||||||
mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
|
|
||||||
mca_btl_openib_module_t* newbtl = NULL;
|
|
||||||
bool found = false;
|
|
||||||
mca_btl_openib_broken_connection_header_t *bc_hdr;
|
|
||||||
mca_btl_openib_send_control_frag_t* frag;
|
|
||||||
mca_btl_base_endpoint_t* newep;
|
|
||||||
int i, rc;
|
|
||||||
opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;
|
|
||||||
|
|
||||||
/* First, find a different BTL than this one that got the
|
|
||||||
* error to send the message over. */
|
|
||||||
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
|
||||||
if (mca_btl_openib_component.openib_btls[i] != openib_btl) {
|
|
||||||
newbtl = mca_btl_openib_component.openib_btls[i];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (NULL == newbtl) {
|
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Endpoint Notify: No BTL found");
|
|
||||||
/* If we cannot find one, then just return. */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Now, find the endpoint associated with it. The device
|
|
||||||
* associated with the BTL has the list of all the
|
|
||||||
* endpoints. */
|
|
||||||
for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
|
|
||||||
newep = (mca_btl_openib_endpoint_t*)
|
|
||||||
opal_pointer_array_get_item(newbtl->device->endpoints, i);
|
|
||||||
if (NULL == newep) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (newep->endpoint_proc->proc_opal == remote_proc) {
|
|
||||||
found = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (false == found) {
|
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Endpoint Notify: No endpoint found");
|
|
||||||
/* If we cannot find a match, then just return. */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
frag = alloc_control_frag(newbtl);
|
|
||||||
if(NULL == frag) {
|
|
||||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
|
||||||
"IB: Endpoint Notify: No frag space");
|
|
||||||
/* If no frag available, then just return. */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
to_base_frag(frag)->base.des_cbfunc =
|
|
||||||
mca_btl_openib_endpoint_notify_cb;
|
|
||||||
to_base_frag(frag)->base.des_cbdata = NULL;
|
|
||||||
to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
|
||||||
to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
|
|
||||||
to_base_frag(frag)->segment.seg_len =
|
|
||||||
sizeof(mca_btl_openib_broken_connection_header_t);
|
|
||||||
to_com_frag(frag)->endpoint = newep;
|
|
||||||
|
|
||||||
frag->hdr->tag = MCA_BTL_TAG_IB;
|
|
||||||
bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(frag)->segment.seg_addr.pval;
|
|
||||||
bc_hdr->control.type = type;
|
|
||||||
bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
|
|
||||||
bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
|
|
||||||
bc_hdr->vpid = OPAL_PROC_MY_NAME.vpid;
|
|
||||||
bc_hdr->index = index;
|
|
||||||
|
|
||||||
if(newep->nbo) {
|
|
||||||
BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
|
|
||||||
}
|
|
||||||
rc = mca_btl_openib_endpoint_send(newep, frag);
|
|
||||||
if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
MCA_BTL_IB_FRAG_RETURN(frag);
|
|
||||||
BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(errno)));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Function used for debugging problems in eager rdma.
|
|
||||||
*/
|
|
||||||
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint) {
|
|
||||||
mca_btl_openib_recv_frag_t *headers_buf = endpoint->eager_rdma_local.frags;
|
|
||||||
mca_btl_openib_recv_frag_t * frag;
|
|
||||||
mca_btl_openib_control_header_t* chdr;
|
|
||||||
int i, size;
|
|
||||||
|
|
||||||
opal_output(0, "Head = %d", endpoint->eager_rdma_local.head);
|
|
||||||
|
|
||||||
for (i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
|
|
||||||
frag = &headers_buf[i];
|
|
||||||
size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);
|
|
||||||
|
|
||||||
frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
|
|
||||||
size + sizeof(mca_btl_openib_footer_t));
|
|
||||||
to_base_frag(frag)->segment.seg_addr.pval =
|
|
||||||
((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
|
|
||||||
|
|
||||||
chdr = to_base_frag(frag)->segment.seg_addr.pval;
|
|
||||||
if ((MCA_BTL_TAG_IB == frag->hdr->tag) &&
|
|
||||||
(MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type)) {
|
|
||||||
opal_output(0, "tag[%d] is credit message", i);
|
|
||||||
} else {
|
|
||||||
opal_output(0, "frag[%d] size=%d,tag=%d,ftr->u.buf=%d", i, size, frag->hdr->tag,
|
|
||||||
frag->ftr->u.buf[3]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Function used for debugging problems in eager rdma.
|
|
||||||
*/
|
|
||||||
void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device) {
|
|
||||||
int i, c;
|
|
||||||
mca_btl_openib_endpoint_t* endpoint;
|
|
||||||
|
|
||||||
c = device->eager_rdma_buffers_count;
|
|
||||||
opal_output(0, "rank=%d, device=%s", OPAL_PROC_MY_NAME.vpid, device->ib_dev->name);
|
|
||||||
|
|
||||||
for(i = 0; i < c; i++) {
|
|
||||||
endpoint = device->eager_rdma_buffers[i];
|
|
||||||
|
|
||||||
if(!endpoint)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
dump_local_rdma_frags(endpoint);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function is a debugging tool. If you notify a hang, you can
|
|
||||||
* call this function from a debugger and see if there are any
|
|
||||||
* messages stuck in any of the queues. If you call it with
|
|
||||||
* errout=true, then it will error them out. Otherwise, it will
|
|
||||||
* just print out the size of the queues with data in them.
|
|
||||||
*/
|
|
||||||
void mca_btl_openib_dump_all_internal_queues(bool errout) {
|
|
||||||
int i, j, num_eps;
|
|
||||||
mca_btl_openib_module_t* btl;
|
|
||||||
int total;
|
|
||||||
mca_btl_base_endpoint_t* ep;
|
|
||||||
struct mca_btl_base_module_t* module;
|
|
||||||
|
|
||||||
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
|
||||||
btl = mca_btl_openib_component.openib_btls[i];
|
|
||||||
module = &btl->super;
|
|
||||||
num_eps = opal_pointer_array_get_size(btl->device->endpoints);
|
|
||||||
|
|
||||||
/* Now, find the endpoint associated with it */
|
|
||||||
for (j = 0; j < num_eps; j++) {
|
|
||||||
ep = (mca_btl_openib_endpoint_t*)
|
|
||||||
opal_pointer_array_get_item(btl->device->endpoints, j);
|
|
||||||
if (NULL == ep) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
total = 0;
|
|
||||||
error_out_all_pending_frags(ep, module, errout);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,31 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
|
||||||
* $COPYRIGHT$
|
|
||||||
*
|
|
||||||
* Additional copyrights may follow
|
|
||||||
*
|
|
||||||
* $HEADER$
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @file
|
|
||||||
* Functions called by BTL to handle error events
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MCA_BTL_IB_FAILOVER_H
|
|
||||||
#define MCA_BTL_IB_FAILOVER_H
|
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
|
||||||
|
|
||||||
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
|
||||||
mca_btl_base_descriptor_t *des,
|
|
||||||
int qp,
|
|
||||||
opal_proc_t* remote_proc,
|
|
||||||
mca_btl_openib_endpoint_t* endpoint);
|
|
||||||
void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl);
|
|
||||||
void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr,
|
|
||||||
mca_btl_openib_endpoint_t* ep);
|
|
||||||
|
|
||||||
END_C_DECLS
|
|
||||||
|
|
||||||
#endif
|
|
@ -190,10 +190,6 @@ typedef struct mca_btl_openib_footer_t mca_btl_openib_footer_t;
|
|||||||
#define MCA_BTL_OPENIB_CONTROL_RDMA 1
|
#define MCA_BTL_OPENIB_CONTROL_RDMA 1
|
||||||
#define MCA_BTL_OPENIB_CONTROL_COALESCED 2
|
#define MCA_BTL_OPENIB_CONTROL_COALESCED 2
|
||||||
#define MCA_BTL_OPENIB_CONTROL_CTS 3
|
#define MCA_BTL_OPENIB_CONTROL_CTS 3
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
#define MCA_BTL_OPENIB_CONTROL_EP_BROKEN 4
|
|
||||||
#define MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR 5
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct mca_btl_openib_control_header_t {
|
struct mca_btl_openib_control_header_t {
|
||||||
uint8_t type;
|
uint8_t type;
|
||||||
@ -243,32 +239,6 @@ do { \
|
|||||||
(h).rdma_credits = ntohs((h).rdma_credits); \
|
(h).rdma_credits = ntohs((h).rdma_credits); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
/*
 * Payload of the control message that reports a broken connection.
 * It starts with the common control header and is followed by the
 * lid, subnet id, vpid and eager RDMA index carried by the message.
 */
struct mca_btl_openib_broken_connection_header_t {
    mca_btl_openib_control_header_t control;
    uint32_t lid;
    uint64_t subnet_id;
    uint32_t vpid;
    uint32_t index;  /* for eager RDMA only */
};
typedef struct mca_btl_openib_broken_connection_header_t mca_btl_openib_broken_connection_header_t;

/*
 * Convert the multi-byte fields of a broken connection header from host
 * to network byte order, in place.  The embedded control header is left
 * untouched.
 */
#define BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON(h)     \
    do {                                                \
        (h).lid = htonl((h).lid);                       \
        (h).subnet_id = hton64((h).subnet_id);          \
        (h).vpid = htonl((h).vpid);                     \
        (h).index = htonl((h).index);                   \
    } while (0)

/*
 * Convert the multi-byte fields of a broken connection header from
 * network to host byte order, in place.  The embedded control header is
 * left untouched.
 */
#define BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH(h)     \
    do {                                                \
        (h).lid = ntohl((h).lid);                       \
        (h).subnet_id = ntoh64((h).subnet_id);          \
        (h).vpid = ntohl((h).vpid);                     \
        (h).index = ntohl((h).index);                   \
    } while (0)
#endif
|
|
||||||
enum mca_btl_openib_frag_type_t {
|
enum mca_btl_openib_frag_type_t {
|
||||||
MCA_BTL_OPENIB_FRAG_RECV,
|
MCA_BTL_OPENIB_FRAG_RECV,
|
||||||
MCA_BTL_OPENIB_FRAG_RECV_USER,
|
MCA_BTL_OPENIB_FRAG_RECV_USER,
|
||||||
|
@ -89,11 +89,6 @@ static mca_base_var_enum_value_t device_type_values[] = {
|
|||||||
static int btl_openib_cq_size;
|
static int btl_openib_cq_size;
|
||||||
static bool btl_openib_have_fork_support = OPAL_HAVE_IBV_FORK_INIT;
|
static bool btl_openib_have_fork_support = OPAL_HAVE_IBV_FORK_INIT;
|
||||||
|
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
static int btl_openib_verbose_failover;
|
|
||||||
static bool btl_openib_failover_enabled = true;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* utility routine for string parameter registration
|
* utility routine for string parameter registration
|
||||||
*/
|
*/
|
||||||
@ -473,30 +468,6 @@ int btl_openib_register_mca_params(void)
|
|||||||
"If nonzero, use the thread that will handle InfiniBand asynchronous events",
|
"If nonzero, use the thread that will handle InfiniBand asynchronous events",
|
||||||
true, &mca_btl_openib_component.use_async_event_thread));
|
true, &mca_btl_openib_component.use_async_event_thread));
|
||||||
|
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
/* failover specific output */
|
|
||||||
CHECK(reg_int("verbose_failover", NULL,
|
|
||||||
"Output some verbose OpenIB BTL failover information "
|
|
||||||
"(0 = no output, nonzero = output)", 0, &btl_openib_verbose_failover, 0));
|
|
||||||
mca_btl_openib_component.verbose_failover = opal_output_open(NULL);
|
|
||||||
opal_output_set_verbosity(mca_btl_openib_component.verbose_failover, btl_openib_verbose_failover);
|
|
||||||
|
|
||||||
CHECK(reg_bool("port_error_failover", NULL,
|
|
||||||
"If nonzero, asynchronous port errors will trigger failover",
|
|
||||||
0, &mca_btl_openib_component.port_error_failover));
|
|
||||||
|
|
||||||
/* Make non writeable parameter that indicates failover is configured in. */
|
|
||||||
tmp = mca_base_component_var_register(&mca_btl_openib_component.super.btl_version,
|
|
||||||
"failover_enabled",
|
|
||||||
"openib failover is configured: run with bfo PML to support failover between openib BTLs",
|
|
||||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
|
|
||||||
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
|
|
||||||
OPAL_INFO_LVL_9,
|
|
||||||
MCA_BASE_VAR_SCOPE_CONSTANT,
|
|
||||||
&btl_openib_failover_enabled);
|
|
||||||
if (0 > tmp) ret = tmp;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
CHECK(reg_bool("enable_srq_resize", NULL,
|
CHECK(reg_bool("enable_srq_resize", NULL,
|
||||||
"Enable/Disable on demand SRQ resize. "
|
"Enable/Disable on demand SRQ resize. "
|
||||||
"(0 = without resizing, nonzero = with resizing)", 1,
|
"(0 = without resizing, nonzero = with resizing)", 1,
|
||||||
@ -570,10 +541,6 @@ int btl_openib_register_mca_params(void)
|
|||||||
mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
|
mca_btl_openib_module.super.btl_flags = MCA_BTL_FLAGS_RDMA |
|
||||||
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA |
|
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA |
|
||||||
MCA_BTL_FLAGS_SEND;
|
MCA_BTL_FLAGS_SEND;
|
||||||
#if BTL_OPENIB_FAILOVER_ENABLED
|
|
||||||
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_FAILOVER_SUPPORT;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if HAVE_DECL_IBV_ATOMIC_HCA
|
#if HAVE_DECL_IBV_ATOMIC_HCA
|
||||||
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS;
|
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS;
|
||||||
mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
|
mca_btl_openib_module.super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP;
|
||||||
|
@ -104,22 +104,6 @@ AC_DEFUN([MCA_opal_btl_openib_CONFIG],[
|
|||||||
AC_MSG_CHECKING([which openib btl cpcs will be built])
|
AC_MSG_CHECKING([which openib btl cpcs will be built])
|
||||||
AC_MSG_RESULT([$cpcs])])
|
AC_MSG_RESULT([$cpcs])])
|
||||||
|
|
||||||
# Enable openib device failover. It is disabled by default.
|
|
||||||
AC_MSG_CHECKING([whether openib failover is enabled])
|
|
||||||
AC_ARG_ENABLE([btl-openib-failover],
|
|
||||||
[AC_HELP_STRING([--enable-btl-openib-failover],
|
|
||||||
[enable openib BTL failover (default: disabled)])])
|
|
||||||
if test "$enable_btl_openib_failover" = "yes"; then
|
|
||||||
AC_MSG_RESULT([yes])
|
|
||||||
btl_openib_failover_enabled=1
|
|
||||||
else
|
|
||||||
AC_MSG_RESULT([no])
|
|
||||||
btl_openib_failover_enabled=0
|
|
||||||
fi
|
|
||||||
AC_DEFINE_UNQUOTED([BTL_OPENIB_FAILOVER_ENABLED], [$btl_openib_failover_enabled],
|
|
||||||
[enable openib BTL failover])
|
|
||||||
AM_CONDITIONAL([MCA_btl_openib_enable_failover], [test "x$btl_openib_failover_enabled" = "x1"])
|
|
||||||
|
|
||||||
# make sure that CUDA-aware checks have been done
|
# make sure that CUDA-aware checks have been done
|
||||||
AC_REQUIRE([OPAL_CHECK_CUDA])
|
AC_REQUIRE([OPAL_CHECK_CUDA])
|
||||||
|
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user