552c9ca5a0
WHAT: Open our low-level communication infrastructure by moving all necessary components (btl/rcache/allocator/mpool) down into OPAL.

All the components required for inter-process communication are currently deeply integrated into the OMPI layer. Several groups/institutions have expressed interest in having a more generic communication infrastructure, without all the OMPI layer dependencies. This communication layer should be made available at a different software level, available to all layers in the Open MPI software stack. As an example, our ORTE layer could replace the current OOB and instead use the BTL directly, gaining access to more reactive network interfaces than TCP. Similarly, external software libraries could take advantage of our highly optimized AM (active message) communication layer for their own purposes.

UTK, with support from Sandia, developed a version of Open MPI where the entire communication infrastructure has been moved down to OPAL (btl/rcache/allocator/mpool). Most of the moved components have been updated to match the new schema, with a few exceptions (mainly BTLs that I have no way of compiling/testing). Thus, the completion of this RFC is tied to completing this move for all BTLs. For this we need help from the rest of the Open MPI community, especially those supporting some of the BTLs. A non-exhaustive list of BTLs that qualify here is: mx, portals4, scif, udapl, ugni, usnic.

This commit was SVN r32317.
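To make the intent concrete, here is a minimal sketch of how a non-MPI consumer (for example a runtime layer replacing the OOB) might push an active-message fragment through an OPAL-level BTL module once the move is complete. The header path and the descriptor fields used (btl_alloc, btl_send, des_local) are assumptions based on the BTL interface of this era; they are not something this commit defines.

/* Hypothetical illustration only: header location and descriptor field names
 * are assumptions about the post-move OPAL-level BTL API. */
#include <string.h>
#include "opal/mca/btl/btl.h"

static int send_am_fragment(struct mca_btl_base_module_t *btl,
                            struct mca_btl_base_endpoint_t *ep,
                            const void *payload, size_t len,
                            mca_btl_base_tag_t tag)
{
    /* Ask the BTL for a send descriptor large enough for the payload. */
    mca_btl_base_descriptor_t *des =
        btl->btl_alloc(btl, ep, MCA_BTL_NO_ORDER, len,
                       MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    if (NULL == des) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Copy the payload into the descriptor's local segment; the BTL owns
     * and frees the descriptor on completion because of the ownership flag. */
    memcpy(des->des_local->seg_addr.pval, payload, len);
    des->des_local->seg_len = len;

    /* Hand the fragment to the BTL; the active-message callback registered
     * for 'tag' fires on the receiving side. */
    return btl->btl_send(btl, ep, des, tag);
}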
788 lines
37 KiB
C
/*
 * Copyright (c) 2010-2011 Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011      NVIDIA Corporation. All rights reserved.
 * Copyright (c) 2012      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2013      NVIDIA Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

/**
 * @file
 * Functions specific to implementing failover support.
 *
 * This file is conditionally compiled into the BTL when one configures
 * it in with --enable-openib-failover. When this file is compiled
 * in, the multi-BTL configurations can handle errors. The
 * requirement is that there needs to be more than one openib BTL in
 * use so that all the traffic can move to the other BTL. This does
 * not support failing over to a different BTL like TCP.
 */

#include "opal_config.h"
|
|
#include "opal_stdint.h"
|
|
|
|
#include "btl_openib.h"
|
|
#include "btl_openib_endpoint.h"
|
|
#include "btl_openib_proc.h"
|
|
#include "btl_openib_failover.h"
|
|
|
|
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
|
|
struct mca_btl_base_module_t* module,
|
|
bool errout);
|
|
static void mca_btl_openib_endpoint_notify(mca_btl_openib_endpoint_t *endpoint,
|
|
uint8_t type, int index);
|
|
|
|
/* debug functions that are normally not needed */
|
|
void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device);
|
|
void mca_btl_openib_dump_all_internal_queues(bool errout);
|
|
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);
|
|
|
|
/**
 * This function is called when we get an error on the completion
 * event of a fragment. We check to see what type of fragment it is
 * and act accordingly. In most cases, we first call up into the PML
 * and have it map out this connection for any future communication.
 * In addition, this function will possibly send some control messages
 * over the other openib BTL. The first control message will tell the
 * remote side to also map out this connection. The second control
 * message makes sure the eager RDMA connection remains in a sane
 * state. See that function for more details.
 * @param openib_btl Pointer to BTL that had the error
 * @param des Pointer to descriptor that had the error
 * @param qp Queue pair that had the error
 * @param remote_proc Pointer to process that had the error
 * @param endpoint Pointer to endpoint that had the error
 */
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
                                          mca_btl_base_descriptor_t *des,
                                          int qp,
                                          opal_proc_t* remote_proc,
                                          mca_btl_openib_endpoint_t* endpoint)
{
    char *btlname = NULL;
    int btl_ownership;
    /* Since this BTL supports failover, it will call the PML error handler
     * function with the NONFATAL flag. If the PML is running with failover
     * support, then it will map out the endpoint for further communication
     * and return control here. If the PML does not have failover support,
     * it will abort the job and control will not return here. */

    /* Note: At this point, what needs to be done is based on the type
     * of openib fragment that got the error. Also note that in the wc
     * struct, when wc->status != IBV_WC_SUCCESS, these are the only
     * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
     * This means that one cannot key off of the wc->opcode to see what
     * operation was done. The important information needs to be read
     * from the fragment. */

    /* Cannot issue callback to SRQ errors because the shared receive
     * queue is shared and is not specific to a connection. There is no
     * way to figure out what type of message created the error because
     * we need the information in the wc->imm_data field which does not
     * exist when we have an error. So, nothing to do here but return. */
    if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
        !BTL_OPENIB_QP_TYPE_PP(qp)) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "SRQ RECV type=%d", openib_frag_type(des));
        /* Need to think about returning any shared resources of the
         * SRQ. For now, we do nothing as we rarely see an error on
         * the SRQ. */
        return;
    }
    assert(NULL != remote_proc);

    /* Create a nice string to help with debug */
    if (NULL != openib_btl) {
        asprintf(&btlname, "lid=%d:name=%s",
                 openib_btl->lid, openib_btl->device->ib_dev->name);
    }

    /* The next set of errors are associated with an endpoint, but not
     * with a PML descriptor. They are not associated with a PML
     * descriptor because:
     *   A. It was a receive
     *   B. It was some type of openib specific control message.
     * Therefore, just drop the fragments and call up into the PML to
     * disable this endpoint for future communication. */
    if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
         (BTL_OPENIB_QP_TYPE_PP(qp))) ||
        (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) ||
        (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) {
        openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                             remote_proc, btlname);
        /* Now that this connection has been mapped out at the PML layer,
         * we change the state in the BTL layer. The change in the PML
         * layer should prevent us from ever trying to send on this BTL
         * again. If we do, then this is an error case. */
        if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
            endpoint->endpoint_state = MCA_BTL_IB_FAILED;
            mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
            error_out_all_pending_frags(endpoint, &openib_btl->super, true);
        }
        opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
                            "MCA_BTL_OPENIB_FRAG=%d, "
                            "dropping since connection is broken (des=%lx)",
                            openib_frag_type(des), (long unsigned int) des);
        if (NULL != btlname) free(btlname);
        return;
    }

    /* These are RDMA read type fragments. Just continue with processing */
    if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV_USER) {
        OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "OPENIB_FRAG_RECV_USER fragment, "
                            "btl=%lx, continue with callbacks",
                            (long unsigned int) &openib_btl->super);
    }

    /* If we are at this point, we have completed a send, RDMA read or
     * RDMA write. Call the PML callback function to map out this
     * btl for further sending. We just call this every time we get an
     * error even though it is not necessary. Subsequent calls with
     * the same remote_proc argument will not actually map anything out. */
    openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                         remote_proc, btlname);
    if (NULL != btlname) free(btlname);

    /* Since we believe we have done a send, read or write, then the
     * des_local fields should have valid data. */
    assert(des->des_local != NULL);

    /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then
     * change the status. Since this connection was mapped out in the
     * PML layer, no more attempts should be made to send on it. In
     * addition, send a message to the other end of the connection letting
     * it know that this side is now broken. This is needed in the case
     * of a spurious error which may not cause the remote side to detect
     * the error. */
    if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
        endpoint->endpoint_state = MCA_BTL_IB_FAILED;
        mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
    }

    /* Now, call the callback function associated with the fragment.
     * In case the fragments were coalesced we need to pull them apart
     * and call the callback function for each one. */
    if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
        opal_list_item_t *i;
        while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
            btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
            to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
                                             &to_base_frag(i)->base, OPAL_ERROR);
            if( btl_ownership ) {
                mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
            }
        }
    }

    /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER
     * or MCA_BTL_OPENIB_FRAG_RECV_USER. */
    btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR);
    if( btl_ownership ) {
        mca_btl_openib_free(&openib_btl->super, des);
    }

    /* Here we send another control message to notify the remote side
     * we had an error on an eager fragment. A non-zero value for the
     * ftr variable indicates that this was an eager RDMA fragment.
     * We need to do this in case the eager RDMA fragment after this
     * one actually made it successfully. */
    if (0 != to_send_frag(des)->ftr) {
        mca_btl_openib_endpoint_notify(endpoint,
                                       MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
                                       (long)to_send_frag(des)->ftr - 1);
    }

    /* We know we have completed a send so return some resources even
     * though the connection is broken. With SRQ, the resources are shared
     * so if we do not return the credits we may not be allowed to send
     * anymore. */
    qp_put_wqe(endpoint, qp);
    if((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
    }

    /* There are several queues associated with an endpoint that may
     * have some unsent fragments sitting in them. Remove them and
     * call the callback functions with an error so the PML can send
     * them down a different path. This really only needs to be called
     * once on an endpoint, but for now, just call it a bunch of times.
     * The first time through will remove the unsent fragments so
     * subsequent calls are no-ops. */
    if (endpoint) {
        error_out_all_pending_frags(endpoint, &openib_btl->super, true);
    }
}

/**
 * This function allows an error to map out the entire BTL. First a
 * call is made up to the PML to map out all connections from this BTL.
 * Then a message is sent to all the endpoints connected to this BTL.
 * This function is enabled by the btl_openib_port_error_failover
 * MCA parameter. If that parameter is not set, then this function
 * does not do anything.
 * @param openib_btl Pointer to BTL that had the error
 */
void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
    mca_btl_base_endpoint_t* endpoint;
    int i;

    /* Check to see that the flag is set for the entire map out. */
    if(mca_btl_openib_component.port_error_failover) {
        /* Since we are not specifying a specific connection to bring down,
         * the PML layer will map out the entire BTL for future communication. */
        char *btlname = NULL;
        asprintf(&btlname, "lid=%d:name=%s",
                 openib_btl->lid, openib_btl->device->ib_dev->name);
        openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                             NULL, btlname);
        if (NULL != btlname) free(btlname);

        /* Now send out messages to all endpoints that we are disconnecting.
         * Only do this to endpoints that are connected. Otherwise, the
         * remote side does not yet have the information on this endpoint. */
        for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) {
            endpoint = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(openib_btl->device->endpoints, i);
            if (NULL == endpoint) {
                continue;
            }
            if (MCA_BTL_IB_CONNECTED == endpoint->endpoint_state) {
                mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
                endpoint->endpoint_state = MCA_BTL_IB_FAILED;
                error_out_all_pending_frags(endpoint, &openib_btl->super, true);
            }
        }
    }
}

/**
 * This function gets called when a control message is received that
 * is one of the following types:
 *   MCA_BTL_OPENIB_CONTROL_EP_BROKEN
 *   MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message
 * Note that we are using the working connection to send information
 * about the broken connection. That is why we have to look at the
 * various information in the control message to figure out which
 * endpoint is broken. It is (obviously) not the one the message was
 * received on, because we would not have received the message in that
 * case. In the case of the BROKEN message, that means the remote
 * side is notifying us that it has brought down its half of the
 * connection. Therefore, we need to bring our half down. This is
 * done because it has been observed that there are cases where only
 * one side of the connection actually sees the error. This means we
 * can be left in a state where one side believes it has two BTLs, but
 * the other side believes it only has one. This can cause problems.
 * In the case of the EAGER_RDMA_ERROR, see elsewhere in the code what
 * we are doing.
 * @param ctl_hdr Pointer to the control header that was received
 */
void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t *ctl_hdr,
                                                 mca_btl_openib_endpoint_t* ep)
{
    mca_btl_openib_broken_connection_header_t *bc_hdr =
        (mca_btl_openib_broken_connection_header_t*)ctl_hdr;
    int i;
    int found = false;

    if(ep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_NTOH((*bc_hdr));
    }

    opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                        "IB: Control message received from %d: lid=%d,subnet=0x%" PRIx64 "",
                        bc_hdr->vpid, bc_hdr->lid, bc_hdr->subnet_id);

    /* Now we walk through all the endpoints on all the BTLs to
     * find out which one to map out. */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        mca_btl_openib_module_t* newbtl;
        int j;

        newbtl = mca_btl_openib_component.openib_btls[i];
        /* Now, find the endpoint associated with it */
        for (j = 0; j < opal_pointer_array_get_size(newbtl->device->endpoints); j++) {
            mca_btl_base_endpoint_t* newep;
            newep = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(newbtl->device->endpoints, j);
            if (NULL == newep) {
                continue;
            }
            /* Now compare the LID, subnet ID, and the vpid we received
             * from the remote side and try to match it to an endpoint. */
            if ((bc_hdr->lid == newep->rem_info.rem_lid) &&
                (bc_hdr->subnet_id == newep->rem_info.rem_subnet_id) &&
                (bc_hdr->vpid == newep->endpoint_proc->proc_opal->proc_name.vpid)) {
                opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                                    "IB: Control message received from %d: "
                                    "found match: lid=%d,"
                                    "subnet=0x%" PRIx64 ",endpoint_state=%d",
                                    newep->endpoint_proc->proc_opal->proc_name.vpid,
                                    newep->rem_info.rem_lid,
                                    newep->rem_info.rem_subnet_id,
                                    newep->endpoint_state);
                found = true;
                /* At this point, we have found the endpoint. Now decode the
                 * message type and do the appropriate action. */
                if (MCA_BTL_OPENIB_CONTROL_EP_BROKEN == ctl_hdr->type) {
                    /* Now that we found a match, check the state of the
                     * endpoint to see if it is already in a failed state.
                     * If not, then notify the upper layer and error out
                     * any pending fragments. */
                    if (MCA_BTL_IB_FAILED == newep->endpoint_state) {
                        return;
                    } else {
                        char *btlname = NULL;
                        opal_proc_t* remote_proc = NULL;

                        asprintf(&btlname, "lid=%d:name=%s",
                                 newbtl->lid, newbtl->device->ib_dev->name);

                        remote_proc = newep->endpoint_proc->proc_opal;

                        opal_output_verbose(10, mca_btl_openib_component.verbose_failover,
                                            "IB: Control message received from %d: "
                                            "bringing down connection,lid=%d,"
                                            "subnet=0x%" PRIx64 ",endpoint_state=%d",
                                            newep->endpoint_proc->proc_opal->proc_name.vpid,
                                            newep->rem_info.rem_lid,
                                            newep->rem_info.rem_subnet_id,
                                            newep->endpoint_state);
                        newbtl->error_cb(&newbtl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                                         remote_proc, btlname);
                        if (NULL != btlname) free(btlname);

                        error_out_all_pending_frags(newep, &newbtl->super, true);
                        newep->endpoint_state = MCA_BTL_IB_FAILED;
                        return;
                    }
                } else { /* MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR message */
                    /* If we are still pointing at the location where
                     * we detected an error on the remote side, then
                     * bump the index by one. */
                    if (newep->eager_rdma_local.head == (uint16_t)bc_hdr->index) {
                        /* Adjust the local head by one just in case */
                        MCA_BTL_OPENIB_RDMA_NEXT_INDEX(newep->eager_rdma_local.head);
                        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                                            "IB: rank=%d, control message (remote=%d), "
                                            "moved local head by one (new=%d)",
                                            opal_process_name_vpid(OPAL_PROC_MY_NAME),
                                            opal_process_name_vpid(newep->endpoint_proc->proc_opal->proc_name),
                                            newep->eager_rdma_local.head);
                    } else {
                        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                                            "IB: rank=%d, control message (remote=%d), "
                                            "did not move local head by one (still=%d)",
                                            opal_process_name_vpid(OPAL_PROC_MY_NAME),
                                            opal_process_name_vpid(newep->endpoint_proc->proc_opal->proc_name),
                                            newep->eager_rdma_local.head);
                    }
                }
                break; /* since we found the endpoint */
            }
        }
    }
    if (false == found) {
        opal_output_verbose(30, mca_btl_openib_component.verbose_failover,
                            "IB: Control message: no match found");
    }
}

/**
 * This function will find all the pending fragments on an endpoint
 * and call the callback function with OPAL_ERROR. It walks through
 * each qp with each priority and looks for both no_credits_pending_frags
 * and no_wqe_pending_frags. It then looks for any pending_lazy_frags,
 * pending_put_frags, and pending_get_frags. This function is only
 * called when running with failover support enabled. Note that
 * the errout parameter allows the function to also be used as a
 * debugging tool to see if there are any fragments on any of the
 * queues.
 * @param ep Pointer to endpoint that had error
 * @param module Pointer to module that had error
 * @param errout Boolean which says whether to error them out or not
 */
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
                                        struct mca_btl_base_module_t* module,
                                        bool errout)
{
    int qp, pri, len, total, btl_ownership;

    opal_list_item_t *item;
    mca_btl_openib_com_frag_t* frag;
    mca_btl_base_descriptor_t *des;
    int verbose = 10;  /* Verbosity level unless debugging */

    /* If debugging, drop verbosity level so we can see the output
     * regardless of the level the program was run with. */
    if (false == errout) {
        verbose = 0;
    }

    total = 0;
    /* Traverse all QPs and all priorities and move to other endpoint */
    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        for (pri = 0; pri < 2; ++pri) {
            /* All types of qp's have a no_wqe_pending_frags list */
            len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]);
            if (len > 0) {
                total += len;
                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                    "IB: Checking for no_wqe_pending_frags qp=%d, "
                                    "pri=%d, list size=%d",
                                    qp, pri, len);
                if (true == errout) {
                    while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                  no_wqe_pending_frags[pri]))) {
                        frag = (mca_btl_openib_com_frag_t *) item;
                        des = (mca_btl_base_descriptor_t *)frag;

                        /* Error out any coalesced frags if they exist */
                        if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                            opal_list_item_t *i;
                            while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                    "IB: Found coalesced frag in no_wqe_pending_frags");
                                btl_ownership = (to_base_frag(i)->base.des_flags &
                                                 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                 &to_base_frag(i)->base, OPAL_ERROR);
                                if( btl_ownership ) {
                                    mca_btl_openib_free(module, &to_base_frag(i)->base);
                                }
                            }
                        }
                        btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                        des->des_cbfunc(module, ep, des, OPAL_ERROR);
                        if( btl_ownership ) {
                            mca_btl_openib_free(module, des);
                        }
                    }
                }
            }
            if (BTL_OPENIB_QP_TYPE_PP(qp)) {
                len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for no_credits_pending_frags qp=%d, "
                                        "pri=%d, list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                      no_credits_pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;

                            /* Error out any coalesced frags if they exist */
                            if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in "
                                                        "no_credits_pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OPAL_ERROR);
                                    if( btl_ownership ) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OPAL_ERROR);
                            if( btl_ownership ) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }

            } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
                len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for srq pending_frags qp=%d, pri=%d, "
                                        "list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
                                                                      u.srq_qp.pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;

                            /* Error out any coalesced frags if they exist */
                            if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in SRQ pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OPAL_ERROR);
                                    if( btl_ownership ) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OPAL_ERROR);
                            if( btl_ownership ) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }
            }
        }
    }

    /* Check for any frags from a connection that was never made. Not sure if this
     * can actually happen. */
    len = opal_list_get_size(&ep->pending_lazy_frags);

    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_lazy_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    len = opal_list_get_size(&ep->pending_put_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_put_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    len = opal_list_get_size(&ep->pending_get_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_get_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_get_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover,
                        "IB: Finished checking for pending_frags, total moved=%d",
                        total);
}

/* local callback function for completion of a failover control message */
static void mca_btl_openib_endpoint_notify_cb(mca_btl_base_module_t* btl,
                                              struct mca_btl_base_endpoint_t* endpoint,
                                              struct mca_btl_base_descriptor_t* descriptor,
                                              int status)
{
    MCA_BTL_IB_FRAG_RETURN(descriptor);
}

/**
 * This function is used to send a message to the remote side
 * indicating the endpoint is broken and telling the remote side to
 * bring its endpoint down as well. This is needed because there are
 * cases where only one side of the connection determines that
 * there was a problem.
 * @param endpoint Pointer to endpoint with error
 * @param type Type of message to be sent, can be one of two types
 * @param index When sending RDMA error message, index is non zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* newbtl = NULL;
    bool found = false;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_base_endpoint_t* newep;
    int i, rc;
    opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;

    /* First, find a different BTL than this one that got the
     * error to send the message over. */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        if (mca_btl_openib_component.openib_btls[i] != openib_btl) {
            newbtl = mca_btl_openib_component.openib_btls[i];
            break;
        }
    }
    if (NULL == newbtl) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        /* If we cannot find one, then just return. */
        return;
    }

    /* Now, find the endpoint associated with it. The device
     * associated with the BTL has the list of all the
     * endpoints. */
    for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
        newep = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(newbtl->device->endpoints, i);
        if (NULL == newep) {
            continue;
        }
        if (newep->endpoint_proc->proc_opal == remote_proc) {
            found = true;
            break;
        }
    }
    if (false == found) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        /* If we cannot find a match, then just return. */
        return;
    }

    frag = alloc_control_frag(newbtl);
    if(NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        /* If no frag available, then just return. */
        return;
    }

    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_notify_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.base.seg_len =
        sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = newep;

    frag->hdr->tag = MCA_BTL_TAG_IB;
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = opal_process_name_vpid(OPAL_PROC_MY_NAME);
    bc_hdr->index = index;

    if(newep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }
    rc = mca_btl_openib_endpoint_send(newep, frag);
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) {
        return;
    }

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(errno)));
    return;
}

/*
 * Function used for debugging problems in eager rdma.
 */
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint) {
    mca_btl_openib_recv_frag_t *headers_buf = endpoint->eager_rdma_local.frags;
    mca_btl_openib_recv_frag_t * frag;
    mca_btl_openib_control_header_t* chdr;
    int i, size;

    opal_output(0, "Head = %d", endpoint->eager_rdma_local.head);

    for (i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
        frag = &headers_buf[i];
        size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);

        frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
                                               size + sizeof(mca_btl_openib_footer_t));
        to_base_frag(frag)->segment.base.seg_addr.pval =
            ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);

        chdr = to_base_frag(frag)->segment.base.seg_addr.pval;
        if ((MCA_BTL_TAG_IB == frag->hdr->tag) &&
            (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type)) {
            opal_output(0, "tag[%d] is credit message", i);
        } else {
            opal_output(0, "frag[%d] size=%d,tag=%d,ftr->u.buf=%d", i, size, frag->hdr->tag,
                        frag->ftr->u.buf[3]);
        }
    }
}

/*
 * Function used for debugging problems in eager rdma.
 */
void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device) {
    int i, c;
    mca_btl_openib_endpoint_t* endpoint;

    c = device->eager_rdma_buffers_count;
    opal_output(0, "rank=%d, device=%s", opal_process_name_vpid(OPAL_PROC_MY_NAME), device->ib_dev->name);

    for(i = 0; i < c; i++) {
        endpoint = device->eager_rdma_buffers[i];

        if(!endpoint)
            continue;

        dump_local_rdma_frags(endpoint);
    }
}

/**
 * This function is a debugging tool. If you notice a hang, you can
 * call this function from a debugger and see if there are any
 * messages stuck in any of the queues. If you call it with
 * errout=true, then it will error them out. Otherwise, it will
 * just print out the size of the queues with data in them.
 */
void mca_btl_openib_dump_all_internal_queues(bool errout) {
    int i, j, num_eps;
    mca_btl_openib_module_t* btl;
    int total;
    mca_btl_base_endpoint_t* ep;
    struct mca_btl_base_module_t* module;

    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        btl = mca_btl_openib_component.openib_btls[i];
        module = &btl->super;
        num_eps = opal_pointer_array_get_size(btl->device->endpoints);

        /* Now, find the endpoint associated with it */
        for (j = 0; j < num_eps; j++) {
            ep = (mca_btl_openib_endpoint_t*)
                opal_pointer_array_get_item(btl->device->endpoints, j);
            if (NULL == ep) {
                continue;
            }

            total = 0;
            error_out_all_pending_frags(ep, module, errout);
        }
    }
}