openmpi/opal/mca/btl/openib/btl_openib_async.c

/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2007-2013 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2006-2007 Voltaire All rights reserved.
 * Copyright (c) 2009-2010 Oracle and/or its affiliates.  All rights reserved
 * Copyright (c) 2013      Los Alamos National Security, LLC. All rights
 *                         reserved.
 * Copyright (c) 2014      Intel, Inc. All rights reserved.
 * Copyright (c) 2014      Bull SAS.  All rights reserved.
 * Copyright (c) 2015      Research Organization for Information Science
 *                         and Technology (RIST). All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */


#include "opal_config.h"

#if OPAL_HAVE_THREADS
#include <infiniband/verbs.h>
#include <fcntl.h>
#include <sys/poll.h>
#include <unistd.h>
#include <errno.h>

#include "opal/util/show_help.h"
#include "opal/util/proc.h"

#include "opal/mca/btl/base/base.h"
#include "btl_openib.h"
#include "btl_openib_mca.h"
#include "btl_openib_async.h"
#include "btl_openib_proc.h"
#include "btl_openib_endpoint.h"

struct mca_btl_openib_async_poll {
    int active_poll_size;
    int poll_size;
    struct pollfd *async_pollfd;
};
typedef struct mca_btl_openib_async_poll mca_btl_openib_async_poll;

typedef struct {
    opal_list_item_t super;
    struct ibv_qp *qp;
} mca_btl_openib_qp_list;

OBJ_CLASS_INSTANCE(mca_btl_openib_qp_list, opal_list_item_t, NULL, NULL);

static int return_status = OPAL_ERROR;

static int btl_openib_async_poll_init(struct mca_btl_openib_async_poll *hcas_poll);
static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *hcas_poll, opal_list_t *ignore_qp_err_list);
static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *hcas_poll, int index,
                                    opal_list_t *ignore_qp_err_list);
static const char *openib_event_to_str (enum ibv_event_type event);
static int send_command_comp(int in);

/* Function converts event to string (name)
 * Open Fabris don't have function that do this job :(
 */
static const char *openib_event_to_str (enum ibv_event_type event)
{
    switch (event) {
    case IBV_EVENT_CQ_ERR:
        return "IBV_EVENT_CQ_ERR";
    case IBV_EVENT_QP_FATAL:
        return "IBV_EVENT_QP_FATAL";
    case IBV_EVENT_QP_REQ_ERR:
        return "IBV_EVENT_QP_REQ_ERR";
    case IBV_EVENT_QP_ACCESS_ERR:
        return "IBV_EVENT_QP_ACCESS_ERR";
    case IBV_EVENT_PATH_MIG:
        return "IBV_EVENT_PATH_MIG";
    case IBV_EVENT_PATH_MIG_ERR:
        return "IBV_EVENT_PATH_MIG_ERR";
    case IBV_EVENT_DEVICE_FATAL:
        return "IBV_EVENT_DEVICE_FATAL";
    case IBV_EVENT_SRQ_ERR:
        return "IBV_EVENT_SRQ_ERR";
    case IBV_EVENT_PORT_ERR:
        return "IBV_EVENT_PORT_ERR";
    case IBV_EVENT_COMM_EST:
        return "IBV_EVENT_COMM_EST";
    case IBV_EVENT_PORT_ACTIVE:
        return "IBV_EVENT_PORT_ACTIVE";
    case IBV_EVENT_SQ_DRAINED:
        return "IBV_EVENT_SQ_DRAINED";
    case IBV_EVENT_LID_CHANGE:
        return "IBV_EVENT_LID_CHANGE";
    case IBV_EVENT_PKEY_CHANGE:
        return "IBV_EVENT_PKEY_CHANGE";
    case IBV_EVENT_SM_CHANGE:
        return "IBV_EVENT_SM_CHANGE";
    case IBV_EVENT_QP_LAST_WQE_REACHED:
        return "IBV_EVENT_QP_LAST_WQE_REACHED";
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
    case IBV_EVENT_CLIENT_REREGISTER:
        return "IBV_EVENT_CLIENT_REREGISTER";
#endif
    case IBV_EVENT_SRQ_LIMIT_REACHED:
        return "IBV_EVENT_SRQ_LIMIT_REACHED";
    default:
        return "UNKNOWN";
    }
}
/* QP to endpoint */
static mca_btl_openib_endpoint_t * qp2endpoint(struct ibv_qp *qp, mca_btl_openib_device_t *device)
{
    mca_btl_openib_endpoint_t *ep;
    int  ep_i, qp_i;
    for(ep_i = 0; ep_i < opal_pointer_array_get_size(device->endpoints); ep_i++) {
        ep = opal_pointer_array_get_item(device->endpoints, ep_i);
        for(qp_i = 0; qp_i < mca_btl_openib_component.num_qps; qp_i++) {
            if (qp == ep->qps[qp_i].qp->lcl_qp)
                return ep;
        }
    }
    return NULL;
}

#if HAVE_XRC
/* XRC recive QP to endpoint */
static mca_btl_openib_endpoint_t * xrc_qp2endpoint(uint32_t qp_num, mca_btl_openib_device_t *device)
{
    mca_btl_openib_endpoint_t *ep;
    int  ep_i;
    for(ep_i = 0; ep_i < opal_pointer_array_get_size(device->endpoints); ep_i++) {
        ep = opal_pointer_array_get_item(device->endpoints, ep_i);
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        if (qp_num == ep->xrc_recv_qp->qp_num)
#else
        if (qp_num == ep->xrc_recv_qp_num)
#endif
            return ep;
    }
    return NULL;
}
#endif

/* Function inits mca_btl_openib_async_poll */
static int btl_openib_async_poll_init(struct mca_btl_openib_async_poll *devices_poll)
{
    devices_poll->active_poll_size = 1;
    devices_poll->poll_size = 4;
    devices_poll->async_pollfd = malloc(sizeof(struct pollfd) * devices_poll->poll_size);
    if (NULL == devices_poll->async_pollfd) {
        BTL_ERROR(("Failed malloc: %s:%d"
                    , __FILE__, __LINE__));
        return OPAL_ERROR;
    }
    /* Creating comunication channel with the main thread */
    devices_poll->async_pollfd[0].fd = mca_btl_openib_component.async_pipe[0];
    devices_poll->async_pollfd[0].events = POLLIN;
    devices_poll->async_pollfd[0].revents = 0;
    return OPAL_SUCCESS;
}

/* Send command completion to main thread */
static int send_command_comp(int in)
{
    if (write(mca_btl_openib_component.async_comp_pipe[1], &in, sizeof(int)) < 0) {
        BTL_ERROR(("Write failed [%d]",errno));
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}

/* Function handle async thread commands */
static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *devices_poll, opal_list_t *ignore_qp_err_list)
{
    struct pollfd *async_pollfd_tmp;
    mca_btl_openib_async_cmd_t cmd;
    int fd,flags,j;
    /* Got command from main thread */
    if (read(devices_poll->async_pollfd[0].fd, &cmd, sizeof(mca_btl_openib_async_cmd_t)) < 0) {
        BTL_ERROR(("Read failed [%d]",errno));
        return OPAL_ERROR;
    }

    BTL_VERBOSE(("Got cmd %d", cmd.a_cmd));
    if (OPENIB_ASYNC_CMD_FD_ADD == cmd.a_cmd) {
        fd = cmd.fd;
        BTL_VERBOSE(("Got fd %d", fd));
        BTL_VERBOSE(("Adding device [%d] to async event poll[%d]",
                     fd, devices_poll->active_poll_size));
        flags = fcntl(fd, F_GETFL);
        if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
            BTL_ERROR(("Failed to change file descriptor of async event"));
            return OPAL_ERROR;
        }
        if ((devices_poll->active_poll_size + 1) > devices_poll->poll_size) {
            devices_poll->poll_size+=devices_poll->poll_size;
            async_pollfd_tmp = malloc(sizeof(struct pollfd) * devices_poll->poll_size);
            if (NULL == async_pollfd_tmp) {
                BTL_ERROR(("Failed malloc: %s:%d.  "
                            "Fatal error, stoping asynch event thread"
                            , __FILE__, __LINE__));
                return OPAL_ERROR;
            }
            memcpy (async_pollfd_tmp,devices_poll->async_pollfd,
                    sizeof(struct pollfd) * (devices_poll->active_poll_size));
            free(devices_poll->async_pollfd);
            devices_poll->async_pollfd = async_pollfd_tmp;
        }
        devices_poll->async_pollfd[devices_poll->active_poll_size].fd = fd;
        devices_poll->async_pollfd[devices_poll->active_poll_size].events = POLLIN;
        devices_poll->async_pollfd[devices_poll->active_poll_size].revents = 0;
        devices_poll->active_poll_size++;
        if (OPAL_SUCCESS != send_command_comp(fd)) {
            return OPAL_ERROR;
        }
    } else if (OPENIB_ASYNC_CMD_FD_REMOVE == cmd.a_cmd) {
        bool fd_found = false;

        fd = cmd.fd;
        BTL_VERBOSE(("Got fd %d", fd));

        /* Removing device from poll */
        BTL_VERBOSE(("Removing device [%d] from async event poll [%d]",
                     fd, devices_poll->active_poll_size));
        if (devices_poll->active_poll_size > 1) {
            for (j=0; (j < devices_poll->active_poll_size || !fd_found); j++) {
                if (devices_poll->async_pollfd[j].fd == fd) {
                    devices_poll->async_pollfd[j].fd =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].fd;
                    devices_poll->async_pollfd[j].events =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].events;
                    devices_poll->async_pollfd[j].revents =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].revents;
                    fd_found = true;
                }
            }
            if (!fd_found) {
                BTL_ERROR(("Requested FD[%d] was not found in poll array",fd));
                return OPAL_ERROR;
            }
        }
        devices_poll->active_poll_size--;
        if (OPAL_SUCCESS != send_command_comp(fd)) {
            return OPAL_ERROR;
        }
    } else if (OPENIB_ASYNC_IGNORE_QP_ERR == cmd.a_cmd) {
        mca_btl_openib_qp_list *new_qp;
        new_qp = OBJ_NEW(mca_btl_openib_qp_list);
        BTL_VERBOSE(("Ignore errors on QP %p", (void *)cmd.qp));
        new_qp->qp = cmd.qp;
        opal_list_append(ignore_qp_err_list, (opal_list_item_t *)new_qp);
        send_command_comp(OPENIB_ASYNC_IGNORE_QP_ERR);

    } else if (OPENIB_ASYNC_THREAD_EXIT == cmd.a_cmd) {
        /* Got 0 - command to close the thread */
        opal_list_item_t *item;
        BTL_VERBOSE(("Async event thread exit"));
        free(devices_poll->async_pollfd);
        return_status = OPAL_SUCCESS;

        while ((item = opal_list_remove_first(ignore_qp_err_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(ignore_qp_err_list);

        pthread_exit(&return_status);
    }
    return OPAL_SUCCESS;
}

/* The main idea of resizing SRQ algorithm -
   We create a SRQ with size = rd_num, but for efficient usage of resources
   the number of WQEs that we post = rd_curr_num < rd_num and this value is
   increased (by needs) in IBV_EVENT_SRQ_LIMIT_REACHED event handler (i.e. in this function),
   the event will thrown by device if number of WQEs in SRQ will be less than srq_limit */
static int btl_openib_async_srq_limit_event(struct ibv_srq* srq)
{
    int qp, rc = OPAL_SUCCESS;
    mca_btl_openib_module_t *openib_btl = NULL;

    opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock;
    opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table;

    opal_mutex_lock(lock);

    if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(srq_addr_table,
                            &srq, sizeof(struct ibv_srq*), (void*) &openib_btl)) {
        /* If there isn't any element with the key in the table =>
           we assume that SRQ was destroyed and don't serve the event */
        goto srq_limit_event_exit;
    }

    for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
        if (!BTL_OPENIB_QP_TYPE_PP(qp)) {
            if(openib_btl->qps[qp].u.srq_qp.srq == srq) {
                break;
            }
        }
    }

    if(qp >= mca_btl_openib_component.num_qps) {
        BTL_ERROR(("Open MPI tried to access a shared receive queue (SRQ) on the device %s that was not found.  This should not happen, and is a fatal error.  Your MPI job will now abort.\n", ibv_get_device_name(openib_btl->device->ib_dev)));
        rc = OPAL_ERROR;
        goto srq_limit_event_exit;
    }

    /* dynamically re-size the SRQ to be larger */
    openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1;

    if(openib_btl->qps[qp].u.srq_qp.rd_curr_num >=
                         mca_btl_openib_component.qp_infos[qp].rd_num) {
        openib_btl->qps[qp].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[qp].rd_num;
        openib_btl->qps[qp].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[qp].rd_low;

        openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;

        goto srq_limit_event_exit;
    }

    openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1;
    openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;

srq_limit_event_exit:
    opal_mutex_unlock(lock);
    return rc;
}

/* Function handle async device events */
static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index,
                                    opal_list_t *ignore_qp_err_list)
{
    int j;
    mca_btl_openib_device_t *device = NULL;
    struct ibv_async_event event;
    bool xrc_event = false;
    int event_type;

    /* We need to find correct device and process this event */
    for (j=0; j < mca_btl_openib_component.ib_num_btls; j++) {
        if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
                devices_poll->async_pollfd[index].fd ) {
            device = mca_btl_openib_component.openib_btls[j]->device;
            break;
        }
    }
    if (NULL != device) {
        if (ibv_get_async_event((struct ibv_context *)device->ib_dev_context,&event) < 0) {
            if (EWOULDBLOCK == errno) {
                /* No event found ?
                 * It was handled by somebody other */
                return OPAL_SUCCESS;
            } else {
                BTL_ERROR(("Failed to get async event"));
                return OPAL_ERROR;
            }
        }

        event_type = event.event_type;
#if HAVE_XRC
        /* is it XRC event ?*/
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
#else
        if (IBV_XRC_QP_EVENT_FLAG & event.event_type) {
            xrc_event = true;
            /* Clean the bitnd handel as usual */
            event_type ^= IBV_XRC_QP_EVENT_FLAG;
        }
#endif
#endif
        switch(event_type) {
            case IBV_EVENT_PATH_MIG:
                BTL_ERROR(("Alternative path migration event reported"));
                if (APM_ENABLED) {
                    BTL_ERROR(("Trying to find additional path..."));
                    if (!xrc_event)
                        mca_btl_openib_load_apm(event.element.qp,
                                qp2endpoint(event.element.qp, device));
#if HAVE_XRC
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
#else
                    else
                        mca_btl_openib_load_apm_xrc_rcv(event.element.xrc_qp_num,
                                xrc_qp2endpoint(event.element.xrc_qp_num, device));
#endif
#endif
                }
                break;
            case IBV_EVENT_DEVICE_FATAL:
                /* Set the flag to fatal */
                device->got_fatal_event = true;
                /* It is not critical to protect the counter */
                OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
            case IBV_EVENT_CQ_ERR:
            case IBV_EVENT_QP_FATAL:
              if (event_type == IBV_EVENT_QP_FATAL) {
                  opal_list_item_t *item;
                  mca_btl_openib_qp_list *qp_item;
                  bool in_ignore_list = false;

                  BTL_VERBOSE(("QP is in err state %p", (void *)event.element.qp));

                  /* look through ignore list */
                  for (item = opal_list_get_first(ignore_qp_err_list);
                       item != opal_list_get_end(ignore_qp_err_list);
                       item = opal_list_get_next(item)) {
                      qp_item = (mca_btl_openib_qp_list *)item;
                      if (qp_item->qp == event.element.qp) {
                          BTL_VERBOSE(("QP %p is in error ignore list",
                                       (void *)event.element.qp));
                          in_ignore_list = true;
                          break;
                      }
                  }
                  if (in_ignore_list)
                      break;
              }

            case IBV_EVENT_QP_REQ_ERR:
            case IBV_EVENT_QP_ACCESS_ERR:
            case IBV_EVENT_PATH_MIG_ERR:
            case IBV_EVENT_SRQ_ERR:
                opal_show_help("help-mpi-btl-openib.txt", "of error event",
                    true,opal_process_info.nodename, (int)getpid(),
                    event_type,
                    openib_event_to_str((enum ibv_event_type)event_type),
                    xrc_event ? "true" : "false");
                break;
            case IBV_EVENT_PORT_ERR:
                opal_show_help("help-mpi-btl-openib.txt", "of error event",
                    true,opal_process_info.nodename, (int)getpid(),
                    event_type,
                    openib_event_to_str((enum ibv_event_type)event_type),
                    xrc_event ? "true" : "false");
                /* Set the flag to indicate port error */
                device->got_port_event = true;
                OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
                break;
            case IBV_EVENT_COMM_EST:
            case IBV_EVENT_PORT_ACTIVE:
            case IBV_EVENT_SQ_DRAINED:
            case IBV_EVENT_LID_CHANGE:
            case IBV_EVENT_PKEY_CHANGE:
            case IBV_EVENT_SM_CHANGE:
            case IBV_EVENT_QP_LAST_WQE_REACHED:
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
            case IBV_EVENT_CLIENT_REREGISTER:
#endif
                break;
            /* The event is signaled when number of prepost receive WQEs is going
                                            under predefined threshold - srq_limit */
            case IBV_EVENT_SRQ_LIMIT_REACHED:
                if(OPAL_SUCCESS !=
                         btl_openib_async_srq_limit_event(event.element.srq)) {
                    return OPAL_ERROR;
                }

                break;
            default:
                opal_show_help("help-mpi-btl-openib.txt", "of unknown event",
                        true,opal_process_info.nodename, (int)getpid(),
                        event_type, xrc_event ? "true" : "false");
        }
        ibv_ack_async_event(&event);
    } else {
        /* if (device == NULL), then failed to locate the device!
           This should never happen... */
        BTL_ERROR(("Failed to find device with FD %d.  "
                   "Fatal error, stoping asynch event thread",
                   devices_poll->async_pollfd[index].fd));
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}

/* This Async event thread is handling all async event of
 * all btls/devices in openib component
 */
static void* btl_openib_async_thread(void * async)
{
    int rc;
    int i;
    struct mca_btl_openib_async_poll devices_poll;
    opal_list_t ignore_qp_err_list;

    OBJ_CONSTRUCT(&ignore_qp_err_list, opal_list_t);

    if (OPAL_SUCCESS != btl_openib_async_poll_init(&devices_poll)) {
        BTL_ERROR(("Fatal error, stoping asynch event thread"));
        pthread_exit(&return_status);
    }

    while(1) {
        rc = poll(devices_poll.async_pollfd, devices_poll.active_poll_size, -1);
        if (rc < 0) {
            if (errno != EINTR) {
                BTL_ERROR(("Poll failed.  Fatal error, stoping asynch event thread"));
                pthread_exit(&return_status);
            } else {
                /* EINTR - we got interupt */
                continue;
            }
        }
        for(i = 0; i < devices_poll.active_poll_size; i++) {
            switch (devices_poll.async_pollfd[i].revents) {
                case 0:
                    /* no events */
                    break;
                case POLLIN:
#if defined(__SVR4) && defined(__sun)
                /*
                 * Need workaround for Solaris IB user verbs since
                 * "Poll on IB async fd returns POLLRDNORM revent even though it is masked out"
                 */
                case POLLIN | POLLRDNORM:
#endif
                    /* Processing our event */
                    if (0 == i) {
                        /* 0 poll we use for comunication with main thread */
                        if (OPAL_SUCCESS != btl_openib_async_commandh(&devices_poll,
                                                                      &ignore_qp_err_list)) {
                            free(devices_poll.async_pollfd);
                            BTL_ERROR(("Failed to process async thread process.  "
                                        "Fatal error, stoping asynch event thread"));
                            pthread_exit(&return_status);
                        }
                    } else {
                        /* We get device event */
                        if (btl_openib_async_deviceh(&devices_poll, i,
                                                     &ignore_qp_err_list)) {
                            free(devices_poll.async_pollfd);
                            BTL_ERROR(("Failed to process async thread process.  "
                                        "Fatal error, stoping asynch event thread"));
                            pthread_exit(&return_status);
                        }
                    }
                    break;
                default:
                    /* Get event other than POLLIN
                     * this case should not never happend */
                    BTL_ERROR(("Got unexpected event %d.  "
                               "Fatal error, stoping asynch event thread",
                               devices_poll.async_pollfd[i].revents));
                    free(devices_poll.async_pollfd);
                    pthread_exit(&return_status);
            }
        }
    }
    return PTHREAD_CANCELED;
}

int btl_openib_async_command_done(int exp)
{
    int comp;
    if (read(mca_btl_openib_component.async_comp_pipe[0], &comp,
                sizeof(int)) < 0){
        BTL_ERROR(("Failed to read from pipe"));
        return OPAL_ERROR;
    }
    if (exp != comp){
        BTL_ERROR(("Get wrong completion on async command. Waiting for %d and got %d",
                    exp, comp));
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}

static void apm_update_attr(struct ibv_qp_attr *attr, enum ibv_qp_attr_mask *mask)
{
    *mask = IBV_QP_ALT_PATH|IBV_QP_PATH_MIG_STATE;
    attr->alt_ah_attr.dlid = attr->ah_attr.dlid + 1;
    attr->alt_ah_attr.src_path_bits = attr->ah_attr.src_path_bits + 1;
    attr->alt_ah_attr.static_rate = attr->ah_attr.static_rate;
    attr->alt_ah_attr.sl = attr->ah_attr.sl;
    attr->alt_pkey_index = attr->pkey_index;
    attr->alt_port_num = attr->port_num;
    attr->alt_timeout = attr->timeout;
    attr->path_mig_state = IBV_MIG_REARM;
    BTL_VERBOSE(("New APM LMC loaded: alt_src_port:%d, dlid: %d, src_bits %d, old_src_bits: %d, old_dlid %d",
                attr->alt_port_num, attr->alt_ah_attr.dlid,
                attr->alt_ah_attr.src_path_bits, attr->ah_attr.src_path_bits, attr->ah_attr.dlid));
}

static int apm_update_port(mca_btl_openib_endpoint_t *ep,
        struct ibv_qp_attr *attr, enum ibv_qp_attr_mask *mask)
{
    size_t port_i;
    uint16_t apm_lid = 0;

    if (attr->port_num == ep->endpoint_btl->apm_port) {
        /* all ports were used */
        BTL_ERROR(("APM: already all ports were used port_num %d apm_port %d",
                    attr->port_num, ep->endpoint_btl->apm_port));
        return OPAL_ERROR;
    }
    /* looking for alternatve lid on remote site */
    for(port_i = 0; port_i < ep->endpoint_proc->proc_port_count; port_i++) {
        if (ep->endpoint_proc->proc_ports[port_i].pm_port_info.lid == attr->ah_attr.dlid - mca_btl_openib_component.apm_lmc) {
            apm_lid = ep->endpoint_proc->proc_ports[port_i].pm_port_info.apm_lid;
        }
    }
    if (0 == apm_lid) {
        /* APM was disabled on one of site ? */
        BTL_VERBOSE(("APM: Was disabled ? dlid %d %d %d", attr->ah_attr.dlid, attr->ah_attr.src_path_bits, ep->endpoint_btl->src_path_bits));
        return OPAL_ERROR;
    }
    /* We guess cthat the LMC is the same on all ports */
    attr->alt_ah_attr.static_rate = attr->ah_attr.static_rate;
    attr->alt_ah_attr.sl = attr->ah_attr.sl;
    attr->alt_pkey_index = attr->pkey_index;
    attr->alt_timeout = attr->timeout;
    attr->path_mig_state = IBV_MIG_REARM;
    *mask = IBV_QP_ALT_PATH|IBV_QP_PATH_MIG_STATE;

    attr->alt_port_num = ep->endpoint_btl->apm_port;
    attr->alt_ah_attr.src_path_bits = ep->endpoint_btl->src_path_bits;
    attr->alt_ah_attr.dlid = apm_lid;

    BTL_VERBOSE(("New APM port loaded: alt_src_port:%d, dlid: %d, src_bits: %d:%d, old_dlid %d",
                attr->alt_port_num, attr->alt_ah_attr.dlid,
                attr->ah_attr.src_path_bits, attr->alt_ah_attr.src_path_bits,
                attr->ah_attr.dlid));
    return OPAL_SUCCESS;
}

/* Load new dlid to the QP */
void mca_btl_openib_load_apm(struct ibv_qp *qp, mca_btl_openib_endpoint_t *ep)
{
    struct ibv_qp_init_attr qp_init_attr;
    struct ibv_qp_attr attr;
    enum ibv_qp_attr_mask mask = 0;
    struct mca_btl_openib_module_t *btl;

    BTL_VERBOSE(("APM: Loading alternative path"));
    assert (NULL != ep);
    btl = ep->endpoint_btl;

    if (ibv_query_qp(qp, &attr, mask, &qp_init_attr))
        BTL_ERROR(("Failed to ibv_query_qp, qp num: %d", qp->qp_num));

    if (mca_btl_openib_component.apm_lmc &&
            attr.ah_attr.src_path_bits - btl->src_path_bits < mca_btl_openib_component.apm_lmc) {
        BTL_VERBOSE(("APM LMC: src: %d btl_src: %d lmc_max: %d",
                    attr.ah_attr.src_path_bits,
                    btl->src_path_bits,
                    mca_btl_openib_component.apm_lmc));
        apm_update_attr(&attr, &mask);
    } else {
        if (mca_btl_openib_component.apm_ports) {
            /* Try to migrate to next port */
            if (OPAL_SUCCESS != apm_update_port(ep, &attr, &mask))
                return;
        } else {
            BTL_ERROR(("Failed to load alternative path, all %d were used",
                        attr.ah_attr.src_path_bits - btl->src_path_bits));
        }
    }

    if (ibv_modify_qp(qp, &attr, mask))
        BTL_ERROR(("Failed to ibv_query_qp, qp num: %d, errno says: %s (%d)",
                   qp->qp_num, strerror(errno), errno));
}

#if HAVE_XRC && ! OPAL_HAVE_CONNECTX_XRC_DOMAINS
void mca_btl_openib_load_apm_xrc_rcv(uint32_t qp_num, mca_btl_openib_endpoint_t *ep)
{
    struct ibv_qp_init_attr qp_init_attr;
    struct ibv_qp_attr attr;
    enum ibv_qp_attr_mask mask = 0;
    struct mca_btl_openib_module_t *btl;

    BTL_VERBOSE(("APM XRC: Loading alternative path"));
    assert (NULL != ep);
    btl = ep->endpoint_btl;

    if (ibv_query_xrc_rcv_qp(btl->device->xrc_domain, qp_num, &attr, mask, &qp_init_attr))
        BTL_ERROR(("Failed to ibv_query_qp, qp num: %d", qp_num));

    if (mca_btl_openib_component.apm_lmc &&
            attr.ah_attr.src_path_bits - btl->src_path_bits < mca_btl_openib_component.apm_lmc) {
        apm_update_attr(&attr, &mask);
    } else {
        if (mca_btl_openib_component.apm_ports) {
            /* Try to migrate to next port */
            if (OPAL_SUCCESS != apm_update_port(ep, &attr, &mask))
                return;
        } else {
            BTL_ERROR(("Failed to load alternative path, all %d were used",
                        attr.ah_attr.src_path_bits - btl->src_path_bits));
        }
    }

    ibv_modify_xrc_rcv_qp(btl->device->xrc_domain, qp_num, &attr, mask);

    /* Maybe the qp already was modified by other process - ignoring error */
}
#endif

int start_async_event_thread(void)
{
    if (0 != mca_btl_openib_component.async_thread) {
        return OPAL_SUCCESS;
    }

    /* Set the error counter to zero */
    mca_btl_openib_component.error_counter = 0;

    /* Create pipe for communication with async event thread */
    if (pipe(mca_btl_openib_component.async_pipe)) {
        BTL_ERROR(("Failed to create pipe for communication with "
                   "async event thread"));
        return OPAL_ERROR;
    }

    if (pipe(mca_btl_openib_component.async_comp_pipe)) {
        BTL_ERROR(("Failed to create comp pipe for communication with "
                   "main thread"));
        return OPAL_ERROR;
    }

    /* Starting async event thread for the component */
    if (pthread_create(&mca_btl_openib_component.async_thread, NULL,
                       (void*(*)(void*)) btl_openib_async_thread, NULL)) {
        BTL_ERROR(("Failed to create async event thread"));
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}

#endif