/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
 *                         reserved.
 *               2014      Mellanox Technologies, Inc.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 *
 */

#include "oob_ud_component.h"
#include "oob_ud_qp.h"
#include "oob_ud.h"
#include "orte/util/show_help.h"

static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp);
static void mca_oob_ud_qp_destructor  (mca_oob_ud_qp_t *qp);

OBJ_CLASS_INSTANCE(mca_oob_ud_qp_t, opal_free_list_item_t,
                   mca_oob_ud_qp_constructor,
                   mca_oob_ud_qp_destructor);

static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp,
                                                          int num_completions);

#define MCA_OOB_UD_CLEAR_CQ(cq)                                        \
    do {                                                               \
        if (NULL == (cq)->channel) {                                   \
            struct ibv_wc wc;                                          \
            while (ibv_poll_cq ((cq), 1, &wc));                        \
        }                                                              \
    } while (0);                                                       \

int mca_oob_ud_qp_init (mca_oob_ud_qp_t *qp, struct mca_oob_ud_port_t *port,
                        struct ibv_comp_channel *recv_channel,
                        struct ibv_comp_channel *send_channel, bool onecq)
{
    struct ibv_qp_init_attr init_attr;
    int max_cqe = min(port->device->attr.max_cqe, 16384);

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s oob:ud:qp_init creating UD QP on port %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), port->port_num);

    /* create a UD queue pair */
    memset(&init_attr, 0, sizeof(init_attr));

    init_attr.qp_type = IBV_QPT_UD;

    qp->ib_recv_cq = ibv_create_cq (port->device->ib_context, max_cqe,
                                    port, recv_channel, 0);
    if (NULL == qp->ib_recv_cq) {
        orte_show_help("help-oob-ud.txt", "create-cq-failed", true,
                       orte_process_info.nodename, max_cqe, strerror(errno));
        return ORTE_ERROR;
    }
    if (false == onecq) {
        qp->ib_send_cq = ibv_create_cq (port->device->ib_context, max_cqe,
                                        port, send_channel, 0);
        if (NULL == qp->ib_send_cq) {
            orte_show_help("help-oob-ud.txt", "create-cq-failed", true,
                       orte_process_info.nodename, max_cqe, strerror(errno));
            return ORTE_ERROR;
        }
    } else {
        qp->ib_send_cq = qp->ib_recv_cq;
    }

    init_attr.send_cq = qp->ib_send_cq;
    init_attr.recv_cq = qp->ib_recv_cq;

    mca_oob_ud_device_t *device = (mca_oob_ud_device_t *) opal_list_get_first (&mca_oob_ud_component.ud_devices);
    opal_output_verbose(80, orte_oob_base_framework.framework_output,
                        "%s oob:ud:qp_init create queue pair for device: device->attr.max_sge = %d, device->attr.max_qp_wr = %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), device->attr.max_sge, device->attr.max_qp_wr);

    init_attr.cap.max_send_sge    = mca_oob_ud_component.ud_qp_max_send_sge;
    init_attr.cap.max_recv_sge    = mca_oob_ud_component.ud_qp_max_recv_sge; /* GRH, data */
    init_attr.cap.max_inline_data = mca_oob_ud_component.ud_qp_max_inline_data;
    init_attr.cap.max_recv_wr     = min(mca_oob_ud_component.ud_qp_max_recv_wr, device->attr.max_qp_wr);
    init_attr.cap.max_send_wr     = min(mca_oob_ud_component.ud_qp_max_send_wr, device->attr.max_qp_wr);

    qp->ib_qp = ibv_create_qp (port->device->ib_pd, &init_attr);
    if (NULL == qp->ib_qp) {
        orte_show_help("help-oob-ud.txt", "create-qp-failed", true,
                       orte_process_info.nodename, init_attr.cap.max_send_sge, init_attr.cap.max_recv_sge,
                       init_attr.cap.max_send_wr, init_attr.cap.max_recv_wr, init_attr.cap.max_inline_data,
                       strerror(errno));
        return ORTE_ERROR;
    }
    /* end: create the UD queue pair */

    qp->port = port;

    return ORTE_SUCCESS;
}

int mca_oob_ud_qp_to_reset (mca_oob_ud_qp_t *qp)
{
    struct ibv_qp_attr attr;

    /* move the QP into the ERR state */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state = IBV_QPS_ERR;

    if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, IBV_QP_STATE, strerror(errno));
        return ORTE_ERROR;
    }

    /* poll thread/event will clear failed work requests */
    MCA_OOB_UD_CLEAR_CQ(qp->ib_send_cq);
    MCA_OOB_UD_CLEAR_CQ(qp->ib_recv_cq);

    /* move the QP into the RESET state */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state = IBV_QPS_RESET;

    if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, IBV_QP_STATE, strerror(errno));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}

int mca_oob_ud_qp_to_rts (mca_oob_ud_qp_t *qp)
{
    struct mca_oob_ud_port_t *port = qp->port;
    int attr_mask;
    struct ibv_qp_attr attr;

    /* move the QP into the INIT state */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state   = IBV_QPS_INIT;
    attr.pkey_index = 0; /* NTH: might need to modify the pkey index later */
    attr.port_num   = port->port_num;
    attr.qkey       = 0;

    attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY;

    if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, attr_mask, strerror(errno));
        return ORTE_ERROR;
    }

    /* Move QP to RTR */
    attr.qp_state = IBV_QPS_RTR;

    if (0 != ibv_modify_qp(qp->ib_qp, &attr, IBV_QP_STATE)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, attr_mask, strerror(errno));
        return ORTE_ERROR;
    }

    /* Setup attributes */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn = 0;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

    if (0 != ibv_modify_qp(qp->ib_qp, &attr, attr_mask)) {
        orte_show_help("help-oob-ud.txt", "modify-qp-failed", true,
                       orte_process_info.nodename, attr_mask, strerror(errno));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}

/* purge all work requests on a qp */
int mca_oob_ud_qp_purge (mca_oob_ud_qp_t *qp)
{
    int rc;

    rc = mca_oob_ud_qp_to_reset (qp);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    return mca_oob_ud_qp_to_rts (qp);
}

static void mca_oob_ud_qp_constructor (mca_oob_ud_qp_t *qp)
{
    memset ((char *)qp + sizeof(qp->super), 0, sizeof (*qp) - sizeof (qp->super));
}

static void mca_oob_ud_qp_destructor (mca_oob_ud_qp_t *qp)
{
    int rc;

    if (NULL != qp->ib_qp) {
        /* clear qp and move to reset */
        (void) mca_oob_ud_qp_to_reset (qp);

        /* destroy qp */
        rc = ibv_destroy_qp (qp->ib_qp);
        if (0 != rc) {
            orte_show_help("help-oob-ud.txt", "destroy-qp-failed", true,
                       orte_process_info.nodename, strerror(errno));
        }
    }

    if (NULL != qp->ib_send_cq) {
        (void) ibv_destroy_cq (qp->ib_send_cq);
    }

    if (NULL != qp->ib_recv_cq && qp->ib_recv_cq != qp->ib_send_cq) {
        (void) ibv_destroy_cq (qp->ib_recv_cq);
    }
}

static inline int mca_oob_ud_qp_process_send_completions (mca_oob_ud_qp_t *qp,
                                                          int num_completions)
{
    struct ibv_wc wc[1];
    int count, rc, ret, i;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                         "%s oob:ud:qp_process_send_completions polling for %d completions",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         num_completions);

    rc = ORTE_SUCCESS;

    for (count = 0 ; count < num_completions ; ) {
        ret = ibv_poll_cq (qp->ib_send_cq, 1, wc);
        if (ret < 0) {
            orte_show_help("help-oob-ud.txt", "poll-cq-failed", true,
                       orte_process_info.nodename, 1, strerror(errno));
            return ORTE_ERROR;
        }
        for (i = 0 ; i < ret ; ++i) {
            if (IBV_WC_SUCCESS != wc[i].status) {
                orte_show_help("help-oob-ud.txt", "poll-cq-failed-wc", true,
                       orte_process_info.nodename, 1, i, wc[i].status);
                rc = ORTE_ERROR;
            }
        }
        count += ret;
    }

    return rc;
}

int mca_oob_ud_qp_post_send (mca_oob_ud_qp_t *qp, struct ibv_send_wr *wr,
                             int num_completions) {
    struct ibv_send_wr *bad_wr;
    int rc;

    rc = ibv_post_send (qp->ib_qp, wr, &bad_wr);
    if (0 != rc) {
        orte_show_help("help-oob-ud.txt", "post-send-failed", true,
                       orte_process_info.nodename, strerror(errno));
        return ORTE_ERROR;
    }
    return mca_oob_ud_qp_process_send_completions (qp, num_completions);
}

int mca_oob_ud_qp_post_recv (mca_oob_ud_qp_t *qp, struct ibv_recv_wr *wr) {

    struct ibv_recv_wr *bad_wr;
    int rc;

    rc = ibv_post_recv (qp->ib_qp, wr, &bad_wr);
    if (0 != rc) {
        orte_show_help("help-oob-ud.txt", "post-recv-failed", true,
                       orte_process_info.nodename, strerror(errno));
        return ORTE_ERROR;
    }
    return ORTE_SUCCESS;
}

int mca_oob_ud_qp_data_aquire (struct mca_oob_ud_port_t *port, mca_oob_ud_qp_t **qp_ptr) {
    int rc = ORTE_SUCCESS;
    opal_free_list_item_t *item;

    do {
        item = opal_free_list_get_st (&port->data_qps);
        if (NULL == item) {
            opal_output_verbose(5, orte_oob_base_framework.framework_output,
                                 "%s oob:ud:qp_data_aquire error allocating new data qp. error = %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc);
            rc = ORTE_ERR_TEMP_OUT_OF_RESOURCE;
            break;
        }

        *qp_ptr = (mca_oob_ud_qp_t *) item;

        if (NULL == (*qp_ptr)->ib_qp) {
            rc = mca_oob_ud_qp_init (*qp_ptr, port, NULL, NULL, true);
            if (ORTE_SUCCESS != rc) {
                break;
            }

            rc = mca_oob_ud_qp_to_rts (*qp_ptr);
        }
    } while (0);

    return rc;
}

int mca_oob_ud_qp_data_release (mca_oob_ud_qp_t *qp) {
    int rc;
    rc = mca_oob_ud_qp_purge (qp);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    opal_free_list_return_st (&qp->port->data_qps, &qp->super);

    return ORTE_SUCCESS;
}