502d164908
This enables subnet differientation for iWARP devices, and rearrange initilization so that the services are available when they are needed. This commit was SVN r18393.
2324 строки
81 KiB
C
2324 строки
81 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; -*- */
|
|
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2008 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
|
* Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
|
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "ompi_config.h"
|
|
|
|
#include <infiniband/verbs.h>
|
|
#include <errno.h>
|
|
#include <string.h> /* for strerror()*/
|
|
#include <unistd.h>
|
|
|
|
#include "ompi/constants.h"
|
|
#include "opal/event/event.h"
|
|
#include "opal/include/opal/align.h"
|
|
#include "opal/util/if.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/util/output.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "opal/sys/timer.h"
|
|
#include "opal/sys/atomic.h"
|
|
#include "opal/util/argv.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "opal/mca/carto/carto.h"
|
|
#include "opal/mca/carto/base/base.h"
|
|
#include "opal/mca/paffinity/base/base.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/util/proc_info.h"
|
|
#include "orte/runtime/orte_globals.h"
|
|
|
|
#include "ompi/proc/proc.h"
|
|
#include "ompi/mca/pml/pml.h"
|
|
#include "ompi/mca/btl/btl.h"
|
|
#include "ompi/mca/mpool/base/base.h"
|
|
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
|
#include "ompi/mca/btl/base/base.h"
|
|
#include "ompi/datatype/convertor.h"
|
|
#include "ompi/mca/mpool/mpool.h"
|
|
#include "ompi/runtime/ompi_module_exchange.h"
|
|
|
|
#include "btl_openib.h"
|
|
#include "btl_openib_frag.h"
|
|
#include "btl_openib_endpoint.h"
|
|
#include "btl_openib_eager_rdma.h"
|
|
#include "btl_openib_proc.h"
|
|
#include "btl_openib_ini.h"
|
|
#include "btl_openib_mca.h"
|
|
#include "btl_openib_xrc.h"
|
|
#include "btl_openib_fd.h"
|
|
#if OMPI_HAVE_THREADS
|
|
#include "btl_openib_async.h"
|
|
#endif
|
|
#include "connect/base.h"
|
|
|
|
/*
|
|
* Local functions
|
|
*/
|
|
static int btl_openib_component_open(void);
|
|
static int btl_openib_component_close(void);
|
|
static mca_btl_base_module_t **btl_openib_component_init(int*, bool, bool);
|
|
static int btl_openib_component_progress(void);
|
|
|
|
mca_btl_openib_component_t mca_btl_openib_component = {
|
|
{
|
|
/* First, the mca_base_component_t struct containing meta information
|
|
about the component itself */
|
|
|
|
{
|
|
/* Indicate that we are a pml v1.0.0 component (which also implies a
|
|
specific MCA version) */
|
|
|
|
MCA_BTL_BASE_VERSION_1_0_1,
|
|
|
|
"openib", /* MCA component name */
|
|
OMPI_MAJOR_VERSION, /* MCA component major version */
|
|
OMPI_MINOR_VERSION, /* MCA component minor version */
|
|
OMPI_RELEASE_VERSION, /* MCA component release version */
|
|
btl_openib_component_open, /* component open */
|
|
btl_openib_component_close /* component close */
|
|
},
|
|
|
|
/* Next the MCA v1.0.0 component meta data */
|
|
|
|
{
|
|
/* The component is not checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_NONE
|
|
},
|
|
|
|
btl_openib_component_init,
|
|
btl_openib_component_progress,
|
|
}
|
|
};
|
|
|
|
/*
|
|
* Called by MCA framework to open the component, registers
|
|
* component parameters.
|
|
*/
|
|
int btl_openib_component_open(void)
|
|
{
|
|
int ret;
|
|
|
|
/* initialize state */
|
|
mca_btl_openib_component.ib_num_btls = 0;
|
|
mca_btl_openib_component.openib_btls = NULL;
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.hcas, opal_pointer_array_t);
|
|
mca_btl_openib_component.hcas_count = 0;
|
|
|
|
/* initialize objects */
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_procs, opal_list_t);
|
|
|
|
/* register IB component parameters */
|
|
ret = btl_openib_register_mca_params();
|
|
|
|
mca_btl_openib_component.max_send_size =
|
|
mca_btl_openib_module.super.btl_max_send_size;
|
|
mca_btl_openib_component.eager_limit =
|
|
mca_btl_openib_module.super.btl_eager_limit;
|
|
|
|
srand48(getpid() * time(NULL));
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* component cleanup - sanity checking of queue lengths
|
|
*/
|
|
|
|
static int btl_openib_component_close(void)
|
|
{
|
|
ompi_btl_openib_connect_base_finalize();
|
|
ompi_btl_openib_fd_finalize();
|
|
ompi_btl_openib_ini_finalize();
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static void inline pack8(char **dest, uint8_t value)
|
|
{
|
|
/* Copy one character */
|
|
**dest = (char) value;
|
|
/* Most the dest ahead one */
|
|
++*dest;
|
|
}
|
|
|
|
/*
|
|
* Register local openib port information with the modex so that it
|
|
* can be shared with all other peers.
|
|
*/
|
|
static int btl_openib_modex_send(void)
|
|
{
|
|
int rc, i, j;
|
|
int modex_message_size;
|
|
mca_btl_openib_modex_message_t dummy;
|
|
char *message, *offset;
|
|
size_t size, msg_size;
|
|
ompi_btl_openib_connect_base_module_t *cpc;
|
|
|
|
opal_output(-1, "Starting to modex send");
|
|
if (0 == mca_btl_openib_component.ib_num_btls) {
|
|
return 0;
|
|
}
|
|
modex_message_size = ((char *) &(dummy.end)) - ((char*) &dummy);
|
|
|
|
/* The message is packed into multiple parts:
|
|
* 1. a uint8_t indicating the number of modules (ports) in the message
|
|
* 2. for each module:
|
|
* a. the common module data
|
|
* b. a uint8_t indicating how many CPCs follow
|
|
* c. for each CPC:
|
|
* a. a uint8_t indicating the index of the CPC in the all[]
|
|
* array in btl_openib_connect_base.c
|
|
* b. a uint8_t indicating the priority of this CPC
|
|
* c. a uint8_t indicating the length of the blob to follow
|
|
* d. a blob that is only meaningful to that CPC
|
|
*/
|
|
msg_size =
|
|
/* uint8_t for number of modules in the message */
|
|
1 +
|
|
/* For each module: */
|
|
mca_btl_openib_component.ib_num_btls *
|
|
(
|
|
/* Common module data */
|
|
modex_message_size +
|
|
/* uint8_t for how many CPCs follow */
|
|
1
|
|
);
|
|
/* For each module, add in the size of the per-CPC data */
|
|
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
|
for (j = 0;
|
|
j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
|
|
++j) {
|
|
msg_size +=
|
|
/* uint8_t for the index of the CPC */
|
|
1 +
|
|
/* uint8_t for the CPC's priority */
|
|
1 +
|
|
/* uint8_t for the blob length */
|
|
1 +
|
|
/* blob length */
|
|
mca_btl_openib_component.openib_btls[i]->cpcs[j]->data.cbm_modex_message_len;
|
|
}
|
|
}
|
|
message = malloc(msg_size);
|
|
if (NULL == message) {
|
|
BTL_ERROR(("Failed malloc"));
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
/* Pack the number of modules */
|
|
offset = message;
|
|
pack8(&offset, mca_btl_openib_component.ib_num_btls);
|
|
opal_output(-1, "modex sending %d btls (packed: %d, offset now at %d)", mca_btl_openib_component.ib_num_btls, *((uint8_t*) message), (int) (offset - message));
|
|
|
|
/* Pack each of the modules */
|
|
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
|
|
|
/* Pack the modex common message struct. */
|
|
size = modex_message_size;
|
|
memcpy(offset,
|
|
&(mca_btl_openib_component.openib_btls[i]->port_info),
|
|
size);
|
|
opal_output(-1, "modex packed btl port modex message: %lx, %d, %d (size: %d)",
|
|
mca_btl_openib_component.openib_btls[i]->port_info.subnet_id,
|
|
mca_btl_openib_component.openib_btls[i]->port_info.mtu,
|
|
mca_btl_openib_component.openib_btls[i]->port_info.lid,
|
|
(int) size);
|
|
|
|
#if !defined(WORDS_BIGENDIAN) && OMPI_ENABLE_HETEROGENEOUS_SUPPORT
|
|
MCA_BTL_OPENIB_MODEX_MSG_HTON(*(mca_btl_openib_modex_message_t *)offset);
|
|
#endif
|
|
offset += size;
|
|
opal_output(-1, "modex packed btl %d: modex message, offset now %d",
|
|
i, (int) (offset -message));
|
|
|
|
/* Pack the number of CPCs that follow */
|
|
pack8(&offset,
|
|
mca_btl_openib_component.openib_btls[i]->num_cpcs);
|
|
opal_output(-1, "modex packed btl %d: to pack %d cpcs (packed: %d, offset now %d)",
|
|
i, mca_btl_openib_component.openib_btls[i]->num_cpcs,
|
|
*((uint8_t*) (offset - 1)), (int) (offset-message));
|
|
|
|
/* Pack each CPC */
|
|
for (j = 0;
|
|
j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
|
|
++j) {
|
|
uint8_t u8;
|
|
|
|
cpc = mca_btl_openib_component.openib_btls[i]->cpcs[j];
|
|
opal_output(-1, "modex packed btl %d: packing cpc %s",
|
|
i, cpc->data.cbm_component->cbc_name);
|
|
/* Pack the CPC index */
|
|
u8 = ompi_btl_openib_connect_base_get_cpc_index(cpc->data.cbm_component);
|
|
pack8(&offset, u8);
|
|
opal_output(-1, "packing btl %d: cpc %d: index %d (packed %d, offset now %d)",
|
|
i, j, u8, *((uint8_t*) (offset-1)), (int)(offset-message));
|
|
/* Pack the CPC priority */
|
|
pack8(&offset, cpc->data.cbm_priority);
|
|
opal_output(-1, "packing btl %d: cpc %d: priority %d (packed %d, offset now %d)",
|
|
i, j, cpc->data.cbm_priority, *((uint8_t*) (offset-1)), (int)(offset-message));
|
|
/* Pack the blob length */
|
|
u8 = cpc->data.cbm_modex_message_len;
|
|
pack8(&offset, u8);
|
|
opal_output(-1, "packing btl %d: cpc %d: message len %d (packed %d, offset now %d)",
|
|
i, j, u8, *((uint8_t*) (offset-1)), (int)(offset-message));
|
|
/* If the blob length is > 0, pack the blob */
|
|
if (u8 > 0) {
|
|
memcpy(offset, cpc->data.cbm_modex_message, u8);
|
|
offset += u8;
|
|
opal_output(-1, "packing btl %d: cpc %d: blob packed %d %x (offset now %d)",
|
|
i, j,
|
|
((uint32_t*)cpc->data.cbm_modex_message)[0],
|
|
((uint32_t*)cpc->data.cbm_modex_message)[1],
|
|
(int)(offset-message));
|
|
}
|
|
|
|
/* Sanity check */
|
|
assert((size_t) (offset - message) <= msg_size);
|
|
}
|
|
}
|
|
|
|
/* All done -- send it! */
|
|
rc = ompi_modex_send(&mca_btl_openib_component.super.btl_version,
|
|
message, msg_size);
|
|
free(message);
|
|
opal_output(-1, "Modex sent! %d calculated, %d actual\n", (int) msg_size, (int) (offset - message));
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
* Active Message Callback function on control message.
|
|
*/
|
|
|
|
static void btl_openib_control(mca_btl_base_module_t* btl,
|
|
mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des,
|
|
void* cbdata)
|
|
{
|
|
/* don't return credits used for control messages */
|
|
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
|
|
mca_btl_openib_endpoint_t* ep = to_com_frag(des)->endpoint;
|
|
mca_btl_openib_control_header_t *ctl_hdr =
|
|
to_base_frag(des)->segment.seg_addr.pval;
|
|
mca_btl_openib_eager_rdma_header_t *rdma_hdr;
|
|
mca_btl_openib_header_coalesced_t *clsc_hdr =
|
|
(mca_btl_openib_header_coalesced_t*)(ctl_hdr + 1);
|
|
mca_btl_active_message_callback_t* reg;
|
|
size_t len = des->des_dst->seg_len - sizeof(*ctl_hdr);
|
|
|
|
switch (ctl_hdr->type) {
|
|
case MCA_BTL_OPENIB_CONTROL_CREDITS:
|
|
assert(0); /* Credit message is handled elsewhere */
|
|
break;
|
|
case MCA_BTL_OPENIB_CONTROL_RDMA:
|
|
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)ctl_hdr;
|
|
|
|
BTL_VERBOSE(("prior to NTOH received rkey %lu, rdma_start.lval %llu, pval %p, ival %u",
|
|
rdma_hdr->rkey,
|
|
(unsigned long) rdma_hdr->rdma_start.lval,
|
|
rdma_hdr->rdma_start.pval,
|
|
rdma_hdr->rdma_start.ival
|
|
));
|
|
|
|
if(ep->nbo) {
|
|
BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(*rdma_hdr);
|
|
}
|
|
|
|
BTL_VERBOSE(("received rkey %lu, rdma_start.lval %llu, pval %p,"
|
|
" ival %u", rdma_hdr->rkey,
|
|
(unsigned long) rdma_hdr->rdma_start.lval,
|
|
rdma_hdr->rdma_start.pval, rdma_hdr->rdma_start.ival));
|
|
|
|
if (ep->eager_rdma_remote.base.pval) {
|
|
BTL_ERROR(("Got RDMA connect twice!"));
|
|
return;
|
|
}
|
|
ep->eager_rdma_remote.rkey = rdma_hdr->rkey;
|
|
ep->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval;
|
|
ep->eager_rdma_remote.tokens=mca_btl_openib_component.eager_rdma_num - 1;
|
|
break;
|
|
case MCA_BTL_OPENIB_CONTROL_COALESCED:
|
|
while(len > 0) {
|
|
size_t skip;
|
|
mca_btl_base_descriptor_t tmp_des;
|
|
mca_btl_base_segment_t tmp_seg;
|
|
|
|
assert(len >= sizeof(*clsc_hdr));
|
|
|
|
if(ep->nbo)
|
|
BTL_OPENIB_HEADER_COALESCED_NTOH(*clsc_hdr);
|
|
|
|
skip = (sizeof(*clsc_hdr) + clsc_hdr->alloc_size);
|
|
|
|
tmp_des.des_dst = &tmp_seg;
|
|
tmp_des.des_dst_cnt = 1;
|
|
tmp_seg.seg_addr.pval = clsc_hdr + 1;
|
|
tmp_seg.seg_len = clsc_hdr->size;
|
|
|
|
/* call registered callback */
|
|
reg = mca_btl_base_active_message_trigger + clsc_hdr->tag;
|
|
reg->cbfunc( &obtl->super, clsc_hdr->tag, &tmp_des, reg->cbdata );
|
|
len -= skip;
|
|
clsc_hdr = (mca_btl_openib_header_coalesced_t*)
|
|
(((unsigned char*)clsc_hdr) + skip);
|
|
}
|
|
break;
|
|
default:
|
|
BTL_ERROR(("Unknown message type received by BTL"));
|
|
break;
|
|
}
|
|
}
|
|
|
|
static int openib_reg_mr(void *reg_data, void *base, size_t size,
|
|
mca_mpool_base_registration_t *reg)
|
|
{
|
|
mca_btl_openib_hca_t *hca = (mca_btl_openib_hca_t*)reg_data;
|
|
mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg;
|
|
|
|
openib_reg->mr = ibv_reg_mr(hca->ib_pd, base, size, IBV_ACCESS_LOCAL_WRITE |
|
|
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
|
|
|
|
if(NULL == openib_reg->mr)
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
|
|
{
|
|
mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg;
|
|
|
|
if(openib_reg->mr != NULL) {
|
|
if(ibv_dereg_mr(openib_reg->mr)) {
|
|
BTL_ERROR(("%s: error unpinning openib memory errno says %s",
|
|
__func__, strerror(errno)));
|
|
return OMPI_ERROR;
|
|
}
|
|
}
|
|
openib_reg->mr = NULL;
|
|
return OMPI_SUCCESS;
|
|
}
|
|
static inline int param_register_int(const char* param_name, int default_value)
|
|
{
|
|
int param_value = default_value;
|
|
int id = mca_base_param_register_int("btl", "openib", param_name, NULL,
|
|
default_value);
|
|
mca_base_param_lookup_int(id, ¶m_value);
|
|
return param_value;
|
|
}
|
|
|
|
#if OMPI_HAVE_THREADS
|
|
static int start_async_event_thread(void)
|
|
{
|
|
/* Set the fatal counter to zero */
|
|
mca_btl_openib_component.fatal_counter = 0;
|
|
|
|
/* Create pipe for communication with async event thread */
|
|
if(pipe(mca_btl_openib_component.async_pipe)) {
|
|
BTL_ERROR(("Failed to create pipe for communication with "
|
|
"async event thread"));
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
/* Starting async event thread for the component */
|
|
if(pthread_create(&mca_btl_openib_component.async_thread, NULL,
|
|
(void*(*)(void*))btl_openib_async_thread, NULL)) {
|
|
BTL_ERROR(("Failed to create async event thread"));
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
#endif
|
|
|
|
#if OMPI_HAVE_RDMACM
|
|
extern uint64_t get_iwarp_subnet_id(struct ibv_device *ib_dev);
|
|
#else
|
|
static inline uint64_t get_iwarp_subnet_id(struct ibv_device *ib_dev) {return 0;}
|
|
#endif
|
|
|
|
static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
|
|
uint8_t port_num, uint16_t pkey_index,
|
|
struct ibv_port_attr *ib_port_attr)
|
|
{
|
|
uint16_t lid, i, lmc, lmc_step;
|
|
mca_btl_openib_module_t *openib_btl;
|
|
mca_btl_base_selected_module_t *ib_selected;
|
|
union ibv_gid gid;
|
|
uint64_t subnet_id;
|
|
|
|
ibv_query_gid(hca->ib_dev_context, port_num, 0, &gid);
|
|
|
|
/* If we have struct ibv_device.transport_type, then we're >= OFED
|
|
v1.2, and it could be iWarp of IB. If we don't have that
|
|
member, then we're < OFED v1.2, and it can only be IB. */
|
|
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
|
|
if (IBV_TRANSPORT_IWARP == hca->ib_dev->transport_type) {
|
|
subnet_id = get_iwarp_subnet_id(hca->ib_dev);
|
|
} else {
|
|
subnet_id = ntoh64(gid.global.subnet_prefix);
|
|
}
|
|
#else
|
|
subnet_id = ntoh64(gid.global.subnet_prefix);
|
|
#endif
|
|
|
|
BTL_VERBOSE(("my subnet_id is %016x", subnet_id));
|
|
|
|
if(mca_btl_openib_component.ib_num_btls > 0 &&
|
|
IB_DEFAULT_GID_PREFIX == subnet_id &&
|
|
mca_btl_openib_component.warn_default_gid_prefix) {
|
|
opal_show_help("help-mpi-btl-openib.txt", "default subnet prefix",
|
|
true, orte_process_info.nodename);
|
|
}
|
|
|
|
lmc = (1 << ib_port_attr->lmc);
|
|
lmc_step = 1;
|
|
|
|
if (0 != mca_btl_openib_component.max_lmc &&
|
|
mca_btl_openib_component.max_lmc < lmc) {
|
|
lmc = mca_btl_openib_component.max_lmc;
|
|
}
|
|
|
|
#if OMPI_HAVE_THREADS
|
|
/* APM support -- only meaningful if async event support is
|
|
enabled. If async events are not enabled, then there's nothing
|
|
to listen for the APM event to load the new path, so it's not
|
|
worth enabling APM. */
|
|
if (lmc > 1){
|
|
if (-1 == mca_btl_openib_component.apm_lmc) {
|
|
lmc_step = lmc;
|
|
mca_btl_openib_component.apm_lmc = lmc - 1;
|
|
} else if (0 == lmc % (mca_btl_openib_component.apm_lmc + 1)) {
|
|
lmc_step = mca_btl_openib_component.apm_lmc + 1;
|
|
} else {
|
|
opal_show_help("help-mpi-btl-openib.txt", "apm with wrong lmc",true,
|
|
mca_btl_openib_component.apm_lmc, lmc);
|
|
return OMPI_ERROR;
|
|
}
|
|
} else {
|
|
if (mca_btl_openib_component.apm_lmc) {
|
|
/* Disable apm and report warning */
|
|
mca_btl_openib_component.apm_lmc = 0;
|
|
opal_show_help("help-mpi-btl-openib.txt", "apm without lmc",true);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
for(lid = ib_port_attr->lid;
|
|
lid < ib_port_attr->lid + lmc; lid += lmc_step){
|
|
for(i = 0; i < mca_btl_openib_component.btls_per_lid; i++){
|
|
char param[40];
|
|
int rc;
|
|
|
|
openib_btl = malloc(sizeof(mca_btl_openib_module_t));
|
|
if(NULL == openib_btl) {
|
|
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
memcpy(openib_btl, &mca_btl_openib_module,
|
|
sizeof(mca_btl_openib_module));
|
|
memcpy(&openib_btl->ib_port_attr, ib_port_attr,
|
|
sizeof(struct ibv_port_attr));
|
|
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
|
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
|
openib_btl->hca = hca;
|
|
openib_btl->port_num = (uint8_t) port_num;
|
|
openib_btl->pkey_index = pkey_index;
|
|
openib_btl->lid = lid;
|
|
openib_btl->apm_port = 0;
|
|
openib_btl->src_path_bits = lid - ib_port_attr->lid;
|
|
|
|
openib_btl->port_info.subnet_id = subnet_id;
|
|
openib_btl->port_info.mtu = hca->mtu;
|
|
openib_btl->port_info.lid = lid;
|
|
|
|
openib_btl->cpcs = NULL;
|
|
openib_btl->num_cpcs = 0;
|
|
|
|
/* Do we have at least one CPC that can handle this
|
|
port? */
|
|
rc =
|
|
ompi_btl_openib_connect_base_select_for_local_port(openib_btl);
|
|
if (OMPI_ERR_NOT_SUPPORTED == rc) {
|
|
continue;
|
|
} else if (OMPI_SUCCESS != rc) {
|
|
return rc;
|
|
}
|
|
|
|
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control;
|
|
mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL;
|
|
|
|
/* Check bandwidth configured for this HCA */
|
|
sprintf(param, "bandwidth_%s", ibv_get_device_name(hca->ib_dev));
|
|
openib_btl->super.btl_bandwidth =
|
|
param_register_int(param, openib_btl->super.btl_bandwidth);
|
|
|
|
/* Check bandwidth configured for this HCA/port */
|
|
sprintf(param, "bandwidth_%s:%d", ibv_get_device_name(hca->ib_dev),
|
|
port_num);
|
|
openib_btl->super.btl_bandwidth =
|
|
param_register_int(param, openib_btl->super.btl_bandwidth);
|
|
|
|
/* Check bandwidth configured for this HCA/port/LID */
|
|
sprintf(param, "bandwidth_%s:%d:%d",
|
|
ibv_get_device_name(hca->ib_dev), port_num, lid);
|
|
openib_btl->super.btl_bandwidth =
|
|
param_register_int(param, openib_btl->super.btl_bandwidth);
|
|
|
|
/* Check latency configured for this HCA */
|
|
sprintf(param, "latency_%s", ibv_get_device_name(hca->ib_dev));
|
|
openib_btl->super.btl_latency =
|
|
param_register_int(param, openib_btl->super.btl_latency);
|
|
|
|
/* Check latency configured for this HCA/port */
|
|
sprintf(param, "latency_%s:%d", ibv_get_device_name(hca->ib_dev),
|
|
port_num);
|
|
openib_btl->super.btl_latency =
|
|
param_register_int(param, openib_btl->super.btl_latency);
|
|
|
|
/* Check latency configured for this HCA/port/LID */
|
|
sprintf(param, "latency_%s:%d:%d", ibv_get_device_name(hca->ib_dev),
|
|
port_num, lid);
|
|
openib_btl->super.btl_latency =
|
|
param_register_int(param, openib_btl->super.btl_latency);
|
|
|
|
/* Auto-detect the port bandwidth */
|
|
if (0 == openib_btl->super.btl_bandwidth) {
|
|
/* To calculate the bandwidth available on this port,
|
|
we have to look up the values corresponding to
|
|
port->active_speed and port->active_width. These
|
|
are enums corresponding to the IB spec. Overall
|
|
forumula is 80% of the reported speed (to get the
|
|
true link speed) times the number of links. */
|
|
switch (ib_port_attr->active_speed) {
|
|
case 1:
|
|
/* 2.5Gbps * 0.8, in megabits */
|
|
openib_btl->super.btl_bandwidth = 2000;
|
|
break;
|
|
case 2:
|
|
/* 5.0Gbps * 0.8, in megabits */
|
|
openib_btl->super.btl_bandwidth = 4000;
|
|
break;
|
|
case 4:
|
|
/* 10.0Gbps * 0.8, in megabits */
|
|
openib_btl->super.btl_bandwidth = 8000;
|
|
break;
|
|
default:
|
|
/* Who knows? Declare this port unreachable (do
|
|
*not* return ERR_VALUE_OF_OUT_OF_BOUNDS; that
|
|
is reserved for when we exceed the number of
|
|
allowable BTLs). */
|
|
return OMPI_ERR_UNREACH;
|
|
}
|
|
switch (ib_port_attr->active_width) {
|
|
case 1:
|
|
/* 1x */
|
|
/* unity */
|
|
break;
|
|
case 2:
|
|
/* 4x */
|
|
openib_btl->super.btl_bandwidth *= 4;
|
|
break;
|
|
case 4:
|
|
/* 8x */
|
|
openib_btl->super.btl_bandwidth *= 8;
|
|
break;
|
|
case 8:
|
|
/* 12x */
|
|
openib_btl->super.btl_bandwidth *= 12;
|
|
break;
|
|
default:
|
|
/* Who knows? Declare this port unreachable (do
|
|
*not* return ERR_VALUE_OF_OUT_OF_BOUNDS; that
|
|
is reserved for when we exceed the number of
|
|
allowable BTLs). */
|
|
return OMPI_ERR_UNREACH;
|
|
}
|
|
}
|
|
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
|
opal_pointer_array_add(hca->hca_btls, (void*) openib_btl);
|
|
hca->btls++;
|
|
++mca_btl_openib_component.ib_num_btls;
|
|
if (-1 != mca_btl_openib_component.ib_max_btls &&
|
|
mca_btl_openib_component.ib_num_btls >=
|
|
mca_btl_openib_component.ib_max_btls) {
|
|
return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
|
|
}
|
|
}
|
|
}
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static void hca_construct(mca_btl_openib_hca_t *hca)
|
|
{
|
|
int i;
|
|
|
|
hca->ib_dev = NULL;
|
|
hca->ib_dev_context = NULL;
|
|
hca->ib_pd = NULL;
|
|
hca->mpool = NULL;
|
|
#if OMPI_ENABLE_PROGRESS_THREADS
|
|
hca->ib_channel = NULL;
|
|
#endif
|
|
hca->btls = 0;
|
|
hca->ib_cq[BTL_OPENIB_HP_CQ] = NULL;
|
|
hca->ib_cq[BTL_OPENIB_LP_CQ] = NULL;
|
|
hca->cq_size[BTL_OPENIB_HP_CQ] = 0;
|
|
hca->cq_size[BTL_OPENIB_LP_CQ] = 0;
|
|
hca->non_eager_rdma_endpoints = 0;
|
|
hca->hp_cq_polls = mca_btl_openib_component.cq_poll_ratio;
|
|
hca->eager_rdma_polls = mca_btl_openib_component.eager_rdma_poll_ratio;
|
|
hca->pollme = true;
|
|
hca->eager_rdma_buffers_count = 0;
|
|
hca->eager_rdma_buffers = NULL;
|
|
#if HAVE_XRC
|
|
hca->xrc_fd = -1;
|
|
#endif
|
|
hca->qps = (mca_btl_openib_hca_qp_t*)calloc(mca_btl_openib_component.num_qps,
|
|
sizeof(mca_btl_openib_hca_qp_t));
|
|
OBJ_CONSTRUCT(&hca->hca_lock, opal_mutex_t);
|
|
for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
|
|
OBJ_CONSTRUCT(&hca->qps[i].send_free, ompi_free_list_t);
|
|
OBJ_CONSTRUCT(&hca->qps[i].recv_free, ompi_free_list_t);
|
|
}
|
|
OBJ_CONSTRUCT(&hca->send_free_control, ompi_free_list_t);
|
|
}
|
|
|
|
static void hca_destruct(mca_btl_openib_hca_t *hca)
|
|
{
|
|
int i;
|
|
|
|
if(hca->eager_rdma_buffers) {
|
|
int i;
|
|
for(i = 0; i < hca->eager_rdma_buffers_count; i++)
|
|
if(hca->eager_rdma_buffers[i])
|
|
OBJ_RELEASE(hca->eager_rdma_buffers[i]);
|
|
free(hca->eager_rdma_buffers);
|
|
}
|
|
OBJ_DESTRUCT(&hca->hca_lock);
|
|
for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
|
|
OBJ_DESTRUCT(&hca->qps[i].send_free);
|
|
OBJ_DESTRUCT(&hca->qps[i].recv_free);
|
|
}
|
|
OBJ_DESTRUCT(&hca->send_free_control);
|
|
if(hca->qps)
|
|
free(hca->qps);
|
|
}
|
|
|
|
OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct,
|
|
hca_destruct);
|
|
|
|
static int prepare_hca_for_use(mca_btl_openib_hca_t *hca)
|
|
{
|
|
mca_btl_openib_frag_init_data_t *init_data;
|
|
int qp, length;
|
|
|
|
#if OMPI_HAVE_THREADS
|
|
if(mca_btl_openib_component.use_async_event_thread) {
|
|
if(0 == mca_btl_openib_component.async_thread) {
|
|
/* async thread is not yet started, so start it here */
|
|
if(start_async_event_thread() != OMPI_SUCCESS)
|
|
return OMPI_ERROR;
|
|
}
|
|
hca->got_fatal_event = false;
|
|
if (write(mca_btl_openib_component.async_pipe[1],
|
|
&hca->ib_dev_context->async_fd, sizeof(int))<0){
|
|
BTL_ERROR(("Failed to write to pipe [%d]",errno));
|
|
return OMPI_ERROR;
|
|
}
|
|
}
|
|
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
|
/* Prepare data for thread, but not starting it */
|
|
OBJ_CONSTRUCT(&hca->thread, opal_thread_t);
|
|
hca->thread.t_run = mca_btl_openib_progress_thread;
|
|
hca->thread.t_arg = hca;
|
|
hca->progress = false;
|
|
#endif
|
|
#endif
|
|
|
|
hca->endpoints = OBJ_NEW(opal_pointer_array_t);
|
|
opal_pointer_array_init(hca->endpoints, 10, INT_MAX, 10);
|
|
opal_pointer_array_add(&mca_btl_openib_component.hcas, hca);
|
|
if(mca_btl_openib_component.max_eager_rdma > 0 &&
|
|
mca_btl_openib_component.use_eager_rdma &&
|
|
hca->use_eager_rdma) {
|
|
hca->eager_rdma_buffers =
|
|
calloc(mca_btl_openib_component.max_eager_rdma * hca->btls,
|
|
sizeof(mca_btl_openib_endpoint_t*));
|
|
if(NULL == hca->eager_rdma_buffers) {
|
|
BTL_ERROR(("Memory allocation fails"));
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
}
|
|
|
|
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
|
|
length = sizeof(mca_btl_openib_header_t) +
|
|
sizeof(mca_btl_openib_footer_t) +
|
|
sizeof(mca_btl_openib_eager_rdma_header_t);
|
|
|
|
init_data->order = MCA_BTL_NO_ORDER;
|
|
init_data->list = &hca->send_free_control;
|
|
|
|
if(OMPI_SUCCESS != ompi_free_list_init_ex_new(
|
|
&hca->send_free_control,
|
|
sizeof(mca_btl_openib_send_control_frag_t), CACHE_LINE_SIZE,
|
|
OBJ_CLASS(mca_btl_openib_send_control_frag_t), length,
|
|
mca_btl_openib_component.buffer_alignment,
|
|
mca_btl_openib_component.ib_free_list_num, -1,
|
|
mca_btl_openib_component.ib_free_list_inc,
|
|
hca->mpool, mca_btl_openib_frag_init,
|
|
init_data)) {
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
/* setup all the qps */
|
|
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
|
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
|
|
/* Initialize pool of send fragments */
|
|
length = sizeof(mca_btl_openib_header_t) +
|
|
sizeof(mca_btl_openib_header_coalesced_t) +
|
|
sizeof(mca_btl_openib_control_header_t) +
|
|
sizeof(mca_btl_openib_footer_t) +
|
|
mca_btl_openib_component.qp_infos[qp].size;
|
|
|
|
init_data->order = qp;
|
|
init_data->list = &hca->qps[qp].send_free;
|
|
|
|
if(OMPI_SUCCESS != ompi_free_list_init_ex_new(init_data->list,
|
|
sizeof(mca_btl_openib_send_frag_t), CACHE_LINE_SIZE,
|
|
OBJ_CLASS(mca_btl_openib_send_frag_t), length,
|
|
mca_btl_openib_component.buffer_alignment,
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
mca_btl_openib_component.ib_free_list_inc,
|
|
hca->mpool, mca_btl_openib_frag_init,
|
|
init_data)) {
|
|
return OMPI_ERROR;
|
|
}
|
|
|
|
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
|
|
length = sizeof(mca_btl_openib_header_t) +
|
|
sizeof(mca_btl_openib_header_coalesced_t) +
|
|
sizeof(mca_btl_openib_control_header_t) +
|
|
sizeof(mca_btl_openib_footer_t) +
|
|
mca_btl_openib_component.qp_infos[qp].size;
|
|
|
|
init_data->order = qp;
|
|
init_data->list = &hca->qps[qp].recv_free;
|
|
|
|
if(OMPI_SUCCESS != ompi_free_list_init_ex_new(init_data->list,
|
|
sizeof(mca_btl_openib_recv_frag_t), CACHE_LINE_SIZE,
|
|
OBJ_CLASS(mca_btl_openib_recv_frag_t),
|
|
length, mca_btl_openib_component.buffer_alignment,
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
mca_btl_openib_component.ib_free_list_inc,
|
|
hca->mpool, mca_btl_openib_frag_init,
|
|
init_data)) {
|
|
return OMPI_ERROR;
|
|
}
|
|
}
|
|
|
|
mca_btl_openib_component.hcas_count++;
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static int
|
|
get_port_list(mca_btl_openib_hca_t *hca, int *allowed_ports)
|
|
{
|
|
int i, j, k, num_ports = 0;
|
|
const char *dev_name;
|
|
char *name;
|
|
|
|
dev_name = ibv_get_device_name(hca->ib_dev);
|
|
name = (char*) malloc(strlen(dev_name) + 4);
|
|
if (NULL == name) {
|
|
return 0;
|
|
}
|
|
|
|
/* Assume that all ports are allowed. num_ports will be adjusted
|
|
below to reflect whether this is true or not. */
|
|
for (i = 1; i <= hca->ib_dev_attr.phys_port_cnt; ++i) {
|
|
allowed_ports[num_ports++] = i;
|
|
}
|
|
num_ports = 0;
|
|
if (NULL != mca_btl_openib_component.if_include_list) {
|
|
/* If only the HCA name is given (eg. mthca0,mthca1) use all
|
|
ports */
|
|
i = 0;
|
|
while (mca_btl_openib_component.if_include_list[i]) {
|
|
if (0 == strcmp(dev_name,
|
|
mca_btl_openib_component.if_include_list[i])) {
|
|
num_ports = hca->ib_dev_attr.phys_port_cnt;
|
|
goto done;
|
|
}
|
|
++i;
|
|
}
|
|
/* Include only requested ports on the HCA */
|
|
for (i = 1; i <= hca->ib_dev_attr.phys_port_cnt; ++i) {
|
|
sprintf(name,"%s:%d",dev_name,i);
|
|
for (j = 0;
|
|
NULL != mca_btl_openib_component.if_include_list[j]; ++j) {
|
|
if (0 == strcmp(name,
|
|
mca_btl_openib_component.if_include_list[j])) {
|
|
allowed_ports[num_ports++] = i;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else if (NULL != mca_btl_openib_component.if_exclude_list) {
|
|
/* If only the HCA name is given (eg. mthca0,mthca1) exclude
|
|
all ports */
|
|
i = 0;
|
|
while (mca_btl_openib_component.if_exclude_list[i]) {
|
|
if (0 == strcmp(dev_name,
|
|
mca_btl_openib_component.if_exclude_list[i])) {
|
|
num_ports = 0;
|
|
goto done;
|
|
}
|
|
++i;
|
|
}
|
|
/* Exclude the specified ports on this HCA */
|
|
for (i = 1; i <= hca->ib_dev_attr.phys_port_cnt; ++i) {
|
|
sprintf(name,"%s:%d",dev_name,i);
|
|
for (j = 0;
|
|
NULL != mca_btl_openib_component.if_exclude_list[j]; ++j) {
|
|
if (0 == strcmp(name,
|
|
mca_btl_openib_component.if_exclude_list[j])) {
|
|
/* If found, set a sentinel value */
|
|
j = -1;
|
|
break;
|
|
}
|
|
}
|
|
/* If we didn't find it, it's ok to include in the list */
|
|
if (-1 != j) {
|
|
allowed_ports[num_ports++] = i;
|
|
}
|
|
}
|
|
} else {
|
|
num_ports = hca->ib_dev_attr.phys_port_cnt;
|
|
}
|
|
|
|
done:
|
|
|
|
/* Remove the following from the error-checking if_list:
|
|
- bare device name
|
|
- device name suffixed with port number */
|
|
if (NULL != mca_btl_openib_component.if_list) {
|
|
for (i = 0; NULL != mca_btl_openib_component.if_list[i]; ++i) {
|
|
|
|
/* Look for raw device name */
|
|
if (0 == strcmp(mca_btl_openib_component.if_list[i], dev_name)) {
|
|
j = opal_argv_count(mca_btl_openib_component.if_list);
|
|
opal_argv_delete(&j, &(mca_btl_openib_component.if_list),
|
|
i, 1);
|
|
--i;
|
|
}
|
|
}
|
|
for (i = 1; i <= hca->ib_dev_attr.phys_port_cnt; ++i) {
|
|
sprintf(name, "%s:%d", dev_name, i);
|
|
for (j = 0; NULL != mca_btl_openib_component.if_list[j]; ++j) {
|
|
if (0 == strcmp(mca_btl_openib_component.if_list[j], name)) {
|
|
k = opal_argv_count(mca_btl_openib_component.if_list);
|
|
opal_argv_delete(&k, &(mca_btl_openib_component.if_list),
|
|
j, 1);
|
|
--j;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
free(name);
|
|
|
|
return num_ports;
|
|
}
|
|
|
|
static void merge_values(ompi_btl_openib_ini_values_t *target,
|
|
ompi_btl_openib_ini_values_t *src)
|
|
{
|
|
if (!target->mtu_set && src->mtu_set) {
|
|
target->mtu = src->mtu;
|
|
target->mtu_set = true;
|
|
}
|
|
|
|
if (!target->use_eager_rdma_set && src->use_eager_rdma_set) {
|
|
target->use_eager_rdma = src->use_eager_rdma;
|
|
target->use_eager_rdma_set = true;
|
|
}
|
|
}
|
|
|
|
static bool inline is_credit_message(const mca_btl_openib_recv_frag_t *frag)
|
|
{
|
|
mca_btl_openib_control_header_t* chdr =
|
|
to_base_frag(frag)->segment.seg_addr.pval;
|
|
return (MCA_BTL_TAG_BTL == frag->hdr->tag) &&
|
|
(MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type);
|
|
}
|
|
|
|
static void init_apm_port(mca_btl_openib_hca_t *hca, int port, uint16_t lid)
|
|
{
|
|
int index;
|
|
struct mca_btl_openib_module_t *btl;
|
|
for(index = 0; index < hca->btls; index++) {
|
|
btl = opal_pointer_array_get_item(hca->hca_btls, index);
|
|
/* Ok, we already have btl for the fist port,
|
|
* second one will be used for APM */
|
|
btl->apm_port = port;
|
|
btl->port_info.apm_lid = lid + btl->src_path_bits;
|
|
mca_btl_openib_component.apm_ports++;
|
|
BTL_VERBOSE(("APM-PORT: Setting alternative port - %d, lid - %d"
|
|
,port ,lid));
|
|
}
|
|
}
|
|
|
|
static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
|
{
|
|
struct mca_mpool_base_resources_t mpool_resources;
|
|
mca_btl_openib_hca_t *hca;
|
|
uint8_t i, k = 0;
|
|
int ret = -1, port_cnt;
|
|
ompi_btl_openib_ini_values_t values, default_values;
|
|
int *allowed_ports = NULL;
|
|
|
|
hca = OBJ_NEW(mca_btl_openib_hca_t);
|
|
if(NULL == hca){
|
|
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
hca->ib_dev = ib_dev;
|
|
hca->ib_dev_context = ibv_open_device(ib_dev);
|
|
hca->ib_pd = NULL;
|
|
hca->hca_btls = OBJ_NEW(opal_pointer_array_t);
|
|
if (OPAL_SUCCESS != opal_pointer_array_init(hca->hca_btls, 2, INT_MAX, 2)) {
|
|
BTL_ERROR(("Failed to initialize hca_btls array: %s:%d", __FILE__, __LINE__));
|
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
|
}
|
|
|
|
if(NULL == hca->ib_dev_context){
|
|
BTL_ERROR(("error obtaining device context for %s errno says %s",
|
|
ibv_get_device_name(ib_dev), strerror(errno)));
|
|
goto error;
|
|
}
|
|
|
|
if(ibv_query_device(hca->ib_dev_context, &hca->ib_dev_attr)){
|
|
BTL_ERROR(("error obtaining device attributes for %s errno says %s",
|
|
ibv_get_device_name(ib_dev), strerror(errno)));
|
|
goto error;
|
|
}
|
|
/* If mca_btl_if_include/exclude were specified, get usable ports */
|
|
allowed_ports = (int*)malloc(hca->ib_dev_attr.phys_port_cnt * sizeof(int));
|
|
port_cnt = get_port_list(hca, allowed_ports);
|
|
if(0 == port_cnt) {
|
|
ret = OMPI_SUCCESS;
|
|
goto error;
|
|
}
|
|
#if HAVE_XRC
|
|
/* if user configured to run with XRC qp and the device don't support it -
|
|
* we should ignore this hca. Maybe we have other one that have XRC support
|
|
*/
|
|
if (!(hca->ib_dev_attr.device_cap_flags & IBV_DEVICE_XRC) &&
|
|
mca_btl_openib_component.num_xrc_qps > 0) {
|
|
opal_show_help("help-mpi-btl-openib.txt",
|
|
"XRC on device without XRC support", true,
|
|
mca_btl_openib_component.num_xrc_qps,
|
|
ibv_get_device_name(ib_dev),
|
|
orte_process_info.nodename);
|
|
ret = OMPI_SUCCESS;
|
|
goto error;
|
|
}
|
|
#endif
|
|
/* Load in vendor/part-specific HCA parameters. Note that even if
|
|
we don't find values for this vendor/part, "values" will be set
|
|
indicating that it does not have good values */
|
|
ret = ompi_btl_openib_ini_query(hca->ib_dev_attr.vendor_id,
|
|
hca->ib_dev_attr.vendor_part_id,
|
|
&values);
|
|
if (OMPI_SUCCESS != ret && OMPI_ERR_NOT_FOUND != ret) {
|
|
/* If we get a serious error, propagate it upwards */
|
|
goto error;
|
|
}
|
|
if (OMPI_ERR_NOT_FOUND == ret) {
|
|
/* If we didn't find a matching HCA in the INI files, output a
|
|
warning that we're using default values (unless overridden
|
|
that we don't want to see these warnings) */
|
|
if (mca_btl_openib_component.warn_no_hca_params_found) {
|
|
opal_show_help("help-mpi-btl-openib.txt",
|
|
"no hca params found", true,
|
|
orte_process_info.nodename,
|
|
hca->ib_dev_attr.vendor_id,
|
|
hca->ib_dev_attr.vendor_part_id);
|
|
}
|
|
}
|
|
/* Note that even if we don't find default values, "values" will
|
|
be set indicating that it does not have good values */
|
|
ret = ompi_btl_openib_ini_query(0, 0, &default_values);
|
|
if (OMPI_SUCCESS != ret && OMPI_ERR_NOT_FOUND != ret) {
|
|
/* If we get a serious error, propagate it upwards */
|
|
goto error;
|
|
}
|
|
|
|
/* If we did find values for this HCA (or in the defaults
|
|
section), handle them */
|
|
merge_values(&values, &default_values);
|
|
if (values.mtu_set) {
|
|
switch (values.mtu) {
|
|
case 256:
|
|
hca->mtu = IBV_MTU_256;
|
|
break;
|
|
case 512:
|
|
hca->mtu = IBV_MTU_512;
|
|
break;
|
|
case 1024:
|
|
hca->mtu = IBV_MTU_1024;
|
|
break;
|
|
case 2048:
|
|
hca->mtu = IBV_MTU_2048;
|
|
break;
|
|
case 4096:
|
|
hca->mtu = IBV_MTU_4096;
|
|
break;
|
|
default:
|
|
BTL_ERROR(("invalid MTU value specified in INI file (%d); ignored", values.mtu));
|
|
hca->mtu = mca_btl_openib_component.ib_mtu;
|
|
break;
|
|
}
|
|
} else {
|
|
hca->mtu = mca_btl_openib_component.ib_mtu;
|
|
}
|
|
|
|
/* If "use eager rdma" was set, then enable it on this HCA */
|
|
if (values.use_eager_rdma_set) {
|
|
hca->use_eager_rdma = values.use_eager_rdma;
|
|
}
|
|
|
|
/* Allocate the protection domain for the HCA */
|
|
hca->ib_pd = ibv_alloc_pd(hca->ib_dev_context);
|
|
if(NULL == hca->ib_pd){
|
|
BTL_ERROR(("error allocating protection domain for %s errno says %s",
|
|
ibv_get_device_name(ib_dev), strerror(errno)));
|
|
goto error;
|
|
}
|
|
|
|
#if HAVE_XRC
|
|
if (MCA_BTL_XRC_ENABLED) {
|
|
if (OMPI_SUCCESS != mca_btl_openib_open_xrc_domain(hca)) {
|
|
BTL_ERROR(("XRC Internal error. Failed to open xrc domain"));
|
|
goto error;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
mpool_resources.reg_data = (void*)hca;
|
|
mpool_resources.sizeof_reg = sizeof(mca_btl_openib_reg_t);
|
|
mpool_resources.register_mem = openib_reg_mr;
|
|
mpool_resources.deregister_mem = openib_dereg_mr;
|
|
hca->mpool =
|
|
mca_mpool_base_module_create(mca_btl_openib_component.ib_mpool_name,
|
|
hca, &mpool_resources);
|
|
if(NULL == hca->mpool){
|
|
BTL_ERROR(("error creating IB memory pool for %s errno says %s",
|
|
ibv_get_device_name(ib_dev), strerror(errno)));
|
|
goto error;
|
|
}
|
|
|
|
#if OMPI_ENABLE_PROGRESS_THREADS
|
|
hca->ib_channel = ibv_create_comp_channel(hca->ib_dev_context);
|
|
if (NULL == hca->ib_channel) {
|
|
BTL_ERROR(("error creating channel for %s errno says %s",
|
|
ibv_get_device_name(hca->ib_dev),
|
|
strerror(errno)));
|
|
goto error;
|
|
}
|
|
#endif
|
|
|
|
ret = OMPI_SUCCESS;
|
|
|
|
/* Note ports are 1 based (i >= 1) */
|
|
for(k = 0; k < port_cnt; k++){
|
|
struct ibv_port_attr ib_port_attr;
|
|
i = allowed_ports[k];
|
|
if(ibv_query_port(hca->ib_dev_context, i, &ib_port_attr)){
|
|
BTL_ERROR(("error getting port attributes for device %s "
|
|
"port number %d errno says %s",
|
|
ibv_get_device_name(ib_dev), i, strerror(errno)));
|
|
break;
|
|
}
|
|
if(IBV_PORT_ACTIVE == ib_port_attr.state) {
|
|
if (mca_btl_openib_component.apm_ports && hca->btls > 0) {
|
|
init_apm_port(hca, i, ib_port_attr.lid);
|
|
break;
|
|
}
|
|
if (0 == mca_btl_openib_component.ib_pkey_val) {
|
|
ret = init_one_port(btl_list, hca, i, mca_btl_openib_component.ib_pkey_ix,
|
|
&ib_port_attr);
|
|
} else {
|
|
uint16_t pkey,j;
|
|
for (j=0; j < hca->ib_dev_attr.max_pkeys; j++) {
|
|
ibv_query_pkey(hca->ib_dev_context, i, j, &pkey);
|
|
pkey=ntohs(pkey);
|
|
if(pkey == mca_btl_openib_component.ib_pkey_val){
|
|
ret = init_one_port(btl_list, hca, i, j, &ib_port_attr);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (OMPI_SUCCESS != ret) {
|
|
/* Out of bounds error indicates that we hit max btl number
|
|
* don't propagate the error to the caller */
|
|
if(OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret)
|
|
ret = OMPI_SUCCESS;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* If we made a BTL, we're done. Otherwise, fall through and
|
|
destroy everything */
|
|
if (hca->btls > 0) {
|
|
/* if apm was enabled it should be > 1 */
|
|
if (1 == mca_btl_openib_component.apm_ports) {
|
|
opal_show_help("help-mpi-btl-openib.txt", "apm not enough ports", true);
|
|
mca_btl_openib_component.apm_ports = 0;
|
|
}
|
|
ret = prepare_hca_for_use(hca);
|
|
if(OMPI_SUCCESS == ret) {
|
|
return OMPI_SUCCESS;
|
|
}
|
|
}
|
|
|
|
error:
|
|
#if OMPI_ENABLE_PROGRESS_THREADS
|
|
if (hca->ib_channel) {
|
|
ibv_destroy_comp_channel(hca->ib_channel);
|
|
}
|
|
#endif
|
|
if (hca->mpool) {
|
|
mca_mpool_base_module_destroy(hca->mpool);
|
|
}
|
|
#if HAVE_XRC
|
|
if (MCA_BTL_XRC_ENABLED) {
|
|
if(OMPI_SUCCESS != mca_btl_openib_close_xrc_domain(hca)) {
|
|
BTL_ERROR(("XRC Internal error. Failed to close xrc domain"));
|
|
}
|
|
}
|
|
#endif
|
|
if (hca->ib_pd) {
|
|
ibv_dealloc_pd(hca->ib_pd);
|
|
}
|
|
if (hca->ib_dev_context) {
|
|
ibv_close_device(hca->ib_dev_context);
|
|
}
|
|
OBJ_RELEASE(hca);
|
|
return ret;
|
|
}
|
|
|
|
static int finish_btl_init(mca_btl_openib_module_t *openib_btl)
|
|
{
|
|
int qp;
|
|
openib_btl->num_peers = 0;
|
|
|
|
/* Initialize module state */
|
|
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
|
|
|
|
/* setup the qp structure */
|
|
openib_btl->qps = (mca_btl_openib_module_qp_t*)
|
|
calloc(mca_btl_openib_component.num_qps,
|
|
sizeof(mca_btl_openib_module_qp_t));
|
|
|
|
/* setup all the qps */
|
|
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
|
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
|
OBJ_CONSTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0],
|
|
opal_list_t);
|
|
OBJ_CONSTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1],
|
|
opal_list_t);
|
|
openib_btl->qps[qp].u.srq_qp.sd_credits =
|
|
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
|
|
}
|
|
}
|
|
|
|
/* initialize the memory pool using the hca */
|
|
openib_btl->super.btl_mpool = openib_btl->hca->mpool;
|
|
|
|
openib_btl->eager_rdma_channels = 0;
|
|
|
|
openib_btl->eager_rdma_frag_size = OPAL_ALIGN(
|
|
sizeof(mca_btl_openib_header_t) +
|
|
sizeof(mca_btl_openib_header_coalesced_t) +
|
|
sizeof(mca_btl_openib_control_header_t) +
|
|
sizeof(mca_btl_openib_footer_t) +
|
|
openib_btl->super.btl_eager_limit,
|
|
mca_btl_openib_component.buffer_alignment, size_t);
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static struct ibv_device **ibv_get_device_list_compat(int *num_devs)
|
|
{
|
|
struct ibv_device **ib_devs;
|
|
|
|
#ifdef HAVE_IBV_GET_DEVICE_LIST
|
|
ib_devs = ibv_get_device_list(num_devs);
|
|
#else
|
|
struct dlist *dev_list;
|
|
struct ibv_device *ib_dev;
|
|
*num_devs = 0;
|
|
|
|
/* Determine the number of hca's available on the host */
|
|
dev_list = ibv_get_devices();
|
|
if (NULL == dev_list)
|
|
return NULL;
|
|
|
|
dlist_start(dev_list);
|
|
|
|
dlist_for_each_data(dev_list, ib_dev, struct ibv_device)
|
|
(*num_devs)++;
|
|
|
|
/* Allocate space for the ib devices */
|
|
ib_devs = (struct ibv_device**)malloc(*num_devs * sizeof(struct ibv_dev*));
|
|
if(NULL == ib_devs) {
|
|
*num_devs = 0;
|
|
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
|
|
return NULL;
|
|
}
|
|
|
|
dlist_start(dev_list);
|
|
|
|
dlist_for_each_data(dev_list, ib_dev, struct ibv_device)
|
|
*(++ib_devs) = ib_dev;
|
|
#endif
|
|
|
|
return ib_devs;
|
|
}
|
|
|
|
static void ibv_free_device_list_compat(struct ibv_device **ib_devs)
|
|
{
|
|
#ifdef HAVE_IBV_GET_DEVICE_LIST
|
|
ibv_free_device_list(ib_devs);
|
|
#else
|
|
free(ib_devs);
|
|
#endif
|
|
}
|
|
|
|
static opal_carto_graph_t *host_topo;
|
|
|
|
static int get_ib_dev_distance(struct ibv_device *dev)
|
|
{
|
|
opal_paffinity_base_cpu_set_t cpus;
|
|
opal_carto_base_node_t *hca_node;
|
|
int min_distance = -1, i, max_proc_id, num_processors;
|
|
const char *hca = ibv_get_device_name(dev);
|
|
|
|
if(opal_paffinity_base_get_processor_info(&num_processors, &max_proc_id) != OMPI_SUCCESS)
|
|
max_proc_id = 100; /* Choose something big enough */
|
|
|
|
hca_node = carto_base_find_node(host_topo, hca);
|
|
|
|
/* no topology info for HCA found. Assume that it is close */
|
|
if(NULL == hca_node)
|
|
return 0;
|
|
|
|
OPAL_PAFFINITY_CPU_ZERO(cpus);
|
|
opal_paffinity_base_get(&cpus);
|
|
|
|
for(i = 0; i < max_proc_id; i++) {
|
|
opal_carto_base_node_t *slot_node;
|
|
int distance, socket, core;
|
|
char *slot;
|
|
|
|
if(!OPAL_PAFFINITY_CPU_ISSET(i, cpus))
|
|
continue;
|
|
|
|
opal_paffinity_base_map_to_socket_core(i, &socket, &core);
|
|
asprintf(&slot, "slot%d", socket);
|
|
|
|
slot_node = carto_base_find_node(host_topo, slot);
|
|
|
|
free(slot);
|
|
|
|
if(NULL == slot_node)
|
|
return 0;
|
|
|
|
distance = carto_base_spf(host_topo, slot_node, hca_node);
|
|
|
|
if(distance < 0)
|
|
return 0;
|
|
|
|
if(min_distance < 0 || min_distance < distance)
|
|
min_distance = distance;
|
|
}
|
|
|
|
return min_distance;
|
|
}
|
|
|
|
struct dev_distance {
|
|
struct ibv_device *ib_dev;
|
|
int distance;
|
|
};
|
|
|
|
static int compare_distance(const void *p1, const void *p2)
|
|
{
|
|
const struct dev_distance *d1 = p1;
|
|
const struct dev_distance *d2 = p2;
|
|
|
|
return d1->distance - d2->distance;
|
|
}
|
|
|
|
static struct dev_distance *
|
|
sort_devs_by_distance(struct ibv_device **ib_devs, int count)
|
|
{
|
|
int i;
|
|
struct dev_distance *devs = malloc(count * sizeof(struct dev_distance));
|
|
|
|
carto_base_get_host_graph(&host_topo, "Infiniband");
|
|
|
|
for(i = 0; i < count; i++) {
|
|
devs[i].ib_dev = ib_devs[i];
|
|
devs[i].distance = get_ib_dev_distance(ib_devs[i]);
|
|
}
|
|
|
|
qsort(devs, count, sizeof(struct dev_distance), compare_distance);
|
|
|
|
carto_base_free_graph(host_topo);
|
|
|
|
return devs;
|
|
}
|
|
|
|
/*
|
|
* IB component initialization:
|
|
* (1) read interface list from kernel and compare against component parameters
|
|
* then create a BTL instance for selected interfaces
|
|
* (2) setup IB listen socket for incoming connection attempts
|
|
* (3) register BTL parameters with the MCA
|
|
*/
|
|
|
|
static mca_btl_base_module_t**
|
|
btl_openib_component_init(int *num_btl_modules,
|
|
bool enable_progress_threads,
|
|
bool enable_mpi_threads)
|
|
{
|
|
struct ibv_device **ib_devs;
|
|
mca_btl_base_module_t** btls;
|
|
int i, ret, num_devs, length;
|
|
opal_list_t btl_list;
|
|
mca_btl_openib_module_t * openib_btl;
|
|
mca_btl_base_selected_module_t* ib_selected;
|
|
opal_list_item_t* item;
|
|
unsigned short seedv[3];
|
|
mca_btl_openib_frag_init_data_t *init_data;
|
|
struct dev_distance *dev_sorted;
|
|
int distance;
|
|
|
|
/* initialization */
|
|
*num_btl_modules = 0;
|
|
num_devs = 0;
|
|
|
|
seedv[0] = ORTE_PROC_MY_NAME->vpid;
|
|
seedv[1] = opal_sys_timer_get_cycles();
|
|
seedv[2] = opal_sys_timer_get_cycles();
|
|
seed48(seedv);
|
|
|
|
/* Read in INI files with HCA-specific parameters */
|
|
if (OMPI_SUCCESS != (ret = ompi_btl_openib_ini_init())) {
|
|
goto no_btls;
|
|
}
|
|
|
|
/* Initialize FD listening */
|
|
if (OMPI_SUCCESS != ompi_btl_openib_fd_init()) {
|
|
goto no_btls;
|
|
}
|
|
|
|
/* Init CPC components */
|
|
if (OMPI_SUCCESS != (ret = ompi_btl_openib_connect_base_init())) {
|
|
goto no_btls;
|
|
}
|
|
|
|
if (MCA_BTL_XRC_ENABLED) {
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_addr_table,
|
|
opal_hash_table_t);
|
|
}
|
|
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.send_free_coalesced, ompi_free_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.send_user_free, ompi_free_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.recv_user_free, ompi_free_list_t);
|
|
|
|
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
|
|
|
|
init_data->order = mca_btl_openib_component.rdma_qp;
|
|
init_data->list = &mca_btl_openib_component.send_user_free;
|
|
|
|
if(OMPI_SUCCESS != ompi_free_list_init_ex_new(
|
|
&mca_btl_openib_component.send_user_free,
|
|
sizeof(mca_btl_openib_put_frag_t), 2,
|
|
OBJ_CLASS(mca_btl_openib_put_frag_t),
|
|
0, 0,
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
mca_btl_openib_component.ib_free_list_inc,
|
|
NULL, mca_btl_openib_frag_init, init_data)) {
|
|
goto no_btls;
|
|
}
|
|
|
|
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
|
|
|
|
init_data->order = mca_btl_openib_component.rdma_qp;
|
|
init_data->list = &mca_btl_openib_component.recv_user_free;
|
|
|
|
if(OMPI_SUCCESS != ompi_free_list_init_ex_new(
|
|
&mca_btl_openib_component.recv_user_free,
|
|
sizeof(mca_btl_openib_get_frag_t), 2,
|
|
OBJ_CLASS(mca_btl_openib_get_frag_t),
|
|
0, 0,
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
mca_btl_openib_component.ib_free_list_inc,
|
|
NULL, mca_btl_openib_frag_init, init_data)) {
|
|
goto no_btls;
|
|
}
|
|
|
|
init_data = malloc(sizeof(mca_btl_openib_frag_init_data_t));
|
|
length = sizeof(mca_btl_openib_coalesced_frag_t);
|
|
|
|
init_data->list = &mca_btl_openib_component.send_free_coalesced;
|
|
|
|
if(OMPI_SUCCESS != ompi_free_list_init_ex(
|
|
&mca_btl_openib_component.send_free_coalesced,
|
|
length, 2, OBJ_CLASS(mca_btl_openib_coalesced_frag_t),
|
|
mca_btl_openib_component.ib_free_list_num,
|
|
mca_btl_openib_component.ib_free_list_max,
|
|
mca_btl_openib_component.ib_free_list_inc,
|
|
NULL, mca_btl_openib_frag_init, init_data)) {
|
|
goto no_btls;
|
|
}
|
|
|
|
/* If we want fork support, try to enable it */
|
|
#ifdef HAVE_IBV_FORK_INIT
|
|
if (0 != mca_btl_openib_component.want_fork_support) {
|
|
if (0 != ibv_fork_init()) {
|
|
/* If the want_fork_support MCA parameter is >0, then the
|
|
user was specifically asking for fork support and we
|
|
couldn't provide it. So print an error and deactivate
|
|
this BTL. */
|
|
if (mca_btl_openib_component.want_fork_support > 0) {
|
|
opal_show_help("help-mpi-btl-openib.txt",
|
|
"ibv_fork_init fail", true,
|
|
orte_process_info.nodename);
|
|
goto no_btls;
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/* Parse the include and exclude lists, checking for errors */
|
|
mca_btl_openib_component.if_include_list =
|
|
mca_btl_openib_component.if_exclude_list =
|
|
mca_btl_openib_component.if_list = NULL;
|
|
if (NULL != mca_btl_openib_component.if_include &&
|
|
NULL != mca_btl_openib_component.if_exclude) {
|
|
opal_show_help("help-mpi-btl-openib.txt",
|
|
"specified include and exclude", true,
|
|
mca_btl_openib_component.if_include,
|
|
mca_btl_openib_component.if_exclude, NULL);
|
|
goto no_btls;
|
|
} else if (NULL != mca_btl_openib_component.if_include) {
|
|
mca_btl_openib_component.if_include_list =
|
|
opal_argv_split(mca_btl_openib_component.if_include, ',');
|
|
mca_btl_openib_component.if_list =
|
|
opal_argv_copy(mca_btl_openib_component.if_include_list);
|
|
} else if (NULL != mca_btl_openib_component.if_exclude) {
|
|
mca_btl_openib_component.if_exclude_list =
|
|
opal_argv_split(mca_btl_openib_component.if_exclude, ',');
|
|
mca_btl_openib_component.if_list =
|
|
opal_argv_copy(mca_btl_openib_component.if_exclude_list);
|
|
}
|
|
|
|
ib_devs = ibv_get_device_list_compat(&num_devs);
|
|
|
|
if(0 == num_devs || NULL == ib_devs) {
|
|
mca_btl_base_error_no_nics("OpenFabrics (openib)", "HCA");
|
|
goto no_btls;
|
|
}
|
|
|
|
dev_sorted = sort_devs_by_distance(ib_devs, num_devs);
|
|
|
|
/* We must loop through all the hca id's, get their handles and
|
|
for each hca we query the number of ports on the hca and set up
|
|
a distinct btl module for each hca port */
|
|
|
|
OBJ_CONSTRUCT(&btl_list, opal_list_t);
|
|
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t);
|
|
#if OMPI_HAVE_THREADS
|
|
mca_btl_openib_component.async_thread = 0;
|
|
#endif
|
|
for(i = 0; i < num_devs && (-1 == mca_btl_openib_component.ib_max_btls ||
|
|
mca_btl_openib_component.ib_num_btls <
|
|
mca_btl_openib_component.ib_max_btls); i++) {
|
|
if(0 == mca_btl_openib_component.ib_num_btls)
|
|
distance = dev_sorted[i].distance;
|
|
else if(distance != dev_sorted[i].distance)
|
|
break;
|
|
|
|
if(OMPI_SUCCESS !=
|
|
(ret = init_one_hca(&btl_list, dev_sorted[i].ib_dev)))
|
|
break;
|
|
}
|
|
|
|
if(ret != OMPI_SUCCESS) {
|
|
opal_show_help("help-mpi-btl-openib.txt",
|
|
"error in hca init", true, orte_process_info.nodename);
|
|
}
|
|
|
|
free(dev_sorted);
|
|
|
|
/* If we got back from checking all the HCAs and find that there
|
|
are still items in the component.if_list, that means that they
|
|
didn't exist. Show an appropriate warning if the warning was
|
|
not disabled. */
|
|
|
|
if (0 != opal_argv_count(mca_btl_openib_component.if_list) &&
|
|
mca_btl_openib_component.warn_nonexistent_if) {
|
|
char *str = opal_argv_join(mca_btl_openib_component.if_list, ',');
|
|
opal_show_help("help-mpi-btl-openib.txt", "nonexistent port",
|
|
true, orte_process_info.nodename,
|
|
((NULL != mca_btl_openib_component.if_include) ?
|
|
"in" : "ex"), str);
|
|
free(str);
|
|
}
|
|
|
|
if(0 == mca_btl_openib_component.ib_num_btls) {
|
|
opal_show_help("help-mpi-btl-openib.txt",
|
|
"no active ports found", true, orte_process_info.nodename);
|
|
return NULL;
|
|
}
|
|
|
|
/* Allocate space for btl modules */
|
|
mca_btl_openib_component.openib_btls =
|
|
malloc(sizeof(mca_btl_openib_module_t*) *
|
|
mca_btl_openib_component.ib_num_btls);
|
|
if(NULL == mca_btl_openib_component.openib_btls) {
|
|
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
|
|
return NULL;
|
|
}
|
|
btls = (struct mca_btl_base_module_t **)
|
|
malloc(mca_btl_openib_component.ib_num_btls *
|
|
sizeof(struct mca_btl_base_module_t*));
|
|
if(NULL == btls) {
|
|
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
|
|
return NULL;
|
|
}
|
|
|
|
/* Copy the btl module structs into a contiguous array and fully
|
|
initialize them */
|
|
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++){
|
|
item = opal_list_remove_first(&btl_list);
|
|
ib_selected = (mca_btl_base_selected_module_t*)item;
|
|
mca_btl_openib_component.openib_btls[i] =
|
|
(mca_btl_openib_module_t*)ib_selected->btl_module;
|
|
OBJ_RELEASE(ib_selected);
|
|
openib_btl = mca_btl_openib_component.openib_btls[i];
|
|
btls[i] = &openib_btl->super;
|
|
if(finish_btl_init(openib_btl) != OMPI_SUCCESS)
|
|
return NULL;
|
|
}
|
|
|
|
btl_openib_modex_send();
|
|
|
|
*num_btl_modules = mca_btl_openib_component.ib_num_btls;
|
|
ibv_free_device_list_compat(ib_devs);
|
|
if (NULL != mca_btl_openib_component.if_include_list) {
|
|
opal_argv_free(mca_btl_openib_component.if_include_list);
|
|
mca_btl_openib_component.if_include_list = NULL;
|
|
}
|
|
if (NULL != mca_btl_openib_component.if_exclude_list) {
|
|
opal_argv_free(mca_btl_openib_component.if_exclude_list);
|
|
mca_btl_openib_component.if_exclude_list = NULL;
|
|
}
|
|
return btls;
|
|
|
|
no_btls:
|
|
/* If we fail early enough in the setup, we just modex around that
|
|
there are no openib BTL's in this process and return NULL. */
|
|
|
|
if (MCA_BTL_XRC_ENABLED) {
|
|
OBJ_DESTRUCT(&mca_btl_openib_component.ib_addr_table);
|
|
}
|
|
/* Be sure to shut down the fd listener */
|
|
ompi_btl_openib_fd_finalize();
|
|
|
|
mca_btl_openib_component.ib_num_btls = 0;
|
|
btl_openib_modex_send();
|
|
return NULL;
|
|
}
|
|
|
|
static void progress_pending_eager_rdma(mca_btl_base_endpoint_t *ep)
|
|
{
|
|
int qp;
|
|
opal_list_item_t *frag;
|
|
|
|
/* Go over all QPs and try to send high prio packets over eager rdma
|
|
* channel */
|
|
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
|
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
|
while(ep->qps[qp].qp->sd_wqe > 0 && ep->eager_rdma_remote.tokens > 0) {
|
|
frag = opal_list_remove_first(&ep->qps[qp].pending_frags[0]);
|
|
if(NULL == frag)
|
|
break;
|
|
mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
|
|
}
|
|
if(ep->eager_rdma_remote.tokens == 0)
|
|
break;
|
|
}
|
|
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
|
}
|
|
|
|
static inline int
|
|
get_enpoint_credits(mca_btl_base_endpoint_t *ep, const int qp)
|
|
{
|
|
return BTL_OPENIB_QP_TYPE_PP(qp) ? ep->qps[qp].u.pp_qp.sd_credits : 1;
|
|
}
|
|
|
|
static void progress_pending_frags_pp(mca_btl_base_endpoint_t *ep, const int qp)
|
|
{
|
|
int i;
|
|
opal_list_item_t *frag;
|
|
|
|
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
|
for(i = 0; i < 2; i++) {
|
|
while((get_enpoint_credits(ep, qp) +
|
|
(1 - i) * ep->eager_rdma_remote.tokens) > 0) {
|
|
frag = opal_list_remove_first(&ep->qps[qp].pending_frags[i]);
|
|
if(NULL == frag)
|
|
break;
|
|
mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
|
|
}
|
|
}
|
|
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
|
}
|
|
|
|
void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep,
|
|
const int qp)
|
|
{
|
|
mca_btl_openib_module_t* openib_btl = ep->endpoint_btl;
|
|
opal_list_item_t *frag;
|
|
size_t i, len = opal_list_get_size(&ep->pending_get_frags);
|
|
|
|
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++)
|
|
{
|
|
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
|
frag = opal_list_remove_first(&(ep->pending_get_frags));
|
|
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
|
if(NULL == frag)
|
|
break;
|
|
if(mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep,
|
|
&to_base_frag(frag)->base) == OMPI_ERR_OUT_OF_RESOURCE)
|
|
break;
|
|
}
|
|
|
|
len = opal_list_get_size(&ep->pending_put_frags);
|
|
for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0; i++) {
|
|
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
|
frag = opal_list_remove_first(&(ep->pending_put_frags));
|
|
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
|
if(NULL == frag)
|
|
break;
|
|
if(mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep,
|
|
&to_base_frag(frag)->base) == OMPI_ERR_OUT_OF_RESOURCE)
|
|
break;
|
|
}
|
|
}
|
|
|
|
static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
|
|
mca_btl_openib_endpoint_t *ep,
|
|
mca_btl_openib_recv_frag_t *frag,
|
|
size_t byte_len)
|
|
{
|
|
mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
|
|
mca_btl_openib_header_t *hdr = frag->hdr;
|
|
int rqp = to_base_frag(frag)->base.order, cqp;
|
|
uint16_t rcredits = 0, credits;
|
|
bool is_credit_msg;
|
|
|
|
if(ep->nbo) {
|
|
BTL_OPENIB_HEADER_NTOH(*hdr);
|
|
}
|
|
|
|
/* advance the segment address past the header and subtract from the
|
|
* length.*/
|
|
des->des_dst->seg_len = byte_len - sizeof(mca_btl_openib_header_t);
|
|
|
|
if(OPAL_LIKELY(!(is_credit_msg = is_credit_message(frag)))) {
|
|
/* call registered callback */
|
|
mca_btl_active_message_callback_t* reg;
|
|
reg = mca_btl_base_active_message_trigger + hdr->tag;
|
|
reg->cbfunc( &openib_btl->super, hdr->tag, des, reg->cbdata );
|
|
if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
|
|
cqp = (hdr->credits >> 11) & 0x0f;
|
|
hdr->credits &= 0x87ff;
|
|
} else {
|
|
cqp = rqp;
|
|
}
|
|
if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
|
|
rcredits = BTL_OPENIB_CREDITS(hdr->credits);
|
|
hdr->credits = 0;
|
|
}
|
|
} else {
|
|
mca_btl_openib_rdma_credits_header_t *chdr=des->des_dst->seg_addr.pval;
|
|
if(ep->nbo) {
|
|
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*chdr);
|
|
}
|
|
cqp = chdr->qpn;
|
|
rcredits = chdr->rdma_credits;
|
|
}
|
|
|
|
credits = hdr->credits;
|
|
|
|
if(hdr->cm_seen)
|
|
OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_sent, -hdr->cm_seen);
|
|
|
|
/* Now return fragment. Don't touch hdr after this point! */
|
|
if(MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
|
|
mca_btl_openib_eager_rdma_local_t *erl = &ep->eager_rdma_local;
|
|
OPAL_THREAD_LOCK(&erl->lock);
|
|
MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
|
|
while(erl->tail != erl->head) {
|
|
mca_btl_openib_recv_frag_t *tf;
|
|
tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(ep, erl->tail);
|
|
if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(tf))
|
|
break;
|
|
OPAL_THREAD_ADD32(&erl->credits, 1);
|
|
MCA_BTL_OPENIB_RDMA_NEXT_INDEX(erl->tail);
|
|
}
|
|
OPAL_THREAD_UNLOCK(&erl->lock);
|
|
} else {
|
|
MCA_BTL_IB_FRAG_RETURN(frag);
|
|
if(BTL_OPENIB_QP_TYPE_PP(rqp)) {
|
|
if(OPAL_UNLIKELY(is_credit_msg))
|
|
OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.cm_received, 1);
|
|
else
|
|
OPAL_THREAD_ADD32(&ep->qps[rqp].u.pp_qp.rd_posted, -1);
|
|
mca_btl_openib_endpoint_post_rr(ep, cqp);
|
|
} else {
|
|
mca_btl_openib_module_t *btl = ep->endpoint_btl;
|
|
OPAL_THREAD_ADD32(&btl->qps[rqp].u.srq_qp.rd_posted, -1);
|
|
mca_btl_openib_post_srr(btl, rqp);
|
|
}
|
|
}
|
|
|
|
if(rcredits > 0) {
|
|
OPAL_THREAD_ADD32(&ep->eager_rdma_remote.tokens, rcredits);
|
|
progress_pending_eager_rdma(ep);
|
|
}
|
|
|
|
assert((cqp != MCA_BTL_NO_ORDER && BTL_OPENIB_QP_TYPE_PP(cqp)) || !credits);
|
|
|
|
if(credits) {
|
|
OPAL_THREAD_ADD32(&ep->qps[cqp].u.pp_qp.sd_credits, credits);
|
|
progress_pending_frags_pp(ep, cqp);
|
|
}
|
|
|
|
send_credits(ep, cqp);
|
|
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
|
|
{
|
|
switch(status) {
|
|
case IBV_WC_SUCCESS:
|
|
return "SUCCESS";
|
|
break;
|
|
case IBV_WC_LOC_LEN_ERR:
|
|
return "LOCAL LENGTH ERROR";
|
|
break;
|
|
case IBV_WC_LOC_QP_OP_ERR:
|
|
return "LOCAL QP OPERATION ERROR";
|
|
break;
|
|
case IBV_WC_LOC_EEC_OP_ERR:
|
|
return "LOCAL EEC OPERATION ERROR";
|
|
break;
|
|
case IBV_WC_LOC_PROT_ERR:
|
|
return "LOCAL PROTOCOL ERROR";
|
|
break;
|
|
case IBV_WC_WR_FLUSH_ERR:
|
|
return "WORK REQUEST FLUSHED ERROR";
|
|
break;
|
|
case IBV_WC_MW_BIND_ERR:
|
|
return "MEMORY WINDOW BIND ERROR";
|
|
break;
|
|
case IBV_WC_BAD_RESP_ERR:
|
|
return "BAD RESPONSE ERROR";
|
|
break;
|
|
case IBV_WC_LOC_ACCESS_ERR:
|
|
return "LOCAL ACCESS ERROR";
|
|
break;
|
|
case IBV_WC_REM_INV_REQ_ERR:
|
|
return "INVALID REQUEST ERROR";
|
|
break;
|
|
case IBV_WC_REM_ACCESS_ERR:
|
|
return "REMOTE ACCESS ERROR";
|
|
break;
|
|
case IBV_WC_REM_OP_ERR:
|
|
return "REMOTE OPERATION ERROR";
|
|
break;
|
|
case IBV_WC_RETRY_EXC_ERR:
|
|
return "RETRY EXCEEDED ERROR";
|
|
break;
|
|
case IBV_WC_RNR_RETRY_EXC_ERR:
|
|
return "RECEIVER NOT READY RETRY EXCEEDED ERROR";
|
|
break;
|
|
case IBV_WC_LOC_RDD_VIOL_ERR:
|
|
return "LOCAL RDD VIOLATION ERROR";
|
|
break;
|
|
case IBV_WC_REM_INV_RD_REQ_ERR:
|
|
return "INVALID READ REQUEST ERROR";
|
|
break;
|
|
case IBV_WC_REM_ABORT_ERR:
|
|
return "REMOTE ABORT ERROR";
|
|
break;
|
|
case IBV_WC_INV_EECN_ERR:
|
|
return "INVALID EECN ERROR";
|
|
break;
|
|
case IBV_WC_INV_EEC_STATE_ERR:
|
|
return "INVALID EEC STATE ERROR";
|
|
break;
|
|
case IBV_WC_FATAL_ERR:
|
|
return "FATAL ERROR";
|
|
break;
|
|
case IBV_WC_RESP_TIMEOUT_ERR:
|
|
return "RESPONSE TIMEOUT ERROR";
|
|
break;
|
|
case IBV_WC_GENERAL_ERR:
|
|
return "GENERAL ERROR";
|
|
break;
|
|
default:
|
|
return "STATUS UNDEFINED";
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void
|
|
progress_pending_frags_wqe(mca_btl_base_endpoint_t *ep, const int qpn)
|
|
{
|
|
int i;
|
|
opal_list_item_t *frag;
|
|
mca_btl_openib_qp_t *qp = ep->qps[qpn].qp;
|
|
|
|
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
|
for(i = 0; i < 2; i++) {
|
|
while(qp->sd_wqe > 0) {
|
|
mca_btl_base_endpoint_t *ep;
|
|
OPAL_THREAD_LOCK(&qp->lock);
|
|
frag = opal_list_remove_first(&qp->pending_frags[i]);
|
|
OPAL_THREAD_UNLOCK(&qp->lock);
|
|
if(NULL == frag)
|
|
break;
|
|
ep = to_com_frag(frag)->endpoint;
|
|
mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
|
|
}
|
|
}
|
|
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
|
}
|
|
|
|
static void progress_pending_frags_srq(mca_btl_openib_module_t* openib_btl,
|
|
const int qp)
|
|
{
|
|
opal_list_item_t *frag;
|
|
int i;
|
|
|
|
assert(BTL_OPENIB_QP_TYPE_SRQ(qp) || BTL_OPENIB_QP_TYPE_XRC(qp));
|
|
|
|
for(i = 0; i < 2; i++) {
|
|
while(openib_btl->qps[qp].u.srq_qp.sd_credits > 0) {
|
|
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
|
|
frag = opal_list_remove_first(
|
|
&openib_btl->qps[qp].u.srq_qp.pending_frags[i]);
|
|
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
|
|
|
if(NULL == frag)
|
|
break;
|
|
|
|
mca_btl_openib_endpoint_send(to_com_frag(frag)->endpoint,
|
|
to_send_frag(frag));
|
|
}
|
|
}
|
|
}
|
|
|
|
static char *cq_name[] = {"HP CQ", "LP CQ"};
|
|
static void handle_wc(mca_btl_openib_hca_t* hca, const uint32_t cq,
|
|
struct ibv_wc *wc)
|
|
{
|
|
static int flush_err_printed[] = {0, 0};
|
|
mca_btl_openib_com_frag_t* frag;
|
|
mca_btl_base_descriptor_t *des;
|
|
mca_btl_openib_endpoint_t* endpoint;
|
|
mca_btl_openib_module_t *openib_btl = NULL;
|
|
ompi_proc_t* remote_proc = NULL;
|
|
int qp, btl_ownership;
|
|
|
|
des = (mca_btl_base_descriptor_t*)(uintptr_t)wc->wr_id;
|
|
frag = to_com_frag(des);
|
|
|
|
/* For receive fragments "order" contains QP idx the fragment was posted
|
|
* to. For send fragments "order" contains QP idx the fragment was send
|
|
* through */
|
|
qp = des->order;
|
|
endpoint = frag->endpoint;
|
|
|
|
if(endpoint)
|
|
openib_btl = endpoint->endpoint_btl;
|
|
|
|
if(wc->status != IBV_WC_SUCCESS)
|
|
goto error;
|
|
|
|
/* Handle work completions */
|
|
switch(wc->opcode) {
|
|
case IBV_WC_RDMA_READ:
|
|
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
|
|
/* fall through */
|
|
|
|
case IBV_WC_RDMA_WRITE:
|
|
case IBV_WC_SEND:
|
|
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
|
|
opal_list_item_t *i;
|
|
while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
|
|
btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
|
|
&to_base_frag(i)->base, OMPI_SUCCESS);
|
|
if( btl_ownership ) {
|
|
mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
|
|
}
|
|
}
|
|
}
|
|
/* Process a completed send/put/get */
|
|
btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
|
des->des_cbfunc(&openib_btl->super, endpoint, des,OMPI_SUCCESS);
|
|
if( btl_ownership ) {
|
|
mca_btl_openib_free(&openib_btl->super, des);
|
|
}
|
|
|
|
/* return send wqe */
|
|
qp_put_wqe(endpoint, qp);
|
|
|
|
if(IBV_WC_SEND == wc->opcode && !BTL_OPENIB_QP_TYPE_PP(qp)) {
|
|
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
|
|
|
|
/* new SRQ credit available. Try to progress pending frags*/
|
|
progress_pending_frags_srq(openib_btl, qp);
|
|
}
|
|
/* new wqe or/and get token available. Try to progress pending frags */
|
|
progress_pending_frags_wqe(endpoint, qp);
|
|
mca_btl_openib_frag_progress_pending_put_get(endpoint, qp);
|
|
break;
|
|
case IBV_WC_RECV:
|
|
if(wc->wc_flags & IBV_WC_WITH_IMM) {
|
|
endpoint = (mca_btl_openib_endpoint_t*)
|
|
opal_pointer_array_get_item(hca->endpoints, wc->imm_data);
|
|
frag->endpoint = endpoint;
|
|
openib_btl = endpoint->endpoint_btl;
|
|
}
|
|
|
|
/* Process a RECV */
|
|
if(btl_openib_handle_incoming(openib_btl, endpoint, to_recv_frag(frag),
|
|
wc->byte_len) != OMPI_SUCCESS) {
|
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
|
break;
|
|
}
|
|
|
|
/* decide if it is time to setup an eager rdma channel */
|
|
if(!endpoint->eager_rdma_local.base.pval && endpoint->use_eager_rdma &&
|
|
wc->byte_len < mca_btl_openib_component.eager_limit &&
|
|
openib_btl->eager_rdma_channels <
|
|
mca_btl_openib_component.max_eager_rdma &&
|
|
OPAL_THREAD_ADD32(&endpoint->eager_recv_count, 1) ==
|
|
mca_btl_openib_component.eager_rdma_threshold) {
|
|
mca_btl_openib_endpoint_connect_eager_rdma(endpoint);
|
|
}
|
|
break;
|
|
default:
|
|
BTL_ERROR(("Unhandled work completion opcode is %d", wc->opcode));
|
|
if(openib_btl)
|
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
|
break;
|
|
}
|
|
|
|
return;
|
|
|
|
error:
|
|
if(endpoint && endpoint->endpoint_proc && endpoint->endpoint_proc->proc_ompi)
|
|
remote_proc = endpoint->endpoint_proc->proc_ompi;
|
|
|
|
/* For iWARP, the TCP connection is tied to the QP once the QP is in RTS.
|
|
* And destroying the QP is thus tied to connection teardown for iWARP.
|
|
* To destroy the connection in iWARP you must move the QP out of RTS, either
|
|
* into CLOSING for a nice graceful close, or to ERROR if you want to be rude.
|
|
* In both cases, all pending non-completed SQ and RQ WRs must be flushed.
|
|
*/
|
|
if (wc->status == IBV_WC_WR_FLUSH_ERR && IBV_TRANSPORT_IWARP == hca->ib_dev->transport_type) {
|
|
return;
|
|
}
|
|
|
|
if(wc->status != IBV_WC_WR_FLUSH_ERR || !flush_err_printed[cq]++) {
|
|
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
|
|
"status number %d for wr_id %llu opcode %d qp_idx %d",
|
|
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
|
wc->status, wc->wr_id, wc->opcode, qp));
|
|
}
|
|
|
|
if(IBV_WC_RETRY_EXC_ERR == wc->status)
|
|
opal_show_help("help-mpi-btl-openib.txt", "btl_openib:retry-exceeded", true);
|
|
|
|
if(openib_btl)
|
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
|
}
|
|
|
|
static int poll_hca(mca_btl_openib_hca_t* hca, int count)
|
|
{
|
|
int ne = 0, cq;
|
|
uint32_t hp_iter = 0;
|
|
struct ibv_wc wc;
|
|
|
|
hca->pollme = false;
|
|
for(cq = 0; cq < 2 && hp_iter < mca_btl_openib_component.cq_poll_progress;)
|
|
{
|
|
ne = ibv_poll_cq(hca->ib_cq[cq], 1, &wc);
|
|
if(0 == ne) {
|
|
/* don't check low prio cq if there was something in high prio cq,
|
|
* but for each cq_poll_ratio hp cq polls poll lp cq once */
|
|
if(count && hca->hp_cq_polls)
|
|
break;
|
|
cq++;
|
|
hca->hp_cq_polls = mca_btl_openib_component.cq_poll_ratio;
|
|
continue;
|
|
}
|
|
|
|
if(ne < 0)
|
|
goto error;
|
|
|
|
count++;
|
|
|
|
if(BTL_OPENIB_HP_CQ == cq) {
|
|
hca->pollme = true;
|
|
hp_iter++;
|
|
hca->hp_cq_polls--;
|
|
}
|
|
|
|
handle_wc(hca, cq, &wc);
|
|
}
|
|
|
|
return count;
|
|
error:
|
|
BTL_ERROR(("error polling %s with %d errno says %s", cq_name[cq], ne,
|
|
strerror(errno)));
|
|
return count;
|
|
}
|
|
|
|
#if OMPI_ENABLE_PROGRESS_THREADS
|
|
void* mca_btl_openib_progress_thread(opal_object_t* arg)
|
|
{
|
|
opal_thread_t* thread = (opal_thread_t*)arg;
|
|
mca_btl_openib_hca_t* hca = thread->t_arg;
|
|
struct ibv_cq *ev_cq;
|
|
void *ev_ctx;
|
|
|
|
/* This thread enter in a cancel enabled state */
|
|
pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
|
|
pthread_setcanceltype( PTHREAD_CANCEL_ASYNCHRONOUS, NULL );
|
|
|
|
opal_output(-1, "WARNING: the openib btl progress thread code *does not yet work*. Your run is likely to hang, crash, break the kitchen sink, and/or eat your cat. You have been warned.");
|
|
|
|
while (hca->progress) {
|
|
while(opal_progress_threads()) {
|
|
while(opal_progress_threads())
|
|
sched_yield();
|
|
usleep(100); /* give app a chance to re-enter library */
|
|
}
|
|
|
|
if(ibv_get_cq_event(hca->ib_channel, &ev_cq, &ev_ctx))
|
|
BTL_ERROR(("Failed to get CQ event with error %s",
|
|
strerror(errno)));
|
|
if(ibv_req_notify_cq(ev_cq, 0)) {
|
|
BTL_ERROR(("Couldn't request CQ notification with error %s",
|
|
strerror(errno)));
|
|
}
|
|
|
|
ibv_ack_cq_events(ev_cq, 1);
|
|
|
|
while(poll_hca(hca, 0));
|
|
}
|
|
|
|
return PTHREAD_CANCELED;
|
|
}
|
|
#endif
|
|
|
|
static int progress_one_hca(mca_btl_openib_hca_t *hca)
|
|
{
|
|
int i, c, count = 0, ret;
|
|
mca_btl_openib_recv_frag_t* frag;
|
|
mca_btl_openib_endpoint_t* endpoint;
|
|
uint32_t non_eager_rdma_endpoints = 0;
|
|
|
|
c = hca->eager_rdma_buffers_count;
|
|
non_eager_rdma_endpoints += (hca->non_eager_rdma_endpoints + hca->pollme);
|
|
|
|
for(i = 0; i < c; i++) {
|
|
endpoint = hca->eager_rdma_buffers[i];
|
|
|
|
if(!endpoint)
|
|
continue;
|
|
|
|
OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
|
|
frag = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(endpoint,
|
|
endpoint->eager_rdma_local.head);
|
|
|
|
if(MCA_BTL_OPENIB_RDMA_FRAG_LOCAL(frag)) {
|
|
uint32_t size;
|
|
mca_btl_openib_module_t *btl = endpoint->endpoint_btl;
|
|
|
|
opal_atomic_rmb();
|
|
|
|
if(endpoint->nbo) {
|
|
BTL_OPENIB_FOOTER_NTOH(*frag->ftr);
|
|
}
|
|
size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);
|
|
#if OMPI_ENABLE_DEBUG
|
|
if (frag->ftr->seq != endpoint->eager_rdma_local.seq)
|
|
BTL_ERROR(("Eager RDMA wrong SEQ: received %d expected %d",
|
|
frag->ftr->seq,
|
|
endpoint->eager_rdma_local.seq));
|
|
endpoint->eager_rdma_local.seq++;
|
|
#endif
|
|
MCA_BTL_OPENIB_RDMA_NEXT_INDEX(endpoint->eager_rdma_local.head);
|
|
|
|
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
|
|
frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
|
|
size + sizeof(mca_btl_openib_footer_t));
|
|
to_base_frag(frag)->segment.seg_addr.pval =
|
|
((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
|
|
|
|
ret = btl_openib_handle_incoming(btl, to_com_frag(frag)->endpoint,
|
|
frag, size - sizeof(mca_btl_openib_footer_t));
|
|
if (ret != MPI_SUCCESS) {
|
|
btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
|
return 0;
|
|
}
|
|
|
|
count++;
|
|
} else
|
|
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
|
|
}
|
|
|
|
hca->eager_rdma_polls--;
|
|
|
|
if(0 == count || non_eager_rdma_endpoints != 0 || !hca->eager_rdma_polls) {
|
|
count += poll_hca(hca, count);
|
|
hca->eager_rdma_polls = mca_btl_openib_component.eager_rdma_poll_ratio;
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
/*
|
|
* IB component progress.
|
|
*/
|
|
static int btl_openib_component_progress(void)
|
|
{
|
|
int i;
|
|
int count = 0;
|
|
|
|
#if OMPI_HAVE_THREADS
|
|
if(OPAL_UNLIKELY(mca_btl_openib_component.use_async_event_thread &&
|
|
mca_btl_openib_component.fatal_counter)) {
|
|
goto error;
|
|
}
|
|
#endif
|
|
|
|
for(i = 0; i < mca_btl_openib_component.hcas_count; i++) {
|
|
mca_btl_openib_hca_t *hca =
|
|
opal_pointer_array_get_item(&mca_btl_openib_component.hcas, i);
|
|
count += progress_one_hca(hca);
|
|
}
|
|
|
|
return count;
|
|
|
|
#if OMPI_HAVE_THREADS
|
|
error:
|
|
/* Set the fatal counter to zero */
|
|
mca_btl_openib_component.fatal_counter = 0;
|
|
/* Lets found all fatal events */
|
|
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
|
mca_btl_openib_module_t* openib_btl =
|
|
mca_btl_openib_component.openib_btls[i];
|
|
if(openib_btl->hca->got_fatal_event) {
|
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
|
}
|
|
}
|
|
return count;
|
|
#endif
|
|
}
|
|
|
|
int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl, const int qp)
|
|
{
|
|
int rd_low = mca_btl_openib_component.qp_infos[qp].rd_low;
|
|
int rd_num = mca_btl_openib_component.qp_infos[qp].rd_num;
|
|
int num_post, i, rc;
|
|
struct ibv_recv_wr *bad_wr, *wr_list = NULL, *wr = NULL;
|
|
|
|
assert(!BTL_OPENIB_QP_TYPE_PP(qp));
|
|
|
|
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
|
|
if(openib_btl->qps[qp].u.srq_qp.rd_posted > rd_low) {
|
|
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
|
return OMPI_SUCCESS;
|
|
}
|
|
num_post = rd_num - openib_btl->qps[qp].u.srq_qp.rd_posted;
|
|
|
|
for(i = 0; i < num_post; i++) {
|
|
ompi_free_list_item_t* item;
|
|
OMPI_FREE_LIST_WAIT(&openib_btl->hca->qps[qp].recv_free, item, rc);
|
|
to_base_frag(item)->base.order = qp;
|
|
to_com_frag(item)->endpoint = NULL;
|
|
if(NULL == wr)
|
|
wr = wr_list = &to_recv_frag(item)->rd_desc;
|
|
else
|
|
wr = wr->next = &to_recv_frag(item)->rd_desc;
|
|
}
|
|
|
|
wr->next = NULL;
|
|
|
|
rc = ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq, wr_list, &bad_wr);
|
|
if(OPAL_LIKELY(0 == rc)) {
|
|
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);
|
|
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
|
return OMPI_SUCCESS;
|
|
}
|
|
|
|
for(i = 0; wr_list && wr_list != bad_wr; i++, wr_list = wr_list->next);
|
|
|
|
BTL_ERROR(("error posting receive descriptors to shared receive "
|
|
"queue %d (%d from %d)", qp, i, num_post));
|
|
|
|
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
|
|
return OMPI_ERROR;
|
|
}
|