Removing trailing white spaces in all the openib btl code.
This commit was SVN r24855.
Этот коммит содержится в:
родитель
5cae33503d
Коммит
4fbe68dd86
@ -5,16 +5,16 @@
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
@ -116,10 +116,10 @@ endif
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component)
|
||||
mca_btl_openib_la_SOURCES = $(component_sources)
|
||||
mca_btl_openib_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS)
|
||||
mca_btl_openib_la_LDFLAGS = -module -avoid-version $(btl_openib_LDFLAGS)
|
||||
mca_btl_openib_la_LIBADD = $(btl_openib_LIBS)
|
||||
|
||||
noinst_LTLIBRARIES = $(lib)
|
||||
libmca_btl_openib_la_SOURCES = $(lib_sources)
|
||||
libmca_btl_openib_la_LDFLAGS= -module -avoid-version $(btl_openib_LDFLAGS)
|
||||
libmca_btl_openib_la_LDFLAGS= -module -avoid-version $(btl_openib_LDFLAGS)
|
||||
libmca_btl_openib_la_LIBADD = $(btl_openib_LIBS)
|
||||
|
@ -413,8 +413,8 @@ static int mca_btl_openib_size_queues(struct mca_btl_openib_module_t* openib_btl
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (0 == openib_btl->num_peers &&
|
||||
(mca_btl_openib_component.num_srq_qps > 0 ||
|
||||
if (0 == openib_btl->num_peers &&
|
||||
(mca_btl_openib_component.num_srq_qps > 0 ||
|
||||
mca_btl_openib_component.num_xrc_qps > 0)) {
|
||||
rc = create_srq(openib_btl);
|
||||
}
|
||||
@ -426,7 +426,7 @@ out:
|
||||
|
||||
mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib_module_t* openib_btl)
|
||||
{
|
||||
/* If we have a driver with RDMAoE supporting as the device struct contains the same type (IB) for
|
||||
/* If we have a driver with RDMAoE supporting as the device struct contains the same type (IB) for
|
||||
IBV_LINK_LAYER_INFINIBAND and IBV_LINK_LAYER_ETHERNET link layers and the single way
|
||||
to detect this fact is to check their link_layer fields in a port_attr struct.
|
||||
If our driver doesn't support this feature => the checking of transport type in device struct will be enough.
|
||||
@ -455,7 +455,7 @@ mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib
|
||||
case IBV_TRANSPORT_IWARP:
|
||||
return MCA_BTL_OPENIB_TRANSPORT_IWARP;
|
||||
|
||||
case IBV_TRANSPORT_UNKNOWN:
|
||||
case IBV_TRANSPORT_UNKNOWN:
|
||||
default:
|
||||
return MCA_BTL_OPENIB_TRANSPORT_UNKNOWN;
|
||||
}
|
||||
@ -464,7 +464,7 @@ mca_btl_openib_transport_type_t mca_btl_openib_get_transport_type(mca_btl_openib
|
||||
#endif
|
||||
}
|
||||
|
||||
static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
int ret = OMPI_SUCCESS;
|
||||
@ -485,7 +485,7 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
endpoint->rem_info.rem_vendor_id,
|
||||
endpoint->rem_info.rem_vendor_part_id,
|
||||
mca_btl_openib_transport_name_strings[endpoint->rem_info.rem_transport_type]);
|
||||
|
||||
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
@ -503,7 +503,7 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
}
|
||||
|
||||
if(openib_btl->device->mtu < endpoint->rem_info.rem_mtu) {
|
||||
endpoint->rem_info.rem_mtu = openib_btl->device->mtu;
|
||||
endpoint->rem_info.rem_mtu = openib_btl->device->mtu;
|
||||
}
|
||||
|
||||
endpoint->use_eager_rdma = openib_btl->device->use_eager_rdma &
|
||||
@ -520,11 +520,11 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
case BTL_OPENIB_RQ_SOURCE_MAX:
|
||||
break;
|
||||
|
||||
/* If the queues configuration was set from command line
|
||||
/* If the queues configuration was set from command line
|
||||
(with --mca btl_openib_receive_queues parameter) => both sides have a same configuration */
|
||||
|
||||
/* In this case the local queues configuration was gotten from INI file =>
|
||||
not possible that remote side got its queues configuration from command line =>
|
||||
not possible that remote side got its queues configuration from command line =>
|
||||
(by prio) the configuration was set from INI file or (if not configure)
|
||||
by default queues configuration */
|
||||
case BTL_OPENIB_RQ_SOURCE_DEVICE_INI:
|
||||
@ -552,7 +552,7 @@ static int mca_btl_openib_tune_endpoint(mca_btl_openib_module_t* openib_btl,
|
||||
}
|
||||
break;
|
||||
|
||||
/* If the local queues configuration was set
|
||||
/* If the local queues configuration was set
|
||||
by default queues => check all possible cases for remote side and compare */
|
||||
case BTL_OPENIB_RQ_SOURCE_DEFAULT:
|
||||
if(NULL != values.receive_queues) {
|
||||
@ -628,7 +628,7 @@ int mca_btl_openib_add_procs(
|
||||
|
||||
opal_output(-1, "add procs: adding proc %d", i);
|
||||
|
||||
/* OOB, XOOB, RDMACM, IBCM do not support SELF communication, so
|
||||
/* OOB, XOOB, RDMACM, IBCM do not support SELF communication, so
|
||||
* mark the proc as unreachable by openib btl */
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields
|
||||
(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, &ompi_proc->proc_name)) {
|
||||
@ -705,7 +705,7 @@ int mca_btl_openib_add_procs(
|
||||
on the peer has a matching CPC. */
|
||||
assert(btl_rank <= ib_proc->proc_port_count);
|
||||
assert(remote_matching_port != -1);
|
||||
if (OMPI_SUCCESS !=
|
||||
if (OMPI_SUCCESS !=
|
||||
ompi_btl_openib_connect_base_find_match(openib_btl,
|
||||
&(ib_proc->proc_ports[remote_matching_port]),
|
||||
&local_cpc,
|
||||
@ -751,8 +751,8 @@ int mca_btl_openib_add_procs(
|
||||
}
|
||||
}
|
||||
#endif
|
||||
mca_btl_openib_endpoint_init(openib_btl, endpoint,
|
||||
local_cpc,
|
||||
mca_btl_openib_endpoint_init(openib_btl, endpoint,
|
||||
local_cpc,
|
||||
&(ib_proc->proc_ports[remote_matching_port]),
|
||||
remote_cpc_data);
|
||||
|
||||
@ -1139,7 +1139,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
|
||||
|
||||
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64
|
||||
" frag->segment.seg_key.key32[0] = %" PRIu32,
|
||||
frag->sg_entry.lkey,
|
||||
frag->sg_entry.lkey,
|
||||
frag->sg_entry.addr,
|
||||
frag->sg_entry.lkey));
|
||||
|
||||
@ -1260,7 +1260,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
|
||||
|
||||
BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64 " "
|
||||
"frag->segment.seg_key.key32[0] = %" PRIu32,
|
||||
frag->sg_entry.lkey,
|
||||
frag->sg_entry.lkey,
|
||||
frag->sg_entry.addr,
|
||||
openib_reg->mr->rkey));
|
||||
|
||||
@ -1317,7 +1317,7 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
|
||||
opal_hash_table_t *srq_addr_table =
|
||||
&mca_btl_openib_component.srq_manager.srq_addr_table;
|
||||
|
||||
opal_mutex_lock(lock);
|
||||
opal_mutex_lock(lock);
|
||||
if (OPAL_SUCCESS !=
|
||||
opal_hash_table_remove_value_ptr(srq_addr_table,
|
||||
&openib_btl->qps[qp].u.srq_qp.srq,
|
||||
@ -1353,7 +1353,7 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
|
||||
}
|
||||
|
||||
if (NULL != openib_btl->qps) {
|
||||
free(openib_btl->qps);
|
||||
free(openib_btl->qps);
|
||||
}
|
||||
|
||||
return rc;
|
||||
@ -1399,7 +1399,7 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
|
||||
|
||||
/*
|
||||
* Send immediate - Minimum function calls minimum checks, send the data ASAP.
|
||||
* If BTL can't to send the messages imidiate, it creates messages descriptor
|
||||
* If BTL can't to send the messages imidiate, it creates messages descriptor
|
||||
* returns it to PML.
|
||||
*/
|
||||
int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
@ -1411,14 +1411,14 @@ int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
uint8_t order,
|
||||
uint32_t flags,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t** descriptor)
|
||||
mca_btl_base_descriptor_t** descriptor)
|
||||
{
|
||||
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
|
||||
size_t size = payload_size + header_size;
|
||||
size_t eager_limit;
|
||||
int rc,
|
||||
qp = frag_size_to_order(obtl, size),
|
||||
prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY),
|
||||
int rc,
|
||||
qp = frag_size_to_order(obtl, size),
|
||||
prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY),
|
||||
ib_rc;
|
||||
int32_t cm_return;
|
||||
bool do_rdma = false;
|
||||
|
@ -142,7 +142,7 @@ typedef enum {
|
||||
/* The structure for managing all BTL SRQs */
|
||||
typedef struct mca_btl_openib_srq_manager_t {
|
||||
opal_mutex_t lock;
|
||||
/* The keys of this hash table are addresses of
|
||||
/* The keys of this hash table are addresses of
|
||||
SRQs structures, and the elements are BTL modules
|
||||
pointers that associated with these SRQs */
|
||||
opal_hash_table_t srq_addr_table;
|
||||
@ -405,7 +405,7 @@ struct mca_btl_openib_module_srq_qp_t {
|
||||
/** We post additional WQEs only if the number of WQEs (in a specific SRQ) is less than this value.
|
||||
The value increased together with rd_curr_num. The value is unique for every SRQ. */
|
||||
int32_t rd_low_local;
|
||||
/** The flag points if we want to get the
|
||||
/** The flag points if we want to get the
|
||||
IBV_EVENT_SRQ_LIMIT_REACHED events for dynamically resizing SRQ */
|
||||
bool srq_limit_event_flag;
|
||||
/**< In difference of the "--mca enable_srq_resize" parameter that says, if we want(or no)
|
||||
@ -580,7 +580,7 @@ extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl,
|
||||
uint32_t flags,
|
||||
mca_btl_base_tag_t tag,
|
||||
mca_btl_base_descriptor_t** descriptor
|
||||
);
|
||||
);
|
||||
|
||||
/**
|
||||
* PML->BTL Initiate a put of the specified size.
|
||||
|
@ -141,7 +141,7 @@ static int btl_openib_async_poll_init(struct mca_btl_openib_async_poll *devices_
|
||||
}
|
||||
|
||||
/* Send command completion to main thread */
|
||||
static int send_command_comp(int in)
|
||||
static int send_command_comp(int in)
|
||||
{
|
||||
if (write(mca_btl_openib_component.async_comp_pipe[1], &in, sizeof(int)) < 0) {
|
||||
BTL_ERROR(("Write failed [%d]",errno));
|
||||
@ -227,7 +227,7 @@ static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *devices_p
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/* The main idea of resizing SRQ algorithm -
|
||||
/* The main idea of resizing SRQ algorithm -
|
||||
We create a SRQ with size = rd_num, but for efficient usage of resources
|
||||
the number of WQEs that we post = rd_curr_num < rd_num and this value is
|
||||
increased (by needs) in IBV_EVENT_SRQ_LIMIT_REACHED event handler (i.e. in this function),
|
||||
@ -327,7 +327,7 @@ static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_po
|
||||
BTL_ERROR(("Alternative path migration event reported"));
|
||||
if (APM_ENABLED) {
|
||||
BTL_ERROR(("Trying to find additional path..."));
|
||||
if (!xrc_event)
|
||||
if (!xrc_event)
|
||||
mca_btl_openib_load_apm(event.element.qp,
|
||||
qp2endpoint(event.element.qp, device));
|
||||
#if HAVE_XRC
|
||||
@ -472,7 +472,7 @@ void* btl_openib_async_thread(void * async)
|
||||
return PTHREAD_CANCELED;
|
||||
}
|
||||
|
||||
int btl_openib_async_command_done(int exp)
|
||||
int btl_openib_async_command_done(int exp)
|
||||
{
|
||||
int comp;
|
||||
if (read(mca_btl_openib_component.async_comp_pipe[0], &comp,
|
||||
|
@ -284,27 +284,27 @@ static int btl_openib_modex_send(void)
|
||||
* c. a uint8_t indicating the length of the blob to follow
|
||||
* d. a blob that is only meaningful to that CPC
|
||||
*/
|
||||
msg_size =
|
||||
msg_size =
|
||||
/* uint8_t for number of modules in the message */
|
||||
1 +
|
||||
/* For each module: */
|
||||
mca_btl_openib_component.ib_num_btls *
|
||||
mca_btl_openib_component.ib_num_btls *
|
||||
(
|
||||
/* Common module data */
|
||||
modex_message_size +
|
||||
modex_message_size +
|
||||
/* uint8_t for how many CPCs follow */
|
||||
1
|
||||
);
|
||||
/* For each module, add in the size of the per-CPC data */
|
||||
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||
for (j = 0;
|
||||
for (j = 0;
|
||||
j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
|
||||
++j) {
|
||||
msg_size +=
|
||||
msg_size +=
|
||||
/* uint8_t for the index of the CPC */
|
||||
1 +
|
||||
/* uint8_t for the CPC's priority */
|
||||
1 +
|
||||
1 +
|
||||
/* uint8_t for the blob length */
|
||||
1 +
|
||||
/* blob length */
|
||||
@ -337,15 +337,15 @@ static int btl_openib_modex_send(void)
|
||||
(mca_btl_openib_component.openib_btls[i]->port_info).transport_type =
|
||||
mca_btl_openib_get_transport_type(mca_btl_openib_component.openib_btls[i]);
|
||||
|
||||
memcpy(offset,
|
||||
&(mca_btl_openib_component.openib_btls[i]->port_info),
|
||||
memcpy(offset,
|
||||
&(mca_btl_openib_component.openib_btls[i]->port_info),
|
||||
size);
|
||||
opal_output(-1, "modex packed btl port modex message: 0x%" PRIx64 ", %d, %d (size: %d)",
|
||||
mca_btl_openib_component.openib_btls[i]->port_info.subnet_id,
|
||||
mca_btl_openib_component.openib_btls[i]->port_info.mtu,
|
||||
mca_btl_openib_component.openib_btls[i]->port_info.lid,
|
||||
(int) size);
|
||||
|
||||
|
||||
#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
|
||||
MCA_BTL_OPENIB_MODEX_MSG_HTON(*(mca_btl_openib_modex_message_t *)offset);
|
||||
#endif
|
||||
@ -354,20 +354,20 @@ static int btl_openib_modex_send(void)
|
||||
i, (int) (offset -message));
|
||||
|
||||
/* Pack the number of CPCs that follow */
|
||||
pack8(&offset,
|
||||
pack8(&offset,
|
||||
mca_btl_openib_component.openib_btls[i]->num_cpcs);
|
||||
opal_output(-1, "modex packed btl %d: to pack %d cpcs (packed: %d, offset now %d)",
|
||||
i, mca_btl_openib_component.openib_btls[i]->num_cpcs,
|
||||
*((uint8_t*) (offset - 1)), (int) (offset-message));
|
||||
|
||||
/* Pack each CPC */
|
||||
for (j = 0;
|
||||
for (j = 0;
|
||||
j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
|
||||
++j) {
|
||||
uint8_t u8;
|
||||
|
||||
cpc = mca_btl_openib_component.openib_btls[i]->cpcs[j];
|
||||
opal_output(-1, "modex packed btl %d: packing cpc %s",
|
||||
opal_output(-1, "modex packed btl %d: packing cpc %s",
|
||||
i, cpc->data.cbm_component->cbc_name);
|
||||
/* Pack the CPC index */
|
||||
u8 = ompi_btl_openib_connect_base_get_cpc_index(cpc->data.cbm_component);
|
||||
@ -434,7 +434,7 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
|
||||
case MCA_BTL_OPENIB_CONTROL_RDMA:
|
||||
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)ctl_hdr;
|
||||
|
||||
BTL_VERBOSE(("prior to NTOH received rkey %" PRIu32
|
||||
BTL_VERBOSE(("prior to NTOH received rkey %" PRIu32
|
||||
", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32,
|
||||
rdma_hdr->rkey,
|
||||
rdma_hdr->rdma_start.lval,
|
||||
@ -446,7 +446,7 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
|
||||
BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(*rdma_hdr);
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("received rkey %" PRIu32
|
||||
BTL_VERBOSE(("received rkey %" PRIu32
|
||||
", rdma_start.lval %" PRIx64 ", pval %p,"
|
||||
" ival %" PRIu32, rdma_hdr->rkey,
|
||||
rdma_hdr->rdma_start.lval,
|
||||
@ -633,7 +633,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
BTL_VERBOSE(("my iWARP subnet_id is %016" PRIx64, subnet_id));
|
||||
} else {
|
||||
memset(&gid, 0, sizeof(gid));
|
||||
if (0 != ibv_query_gid(device->ib_dev_context, port_num,
|
||||
if (0 != ibv_query_gid(device->ib_dev_context, port_num,
|
||||
mca_btl_openib_component.gid_index, &gid)) {
|
||||
BTL_ERROR(("ibv_query_gid failed (%s:%d, %d)\n",
|
||||
ibv_get_device_name(device->ib_dev), port_num,
|
||||
@ -643,7 +643,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
|
||||
#ifdef OMPI_HAVE_RDMAOE
|
||||
if (IBV_LINK_LAYER_ETHERNET == ib_port_attr->link_layer) {
|
||||
subnet_id = mca_btl_openib_get_ip_subnet_id(device->ib_dev,
|
||||
subnet_id = mca_btl_openib_get_ip_subnet_id(device->ib_dev,
|
||||
port_num);
|
||||
} else {
|
||||
subnet_id = ntoh64(gid.global.subnet_prefix);
|
||||
@ -652,11 +652,11 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
subnet_id = ntoh64(gid.global.subnet_prefix);
|
||||
#endif
|
||||
|
||||
BTL_VERBOSE(("my IB subnet_id for HCA %s port %d is %016" PRIx64,
|
||||
BTL_VERBOSE(("my IB subnet_id for HCA %s port %d is %016" PRIx64,
|
||||
ibv_get_device_name(device->ib_dev), port_num, subnet_id));
|
||||
}
|
||||
#else
|
||||
if (0 != ibv_query_gid(device->ib_dev_context, port_num,
|
||||
if (0 != ibv_query_gid(device->ib_dev_context, port_num,
|
||||
mca_btl_openib_component.gid_index, &gid)) {
|
||||
BTL_ERROR(("ibv_query_gid failed (%s:%d, %d)\n",
|
||||
ibv_get_device_name(device->ib_dev), port_num,
|
||||
@ -664,7 +664,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
return OMPI_ERR_NOT_FOUND;
|
||||
}
|
||||
subnet_id = ntoh64(gid.global.subnet_prefix);
|
||||
BTL_VERBOSE(("my IB-only subnet_id for HCA %s port %d is %016" PRIx64,
|
||||
BTL_VERBOSE(("my IB-only subnet_id for HCA %s port %d is %016" PRIx64,
|
||||
ibv_get_device_name(device->ib_dev), port_num, subnet_id));
|
||||
#endif
|
||||
|
||||
@ -868,9 +868,9 @@ static void device_construct(mca_btl_openib_device_t *device)
|
||||
#endif
|
||||
device->qps = NULL;
|
||||
#if OPAL_HAVE_THREADS
|
||||
mca_btl_openib_component.async_pipe[0] =
|
||||
mca_btl_openib_component.async_pipe[0] =
|
||||
mca_btl_openib_component.async_pipe[1] = -1;
|
||||
mca_btl_openib_component.async_comp_pipe[0] =
|
||||
mca_btl_openib_component.async_comp_pipe[0] =
|
||||
mca_btl_openib_component.async_comp_pipe[1] = -1;
|
||||
#endif
|
||||
OBJ_CONSTRUCT(&device->device_lock, opal_mutex_t);
|
||||
@ -910,7 +910,7 @@ static void device_destruct(mca_btl_openib_device_t *device)
|
||||
/* wait for ok from thread */
|
||||
if (OMPI_SUCCESS != btl_openib_async_command_done(device_to_remove)){
|
||||
goto device_error;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -1006,10 +1006,10 @@ static int prepare_device_for_use(mca_btl_openib_device_t *device)
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
/* wait for ok from thread */
|
||||
if (OMPI_SUCCESS !=
|
||||
if (OMPI_SUCCESS !=
|
||||
btl_openib_async_command_done(device->ib_dev_context->async_fd)) {
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||
/* Prepare data for thread, but not starting it */
|
||||
@ -1338,7 +1338,7 @@ static int setup_qps(void)
|
||||
if (0 == opal_argv_count(queues)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"no qps in receive_queues", true,
|
||||
orte_process_info.nodename,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues);
|
||||
ret = OMPI_ERROR;
|
||||
goto error;
|
||||
@ -1357,7 +1357,7 @@ static int setup_qps(void)
|
||||
num_xrc_qps++;
|
||||
#else
|
||||
orte_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
|
||||
orte_process_info.nodename,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues);
|
||||
ret = OMPI_ERR_NOT_AVAILABLE;
|
||||
goto error;
|
||||
@ -1365,7 +1365,7 @@ static int setup_qps(void)
|
||||
} else {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"invalid qp type in receive_queues", true,
|
||||
orte_process_info.nodename,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues,
|
||||
queues[qp]);
|
||||
ret = OMPI_ERR_BAD_PARAM;
|
||||
@ -1377,7 +1377,7 @@ static int setup_qps(void)
|
||||
and SRQ */
|
||||
if (num_xrc_qps > 0 && (num_pp_qps > 0 || num_srq_qps > 0)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "XRC with PP or SRQ", true,
|
||||
orte_process_info.nodename,
|
||||
orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues);
|
||||
ret = OMPI_ERR_BAD_PARAM;
|
||||
goto error;
|
||||
@ -1385,8 +1385,8 @@ static int setup_qps(void)
|
||||
|
||||
/* Current XRC implementation can't be used with btls_per_lid > 1 */
|
||||
if (num_xrc_qps > 0 && mca_btl_openib_component.btls_per_lid > 1) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID",
|
||||
true, orte_process_info.nodename,
|
||||
orte_show_help("help-mpi-btl-openib.txt", "XRC with BTLs per LID",
|
||||
true, orte_process_info.nodename,
|
||||
mca_btl_openib_component.receive_queues, num_xrc_qps);
|
||||
ret = OMPI_ERR_BAD_PARAM;
|
||||
goto error;
|
||||
@ -1676,13 +1676,13 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
ports and QPs on this device */
|
||||
need_search = false;
|
||||
if(-2 != mca_btl_openib_component.ib_max_inline_data) {
|
||||
/* User has explicitly set btl_openib_max_inline_data MCA parameter
|
||||
/* User has explicitly set btl_openib_max_inline_data MCA parameter
|
||||
Per setup in _mca.c, we know that the MCA param value is guaranteed
|
||||
to be >= -1 */
|
||||
if (-1 == mca_btl_openib_component.ib_max_inline_data) {
|
||||
need_search = true;
|
||||
} else {
|
||||
device->max_inline_data = (uint32_t)
|
||||
device->max_inline_data = (uint32_t)
|
||||
mca_btl_openib_component.ib_max_inline_data;
|
||||
}
|
||||
} else if (values.max_inline_data_set) {
|
||||
@ -1691,20 +1691,20 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
} else if (values.max_inline_data >= 0) {
|
||||
device->max_inline_data = (uint32_t) values.max_inline_data;
|
||||
} else {
|
||||
if(default_values.max_inline_data_set &&
|
||||
if(default_values.max_inline_data_set &&
|
||||
default_values.max_inline_data >= -1) {
|
||||
BTL_ERROR(("Invalid max_inline_data value specified "
|
||||
"in INI file (%d); using default value (%d)",
|
||||
values.max_inline_data,
|
||||
"in INI file (%d); using default value (%d)",
|
||||
values.max_inline_data,
|
||||
default_values.max_inline_data));
|
||||
device->max_inline_data = (uint32_t)
|
||||
device->max_inline_data = (uint32_t)
|
||||
default_values.max_inline_data;
|
||||
} else {
|
||||
BTL_ERROR(("Invalid max_inline_data value specified "
|
||||
"in INI file (%d)", values.max_inline_data));
|
||||
ret = OMPI_ERR_BAD_PARAM;
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Horrible. :-( Per the thread starting here:
|
||||
@ -1729,7 +1729,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
orte_show_help("help-mpi-btl-openib.txt", "init-fail-create-q",
|
||||
true, orte_process_info.nodename,
|
||||
__FILE__, __LINE__, "ibv_create_cq",
|
||||
strerror(errno), errno,
|
||||
strerror(errno), errno,
|
||||
ibv_get_device_name(device->ib_dev));
|
||||
ret = OMPI_ERR_NOT_AVAILABLE;
|
||||
goto error;
|
||||
@ -1749,7 +1749,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
that's good enough */
|
||||
init_attr.cap.max_inline_data = max_inline_data = 1 << 20;
|
||||
while (max_inline_data > 0) {
|
||||
qp = ibv_create_qp(device->ib_pd, &init_attr);
|
||||
qp = ibv_create_qp(device->ib_pd, &init_attr);
|
||||
if (NULL != qp) {
|
||||
break;
|
||||
}
|
||||
@ -1779,7 +1779,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
/* Eager RDMA is not currently supported with progress threads */
|
||||
if (device->use_eager_rdma && OMPI_ENABLE_PROGRESS_THREADS) {
|
||||
device->use_eager_rdma = 0;
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"eager RDMA and progress threads", true);
|
||||
}
|
||||
|
||||
@ -1896,7 +1896,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
modules in this process. This is an unfortunate artifact
|
||||
of the openib BTL startup sequence (see below for more
|
||||
details). The first device will choose the receive_queues
|
||||
value from: (in priority order):
|
||||
value from: (in priority order):
|
||||
|
||||
1. If the btl_openib_receive_queues MCA param was
|
||||
specified, use that.
|
||||
@ -1991,7 +1991,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
shares one of the problems cited in case 8, below. So
|
||||
we need to fail this scenario; print an error and
|
||||
abort.
|
||||
|
||||
|
||||
Case 8: one INI value, different than default
|
||||
- MCA parameter: not specified
|
||||
- default receive_queues: value A
|
||||
@ -2036,7 +2036,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
Server 2:
|
||||
HCA B: no receive_queues in INI file
|
||||
HCA C: receive_queues specified in INI file
|
||||
|
||||
|
||||
A will therefore use the default receive_queues
|
||||
value. B and C will use C's INI receive_queues.
|
||||
But note that modex [currently] only sends around
|
||||
@ -2078,7 +2078,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
- device 1: receive_queues value B in INI file
|
||||
- device 2: receive_queues value B in INI file
|
||||
--> per case 8, fail with a show_help message.
|
||||
|
||||
|
||||
Case 10: two devices with different INI values
|
||||
- MCA parameter: not specified
|
||||
- default receive_queues: value A
|
||||
@ -2121,7 +2121,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
|
||||
in the INI, then it must agree with
|
||||
component.receive_queues. */
|
||||
if (NULL != values.receive_queues) {
|
||||
if (0 != strcmp(values.receive_queues,
|
||||
if (0 != strcmp(values.receive_queues,
|
||||
mca_btl_openib_component.receive_queues)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"locally conflicting receive_queues", true,
|
||||
@ -2187,7 +2187,7 @@ error:
|
||||
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"error in device init", true,
|
||||
"error in device init", true,
|
||||
orte_process_info.nodename,
|
||||
ibv_get_device_name(device->ib_dev));
|
||||
}
|
||||
@ -2453,7 +2453,7 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
support, so the following test is [currently] good enough... */
|
||||
value = opal_mem_hooks_support_level();
|
||||
#if !OPAL_HAVE_THREADS
|
||||
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
|
||||
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
|
||||
((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & value)) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"ptmalloc2 with no threads", true,
|
||||
@ -2468,7 +2468,7 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
|
||||
We have a memory manager if we have both FREE and MUNMAP
|
||||
support */
|
||||
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
|
||||
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) ==
|
||||
((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & value)) {
|
||||
ret = 0;
|
||||
index = mca_base_param_find("mpi", NULL, "leave_pinned");
|
||||
@ -2481,7 +2481,7 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
index = mca_base_param_find("mpi", NULL, "leave_pinned_pipeline");
|
||||
if (index >= 0) {
|
||||
if (OPAL_SUCCESS == mca_base_param_lookup_int(index, &value) &&
|
||||
OPAL_SUCCESS == mca_base_param_lookup_source(index, &source,
|
||||
OPAL_SUCCESS == mca_base_param_lookup_source(index, &source,
|
||||
NULL)) {
|
||||
if (0 == value && MCA_BASE_PARAM_SOURCE_DEFAULT == source) {
|
||||
++ret;
|
||||
@ -2498,10 +2498,10 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
if (index >= 0) {
|
||||
if (OPAL_SUCCESS == mca_base_param_lookup_source(index, &source,
|
||||
NULL)) {
|
||||
if (-1 == mca_btl_openib_component.ib_max_inline_data &&
|
||||
if (-1 == mca_btl_openib_component.ib_max_inline_data &&
|
||||
MCA_BASE_PARAM_SOURCE_DEFAULT == source) {
|
||||
/* If the user has not explicitly set this MCA parameter
|
||||
use max_inline_data value specified in the
|
||||
use max_inline_data value specified in the
|
||||
device-specific parameters INI file */
|
||||
mca_btl_openib_component.ib_max_inline_data = -2;
|
||||
}
|
||||
@ -2644,7 +2644,7 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
mca_btl_openib_component.async_thread = 0;
|
||||
#endif
|
||||
distance = dev_sorted[0].distance;
|
||||
for (found = false, i = 0;
|
||||
for (found = false, i = 0;
|
||||
i < num_devs && (-1 == mca_btl_openib_component.ib_max_btls ||
|
||||
mca_btl_openib_component.ib_num_btls <
|
||||
mca_btl_openib_component.ib_max_btls); i++) {
|
||||
@ -2740,7 +2740,7 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
}
|
||||
#endif
|
||||
|
||||
/* For XRC:
|
||||
/* For XRC:
|
||||
* from this point we know if MCA_BTL_XRC_ENABLED is true or false */
|
||||
|
||||
/* Init XRC IB Addr hash table */
|
||||
@ -2807,11 +2807,11 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
base device that doesn't have device->qps setup on it yet (remember
|
||||
that some modules may share the same device, so when going through
|
||||
to loop, we may hit a device that was already setup earlier in
|
||||
the loop).
|
||||
|
||||
the loop).
|
||||
|
||||
We may call prepare_device_for_use() only after adding the btl
|
||||
to mca_btl_openib_component.openib_btls, since the prepare_device_for_use
|
||||
adds device to async thread that require access to
|
||||
to mca_btl_openib_component.openib_btls, since the prepare_device_for_use
|
||||
adds device to async thread that require access to
|
||||
mca_btl_openib_component.openib_btls.
|
||||
*/
|
||||
|
||||
@ -2829,7 +2829,7 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
ret = prepare_device_for_use(device);
|
||||
if (OMPI_SUCCESS != ret) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"error in device init", true,
|
||||
"error in device init", true,
|
||||
orte_process_info.nodename,
|
||||
ibv_get_device_name(device->ib_dev));
|
||||
goto no_btls;
|
||||
@ -2856,12 +2856,12 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
opal_argv_free(mca_btl_openib_component.if_exclude_list);
|
||||
mca_btl_openib_component.if_exclude_list = NULL;
|
||||
}
|
||||
|
||||
|
||||
/* setup the fork warning message as we are sensitive
|
||||
* to memory corruption issues when fork is called
|
||||
*/
|
||||
ompi_warn_fork();
|
||||
|
||||
|
||||
return btls;
|
||||
|
||||
no_btls:
|
||||
@ -2923,7 +2923,7 @@ static int progress_no_credits_pending_frags(mca_btl_base_endpoint_t *ep)
|
||||
retransmit for us).
|
||||
*/
|
||||
for (len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
|
||||
len > 0 &&
|
||||
len > 0 &&
|
||||
(ep->eager_rdma_remote.tokens > 0 ||
|
||||
ep->qps[qp].u.pp_qp.sd_credits > 0 ||
|
||||
!BTL_OPENIB_QP_TYPE_PP(qp)); --len) {
|
||||
@ -3095,7 +3095,7 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
|
||||
if (rcredits + credits > 0) {
|
||||
int rc;
|
||||
|
||||
if (OMPI_SUCCESS !=
|
||||
if (OMPI_SUCCESS !=
|
||||
(rc = progress_no_credits_pending_frags(ep))) {
|
||||
return rc;
|
||||
}
|
||||
@ -3363,7 +3363,7 @@ error:
|
||||
* SQ and RQ WRs will automatically be flushed.
|
||||
*/
|
||||
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
|
||||
if (IBV_WC_WR_FLUSH_ERR == wc->status &&
|
||||
if (IBV_WC_WR_FLUSH_ERR == wc->status &&
|
||||
IBV_TRANSPORT_IWARP == device->ib_dev->transport_type) {
|
||||
return;
|
||||
}
|
||||
@ -3373,47 +3373,47 @@ error:
|
||||
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
|
||||
"status number %d for wr_id %" PRIx64 " opcode %d vendor error %d qp_idx %d",
|
||||
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||
wc->status, wc->wr_id,
|
||||
wc->status, wc->wr_id,
|
||||
wc->opcode, wc->vendor_err, qp));
|
||||
orte_notifier.log_peer(ORTE_NOTIFIER_CRIT, ORTE_ERR_COMM_FAILURE,
|
||||
remote_proc ? &remote_proc->proc_name : NULL,
|
||||
"\n\tIB polling %s with status %s "
|
||||
"status number %d for wr_id %llu opcode %d vendor error %d qp_idx %d",
|
||||
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||
wc->status, wc->wr_id,
|
||||
wc->status, wc->wr_id,
|
||||
wc->opcode, wc->vendor_err, qp);
|
||||
}
|
||||
|
||||
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
|
||||
IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||
char *peer_hostname =
|
||||
char *peer_hostname =
|
||||
(NULL != endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
|
||||
endpoint->endpoint_proc->proc_ompi->proc_hostname :
|
||||
endpoint->endpoint_proc->proc_ompi->proc_hostname :
|
||||
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
|
||||
const char *device_name =
|
||||
const char *device_name =
|
||||
ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
|
||||
|
||||
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
BTL_OPENIB_QP_TYPE_PP(qp) ?
|
||||
"pp rnr retry exceeded" :
|
||||
BTL_OPENIB_QP_TYPE_PP(qp) ?
|
||||
"pp rnr retry exceeded" :
|
||||
"srq rnr retry exceeded", true,
|
||||
orte_process_info.nodename, device_name,
|
||||
peer_hostname);
|
||||
orte_notifier.show_help(ORTE_NOTIFIER_CRIT, ORTE_ERR_COMM_FAILURE,
|
||||
"help-mpi-btl-openib.txt",
|
||||
BTL_OPENIB_QP_TYPE_PP(qp) ?
|
||||
"pp rnr retry exceeded" :
|
||||
BTL_OPENIB_QP_TYPE_PP(qp) ?
|
||||
"pp rnr retry exceeded" :
|
||||
"srq rnr retry exceeded",
|
||||
orte_process_info.nodename, device_name,
|
||||
peer_hostname);
|
||||
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"pp retry exceeded", true,
|
||||
orte_process_info.nodename,
|
||||
device_name, peer_hostname);
|
||||
orte_notifier.show_help(ORTE_NOTIFIER_CRIT, ORTE_ERR_COMM_FAILURE,
|
||||
"help-mpi-btl-openib.txt",
|
||||
"help-mpi-btl-openib.txt",
|
||||
"pp retry exceeded",
|
||||
orte_process_info.nodename,
|
||||
device_name, peer_hostname);
|
||||
|
@ -310,7 +310,7 @@ void mca_btl_openib_endpoint_init(mca_btl_openib_module_t *btl,
|
||||
ep->rem_info.rem_lid = remote_proc_info->pm_port_info.lid;
|
||||
ep->rem_info.rem_subnet_id = remote_proc_info->pm_port_info.subnet_id;
|
||||
ep->rem_info.rem_mtu = remote_proc_info->pm_port_info.mtu;
|
||||
opal_output(-1, "Got remote LID, subnet, MTU: %d, 0x%" PRIx64 ", %d",
|
||||
opal_output(-1, "Got remote LID, subnet, MTU: %d, 0x%" PRIx64 ", %d",
|
||||
ep->rem_info.rem_lid,
|
||||
ep->rem_info.rem_subnet_id,
|
||||
ep->rem_info.rem_mtu);
|
||||
@ -508,7 +508,7 @@ static void cts_sent(mca_btl_base_module_t* btl,
|
||||
/*
|
||||
* Send CTS control fragment
|
||||
*/
|
||||
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
|
||||
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
|
||||
{
|
||||
mca_btl_openib_send_control_frag_t *sc_frag;
|
||||
mca_btl_base_descriptor_t *base_des;
|
||||
@ -665,8 +665,8 @@ void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
|
||||
while(master && !opal_list_is_empty(&endpoint->ib_addr->pending_ep)) {
|
||||
ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep);
|
||||
ep = (mca_btl_openib_endpoint_t *)ep_item;
|
||||
if (OMPI_SUCCESS !=
|
||||
ompi_btl_openib_connect_base_start(endpoint->endpoint_local_cpc,
|
||||
if (OMPI_SUCCESS !=
|
||||
ompi_btl_openib_connect_base_start(endpoint->endpoint_local_cpc,
|
||||
ep)) {
|
||||
BTL_ERROR(("Failed to connect pending endpoint\n"));
|
||||
}
|
||||
@ -874,7 +874,7 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
|
||||
rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
|
||||
rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
|
||||
rdma_hdr->rdma_start.lval = ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval);
|
||||
BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
|
||||
BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
|
||||
", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
|
||||
rdma_hdr->rkey,
|
||||
rdma_hdr->rdma_start.lval,
|
||||
|
@ -208,7 +208,7 @@ struct mca_btl_base_endpoint_t {
|
||||
/** list of pending rget ops */
|
||||
opal_list_t pending_get_frags;
|
||||
/** list of pending rput ops */
|
||||
opal_list_t pending_put_frags;
|
||||
opal_list_t pending_put_frags;
|
||||
|
||||
/** number of available get tokens */
|
||||
int32_t get_tokens;
|
||||
@ -503,8 +503,8 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
do {
|
||||
ftr->seq = ep->eager_rdma_remote.seq;
|
||||
} while (!OPAL_ATOMIC_CMPSET_32((int32_t*) &ep->eager_rdma_remote.seq,
|
||||
(int32_t) ftr->seq,
|
||||
} while (!OPAL_ATOMIC_CMPSET_32((int32_t*) &ep->eager_rdma_remote.seq,
|
||||
(int32_t) ftr->seq,
|
||||
(int32_t) (ftr->seq+1)));
|
||||
#endif
|
||||
if(ep->nbo)
|
||||
|
@ -56,7 +56,7 @@ static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint);
|
||||
* @param qp Queue pair that had the error
|
||||
* @param remote_proc Pointer to process that had the error
|
||||
* @param endpoint Pointer to endpoint that had the error
|
||||
*/
|
||||
*/
|
||||
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
||||
mca_btl_base_descriptor_t *des,
|
||||
int qp,
|
||||
@ -84,7 +84,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
||||
* way to figure out what type of message created the error because
|
||||
* we need the information in the wc->imm_data field which does not
|
||||
* exist when we have an error. So, nothing to do here but return. */
|
||||
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
||||
if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
||||
!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
||||
"SRQ RECV type=%d", openib_frag_type(des));
|
||||
@ -108,7 +108,7 @@ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
|
||||
* B. It was some type of openib specific control message.
|
||||
* Therefore, just drop the fragments and call up into the PML to
|
||||
* disable this endpoint for future communication. */
|
||||
if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
||||
if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
|
||||
(BTL_OPENIB_QP_TYPE_PP(qp))) ||
|
||||
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) ||
|
||||
(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) {
|
||||
@ -244,7 +244,7 @@ void mca_btl_openib_handle_btl_error(mca_btl_openib_module_t* openib_btl) {
|
||||
NULL, btlname);
|
||||
if (NULL != btlname) free(btlname);
|
||||
|
||||
/* Now send out messages to all endpoints that we are disconnecting.
|
||||
/* Now send out messages to all endpoints that we are disconnecting.
|
||||
* Only do this to endpoints that are connected. Otherwise, the
|
||||
* remote side does not yet have the information on this endpoint. */
|
||||
for (i = 0; i < opal_pointer_array_get_size(openib_btl->device->endpoints); i++) {
|
||||
@ -372,7 +372,7 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
||||
opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
|
||||
"IB: rank=%d, control message (remote=%d), "
|
||||
"moved local head by one (new=%d)",
|
||||
ORTE_PROC_MY_NAME->vpid,
|
||||
ORTE_PROC_MY_NAME->vpid,
|
||||
newep->endpoint_proc->proc_ompi->proc_name.vpid,
|
||||
newep->eager_rdma_local.head);
|
||||
} else {
|
||||
@ -399,9 +399,9 @@ void btl_openib_handle_failover_control_messages(mca_btl_openib_control_header_t
|
||||
* and call the callback function with OMPI_ERROR. It walks through
|
||||
* each qp with each priority and looks for both no_credits_pending_frags
|
||||
* and no_wqe_pending_frags. It then looks for any pending_lazy_frags,
|
||||
* pending_put_frags, and pending_get_frags. This function is only
|
||||
* pending_put_frags, and pending_get_frags. This function is only
|
||||
* called when running with failover support enabled. Note that
|
||||
* the errout parameter allows the function to also be used as a
|
||||
* the errout parameter allows the function to also be used as a
|
||||
* debugging tool to see if there are any fragments on any of the
|
||||
* queues.
|
||||
* @param ep Pointer to endpoint that had error
|
||||
@ -753,7 +753,7 @@ void mca_btl_openib_dump_all_local_rdma_frags(mca_btl_openib_device_t *device) {
|
||||
|
||||
/**
|
||||
* This function is a debugging tool. If you notify a hang, you can
|
||||
* call this function from a debugger and see if there are any
|
||||
* call this function from a debugger and see if there are any
|
||||
* messages stuck in any of the queues. If you call it with
|
||||
* errout=true, then it will error them out. Otherwise, it will
|
||||
* just print out the size of the queues with data in them.
|
||||
@ -769,7 +769,7 @@ void mca_btl_openib_dump_all_internal_queues(bool errout) {
|
||||
btl = mca_btl_openib_component.openib_btls[i];
|
||||
module = &btl->super;
|
||||
num_eps = opal_pointer_array_get_size(btl->device->endpoints);
|
||||
|
||||
|
||||
/* Now, find the endpoint associated with it */
|
||||
for (j = 0; j < num_eps; j++) {
|
||||
ep = (mca_btl_openib_endpoint_t*)
|
||||
|
@ -3,9 +3,9 @@
|
||||
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -174,7 +174,7 @@ static int service_pipe_cmd_add_fd(bool use_libevent, cmd_t *cmd)
|
||||
if (use_libevent) {
|
||||
/* Make an event for this fd */
|
||||
ri->ri_event_used = true;
|
||||
opal_event_set(opal_event_base, &ri->ri_event, ri->ri_fd,
|
||||
opal_event_set(opal_event_base, &ri->ri_event, ri->ri_fd,
|
||||
ri->ri_flags | OPAL_EV_PERSIST, service_fd_callback,
|
||||
ri);
|
||||
opal_event_add(&ri->ri_event, 0);
|
||||
@ -248,13 +248,13 @@ static int service_pipe_cmd_remove_fd(cmd_t *cmd)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Let the caller know that we have stopped monitoring
|
||||
this fd (if they care) */
|
||||
if (NULL != cmd->pc_fn.event) {
|
||||
cmd->pc_fn.event(cmd->pc_fd, 0, cmd->pc_context);
|
||||
}
|
||||
|
||||
|
||||
/* Remove this item from the list of registered items and
|
||||
release it */
|
||||
opal_list_remove_item(&registered_items, item);
|
||||
@ -344,7 +344,7 @@ static bool service_pipe_cmd(void)
|
||||
--waiting_for_ack_from_main_thread;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
OPAL_OUTPUT((-1, "fd service thread: unknown pipe command!"));
|
||||
break;
|
||||
@ -390,7 +390,7 @@ static void *service_thread_start(void *context)
|
||||
break;
|
||||
}
|
||||
OPAL_OUTPUT((-1, "fd service thread: back from pipe command"));
|
||||
}
|
||||
}
|
||||
|
||||
/* Go through all the registered events and see who had
|
||||
activity */
|
||||
@ -414,7 +414,7 @@ static void *service_thread_start(void *context)
|
||||
/* If either was ready, invoke the callback */
|
||||
if (0 != flags) {
|
||||
OPAL_OUTPUT((-1, "fd service thread: invoking callback for registered fd %d", ri->ri_fd));
|
||||
ri->ri_callback.event(ri->ri_fd, flags,
|
||||
ri->ri_callback.event(ri->ri_fd, flags,
|
||||
ri->ri_context);
|
||||
OPAL_OUTPUT((-1, "fd service thread: back from callback for registered fd %d", ri->ri_fd));
|
||||
}
|
||||
@ -443,7 +443,7 @@ static void main_thread_event_callback(int fd, short event, void *context)
|
||||
break;
|
||||
|
||||
default:
|
||||
OPAL_OUTPUT((-1, "fd main thread: unknown pipe command: %d",
|
||||
OPAL_OUTPUT((-1, "fd main thread: unknown pipe command: %d",
|
||||
cmd.pc_cmd));
|
||||
break;
|
||||
}
|
||||
@ -481,12 +481,12 @@ int ompi_btl_openib_fd_init(void)
|
||||
/* Create a libevent event that is used in the main thread
|
||||
to watch its pipe */
|
||||
opal_event_set(opal_event_base, &main_thread_event, pipe_to_main_thread[0],
|
||||
OPAL_EV_READ | OPAL_EV_PERSIST,
|
||||
OPAL_EV_READ | OPAL_EV_PERSIST,
|
||||
main_thread_event_callback, NULL);
|
||||
opal_event_add(&main_thread_event, 0);
|
||||
|
||||
|
||||
/* Start the service thread */
|
||||
if (0 != pthread_create(&thread, NULL, service_thread_start,
|
||||
if (0 != pthread_create(&thread, NULL, service_thread_start,
|
||||
NULL)) {
|
||||
int errno_save = errno;
|
||||
opal_event_del(&main_thread_event);
|
||||
@ -509,7 +509,7 @@ int ompi_btl_openib_fd_init(void)
|
||||
* Start monitoring an fd
|
||||
* Called by main or service thread; callback will be in service thread
|
||||
*/
|
||||
int ompi_btl_openib_fd_monitor(int fd, int flags,
|
||||
int ompi_btl_openib_fd_monitor(int fd, int flags,
|
||||
ompi_btl_openib_fd_event_callback_fn_t *callback,
|
||||
void *context)
|
||||
{
|
||||
@ -542,7 +542,7 @@ int ompi_btl_openib_fd_monitor(int fd, int flags,
|
||||
* Stop monitoring an fd
|
||||
* Called by main or service thread; callback will be in service thread
|
||||
*/
|
||||
int ompi_btl_openib_fd_unmonitor(int fd,
|
||||
int ompi_btl_openib_fd_unmonitor(int fd,
|
||||
ompi_btl_openib_fd_event_callback_fn_t *callback,
|
||||
void *context)
|
||||
{
|
||||
@ -552,7 +552,7 @@ int ompi_btl_openib_fd_unmonitor(int fd,
|
||||
if (fd < 0) {
|
||||
return OMPI_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
|
||||
cmd.pc_cmd = CMD_REMOVE_FD;
|
||||
cmd.pc_fd = fd;
|
||||
cmd.pc_flags = 0;
|
||||
@ -630,7 +630,7 @@ ompi_btl_openib_fd_main_thread_drain(void)
|
||||
int nfds, ret;
|
||||
fd_set rfds;
|
||||
struct timeval tv;
|
||||
|
||||
|
||||
while (1) {
|
||||
FD_ZERO(&rfds);
|
||||
FD_SET(pipe_to_main_thread[0], &rfds);
|
||||
@ -665,10 +665,10 @@ int ompi_btl_openib_fd_finalize(void)
|
||||
memset(&cmd, 0, cmd_size);
|
||||
cmd.pc_cmd = CMD_TIME_TO_QUIT;
|
||||
opal_fd_write(pipe_to_service_thread[1], cmd_size, &cmd);
|
||||
|
||||
|
||||
pthread_join(thread, NULL);
|
||||
opal_atomic_rmb();
|
||||
|
||||
|
||||
opal_event_del(&main_thread_event);
|
||||
|
||||
close(pipe_to_service_thread[0]);
|
||||
|
@ -3,9 +3,9 @@
|
||||
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -19,7 +19,7 @@ BEGIN_C_DECLS
|
||||
/**
|
||||
* Typedef for fd callback function
|
||||
*/
|
||||
typedef void *(ompi_btl_openib_fd_event_callback_fn_t)(int fd, int flags,
|
||||
typedef void *(ompi_btl_openib_fd_event_callback_fn_t)(int fd, int flags,
|
||||
void *context);
|
||||
|
||||
/**
|
||||
@ -37,7 +37,7 @@ int ompi_btl_openib_fd_init(void);
|
||||
* Start monitoring an fd.
|
||||
* Called by main or service thread; callback will be in service thread.
|
||||
*/
|
||||
int ompi_btl_openib_fd_monitor(int fd, int flags,
|
||||
int ompi_btl_openib_fd_monitor(int fd, int flags,
|
||||
ompi_btl_openib_fd_event_callback_fn_t *callback,
|
||||
void *context);
|
||||
|
||||
@ -45,7 +45,7 @@ int ompi_btl_openib_fd_monitor(int fd, int flags,
|
||||
* Stop monitoring an fd.
|
||||
* Called by main or service thread; callback will be in service thread.
|
||||
*/
|
||||
int ompi_btl_openib_fd_unmonitor(int fd,
|
||||
int ompi_btl_openib_fd_unmonitor(int fd,
|
||||
ompi_btl_openib_fd_event_callback_fn_t *callback,
|
||||
void *context);
|
||||
|
||||
|
@ -110,7 +110,7 @@ int ompi_btl_openib_ini_init(void)
|
||||
#ifndef __WINDOWS__
|
||||
char separator = ':';
|
||||
#else
|
||||
/* ':' is part of the path on Windows,
|
||||
/* ':' is part of the path on Windows,
|
||||
so use ';' instead. */
|
||||
char separator = ';';
|
||||
#endif
|
||||
@ -414,7 +414,7 @@ static int parse_line(parsed_section_values_t *sv)
|
||||
|
||||
else if (0 == strcasecmp(key_buffer, "rdmacm_reject_causes_connect_error")) {
|
||||
/* Single value */
|
||||
sv->values.rdmacm_reject_causes_connect_error =
|
||||
sv->values.rdmacm_reject_causes_connect_error =
|
||||
(bool) ompi_btl_openib_ini_intify(value);
|
||||
sv->values.rdmacm_reject_causes_connect_error_set = true;
|
||||
}
|
||||
@ -558,7 +558,7 @@ static int save_section(parsed_section_values_t *s)
|
||||
}
|
||||
|
||||
if (NULL != s->values.receive_queues) {
|
||||
h->values.receive_queues =
|
||||
h->values.receive_queues =
|
||||
strdup(s->values.receive_queues);
|
||||
}
|
||||
|
||||
@ -568,9 +568,9 @@ static int save_section(parsed_section_values_t *s)
|
||||
}
|
||||
|
||||
if (s->values.rdmacm_reject_causes_connect_error_set) {
|
||||
h->values.rdmacm_reject_causes_connect_error =
|
||||
h->values.rdmacm_reject_causes_connect_error =
|
||||
s->values.rdmacm_reject_causes_connect_error;
|
||||
h->values.rdmacm_reject_causes_connect_error_set =
|
||||
h->values.rdmacm_reject_causes_connect_error_set =
|
||||
true;
|
||||
}
|
||||
|
||||
|
@ -31,9 +31,9 @@
|
||||
#include "btl_openib_ip.h"
|
||||
#if OMPI_HAVE_RDMACM
|
||||
|
||||
/*
|
||||
/*
|
||||
* The cruft below maintains the linked list of rdma ipv4 addresses and their
|
||||
* associated rdma device names and device port numbers.
|
||||
* associated rdma device names and device port numbers.
|
||||
*/
|
||||
struct rdma_addr_list {
|
||||
opal_list_item_t super;
|
||||
@ -45,7 +45,7 @@ struct rdma_addr_list {
|
||||
};
|
||||
typedef struct rdma_addr_list rdma_addr_list_t;
|
||||
|
||||
static OBJ_CLASS_INSTANCE(rdma_addr_list_t, opal_list_item_t,
|
||||
static OBJ_CLASS_INSTANCE(rdma_addr_list_t, opal_list_item_t,
|
||||
NULL, NULL);
|
||||
static opal_list_t *myaddrs = NULL;
|
||||
|
||||
@ -54,7 +54,7 @@ static char *stringify(uint32_t addr)
|
||||
{
|
||||
static char line[64];
|
||||
memset(line, 0, sizeof(line));
|
||||
snprintf(line, sizeof(line) - 1, "%d.%d.%d.%d (0x%x)",
|
||||
snprintf(line, sizeof(line) - 1, "%d.%d.%d.%d (0x%x)",
|
||||
#if defined(WORDS_BIGENDIAN)
|
||||
(addr >> 24),
|
||||
(addr >> 16) & 0xff,
|
||||
@ -119,7 +119,7 @@ uint64_t mca_btl_openib_get_ip_subnet_id(struct ibv_device *ib_dev,
|
||||
* mismatch if IP Aliases are being used. For more information on
|
||||
* this, please read comment above mca_btl_openib_get_ip_subnet_id.
|
||||
*/
|
||||
uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs,
|
||||
uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs,
|
||||
uint8_t port)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
@ -135,7 +135,7 @@ uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs,
|
||||
item != opal_list_get_end(myaddrs);
|
||||
item = opal_list_get_next(item)) {
|
||||
struct rdma_addr_list *addr = (struct rdma_addr_list *)item;
|
||||
if (!strcmp(addr->dev_name, verbs->device->name) &&
|
||||
if (!strcmp(addr->dev_name, verbs->device->name) &&
|
||||
port == addr->dev_port) {
|
||||
BTL_VERBOSE(("FOUND: %s:%d is %s",
|
||||
ibv_get_device_name(verbs->device), port,
|
||||
@ -219,7 +219,7 @@ static int ipaddr_specified(struct sockaddr_in *ipaddr, uint32_t netmask)
|
||||
subnet = ntohl(ipaddr->sin_addr.s_addr) & ~(all >> netmask);
|
||||
opal_argv_free(temp);
|
||||
|
||||
if (subnet == list_subnet) {
|
||||
if (subnet == list_subnet) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -261,7 +261,7 @@ static int ipaddr_specified(struct sockaddr_in *ipaddr, uint32_t netmask)
|
||||
subnet = ntohl(ipaddr->sin_addr.s_addr) & ~(all >> netmask);
|
||||
opal_argv_free(temp);
|
||||
|
||||
if (subnet == list_subnet) {
|
||||
if (subnet == list_subnet) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@ -282,7 +282,7 @@ static int add_rdma_addr(struct sockaddr *ipaddr, uint32_t netmask)
|
||||
/* Ensure that this IP address is not in 127.0.0.1/8. If it is,
|
||||
skip it because we never want loopback addresses to be
|
||||
considered RDMA devices that remote peers can use to connect
|
||||
to.
|
||||
to.
|
||||
|
||||
This check is necessary because of a change that almost went
|
||||
into RDMA CM in OFED 1.5.1. We asked for a delay so that we
|
||||
@ -356,11 +356,11 @@ static int add_rdma_addr(struct sockaddr *ipaddr, uint32_t netmask)
|
||||
|
||||
myaddr->addr = sinp->sin_addr.s_addr;
|
||||
myaddr->subnet = ntohl(myaddr->addr) & ~(all >> netmask);
|
||||
inet_ntop(sinp->sin_family, &sinp->sin_addr,
|
||||
inet_ntop(sinp->sin_family, &sinp->sin_addr,
|
||||
myaddr->addr_str, sizeof(myaddr->addr_str));
|
||||
memcpy(myaddr->dev_name, cm_id->verbs->device->name, IBV_SYSFS_NAME_MAX);
|
||||
myaddr->dev_port = cm_id->port_num;
|
||||
BTL_VERBOSE(("Adding addr %s (0x%x) subnet 0x%x as %s:%d",
|
||||
BTL_VERBOSE(("Adding addr %s (0x%x) subnet 0x%x as %s:%d",
|
||||
myaddr->addr_str, myaddr->addr, myaddr->subnet,
|
||||
myaddr->dev_name, myaddr->dev_port));
|
||||
|
||||
@ -400,7 +400,7 @@ int mca_btl_openib_build_rdma_addr_list(void)
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
void mca_btl_openib_free_rdma_addr_list(void)
|
||||
{
|
||||
opal_list_item_t *item, *next;
|
||||
@ -419,27 +419,27 @@ void mca_btl_openib_free_rdma_addr_list(void)
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
#else
|
||||
/* !OMPI_HAVE_RDMACM case */
|
||||
|
||||
uint64_t mca_btl_openib_get_ip_subnet_id(struct ibv_device *ib_dev,
|
||||
uint8_t port)
|
||||
uint8_t port)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs,
|
||||
uint8_t port)
|
||||
uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs,
|
||||
uint8_t port)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mca_btl_openib_build_rdma_addr_list(void)
|
||||
int mca_btl_openib_build_rdma_addr_list(void)
|
||||
{
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
void mca_btl_openib_free_rdma_addr_list(void)
|
||||
void mca_btl_openib_free_rdma_addr_list(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
@ -32,7 +32,7 @@ extern uint64_t mca_btl_openib_get_ip_subnet_id(struct ibv_device *ib_dev,
|
||||
* @param port (IN) physical port of the IBV device
|
||||
* @return IPv4 Address
|
||||
*/
|
||||
extern uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs,
|
||||
extern uint32_t mca_btl_openib_rdma_get_ipv4addr(struct ibv_context *verbs,
|
||||
uint8_t port);
|
||||
|
||||
/**
|
||||
|
@ -6,15 +6,15 @@
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -72,13 +72,13 @@ NAME_CHAR [A-Za-z0-9_\-\.\\\/]
|
||||
<comment>[^*\n]* ; /* Eat up non '*'s */
|
||||
<comment>"*"+[^*/\n]* ; /* Eat '*'s not followed by a '/' */
|
||||
<comment>\n { ++btl_openib_ini_yynewlines;
|
||||
return BTL_OPENIB_INI_PARSE_NEWLINE; }
|
||||
return BTL_OPENIB_INI_PARSE_NEWLINE; }
|
||||
<comment>"*"+"/" { BEGIN(INITIAL); /* Done with block comment */
|
||||
return BTL_OPENIB_INI_PARSE_NEWLINE; }
|
||||
|
||||
{WHITE}*\[{WHITE}* { BEGIN(section_name); }
|
||||
<section_name>({NAME_CHAR}|{WHITE})*{NAME_CHAR}/{WHITE}*\] {
|
||||
BEGIN(section_end);
|
||||
BEGIN(section_end);
|
||||
return BTL_OPENIB_INI_PARSE_SECTION; }
|
||||
<section_name>\n { ++btl_openib_ini_yynewlines;
|
||||
return BTL_OPENIB_INI_PARSE_ERROR; }
|
||||
@ -87,7 +87,7 @@ NAME_CHAR [A-Za-z0-9_\-\.\\\/]
|
||||
++btl_openib_ini_yynewlines;
|
||||
return BTL_OPENIB_INI_PARSE_NEWLINE; }
|
||||
|
||||
{WHITE}*"="{WHITE}* { BEGIN(value);
|
||||
{WHITE}*"="{WHITE}* { BEGIN(value);
|
||||
return BTL_OPENIB_INI_PARSE_EQUAL; }
|
||||
{WHITE}+ ; /* whitespace */
|
||||
{CHAR}+ { return BTL_OPENIB_INI_PARSE_SINGLE_WORD; }
|
||||
@ -95,7 +95,7 @@ NAME_CHAR [A-Za-z0-9_\-\.\\\/]
|
||||
<value>{WHITE}*\n { BEGIN(INITIAL);
|
||||
++btl_openib_ini_yynewlines;
|
||||
return BTL_OPENIB_INI_PARSE_NEWLINE; }
|
||||
<value>[^\n]*[^\t \n]/[\t ]* {
|
||||
<value>[^\n]*[^\t \n]/[\t ]* {
|
||||
return BTL_OPENIB_INI_PARSE_VALUE; }
|
||||
|
||||
. { return BTL_OPENIB_INI_PARSE_ERROR; }
|
||||
@ -107,10 +107,10 @@ NAME_CHAR [A-Za-z0-9_\-\.\\\/]
|
||||
* This cleans up at the end of the parse (since, in this case, we
|
||||
* always parse the entire file) and prevents a memory leak.
|
||||
*/
|
||||
static int finish_parsing(void)
|
||||
static int finish_parsing(void)
|
||||
{
|
||||
if (NULL != YY_CURRENT_BUFFER) {
|
||||
yy_delete_buffer(YY_CURRENT_BUFFER);
|
||||
yy_delete_buffer(YY_CURRENT_BUFFER);
|
||||
#if defined(YY_CURRENT_BUFFER_LVALUE)
|
||||
YY_CURRENT_BUFFER_LVALUE = NULL;
|
||||
#else
|
||||
|
@ -64,7 +64,7 @@ enum {
|
||||
/*
|
||||
* utility routine for string parameter registration
|
||||
*/
|
||||
static int reg_string(const char* param_name,
|
||||
static int reg_string(const char* param_name,
|
||||
const char* deprecated_param_name,
|
||||
const char* param_desc,
|
||||
const char* default_value, char **out_value,
|
||||
@ -76,8 +76,8 @@ static int reg_string(const char* param_name,
|
||||
param_name, param_desc, false, false,
|
||||
default_value, &value);
|
||||
if (NULL != deprecated_param_name) {
|
||||
mca_base_param_reg_syn(index,
|
||||
&mca_btl_openib_component.super.btl_version,
|
||||
mca_base_param_reg_syn(index,
|
||||
&mca_btl_openib_component.super.btl_version,
|
||||
deprecated_param_name, true);
|
||||
}
|
||||
mca_base_param_lookup_string(index, &value);
|
||||
@ -95,7 +95,7 @@ static int reg_string(const char* param_name,
|
||||
/*
|
||||
* utility routine for integer parameter registration
|
||||
*/
|
||||
static int reg_int(const char* param_name,
|
||||
static int reg_int(const char* param_name,
|
||||
const char* deprecated_param_name,
|
||||
const char* param_desc,
|
||||
int default_value, int *out_value, int flags)
|
||||
@ -105,12 +105,12 @@ static int reg_int(const char* param_name,
|
||||
param_name, param_desc, false, false,
|
||||
default_value, NULL);
|
||||
if (NULL != deprecated_param_name) {
|
||||
mca_base_param_reg_syn(index,
|
||||
&mca_btl_openib_component.super.btl_version,
|
||||
mca_base_param_reg_syn(index,
|
||||
&mca_btl_openib_component.super.btl_version,
|
||||
deprecated_param_name, true);
|
||||
}
|
||||
mca_base_param_lookup_int(index, &value);
|
||||
|
||||
|
||||
if (0 != (flags & REGINT_NEG_ONE_OK) && -1 == value) {
|
||||
*out_value = value;
|
||||
return OMPI_SUCCESS;
|
||||
@ -193,7 +193,7 @@ int btl_openib_register_mca_params(void)
|
||||
}
|
||||
CHECK(reg_string("device_param_files", "hca_param_files",
|
||||
"Colon-delimited list of INI-style files that contain device vendor/part-specific parameters (use semicolon for Windows)",
|
||||
str, &mca_btl_openib_component.device_params_file_names,
|
||||
str, &mca_btl_openib_component.device_params_file_names,
|
||||
0));
|
||||
free(str);
|
||||
|
||||
@ -264,11 +264,11 @@ int btl_openib_register_mca_params(void)
|
||||
-1, &ival, REGINT_NEG_ONE_OK | REGINT_GE_ZERO));
|
||||
mca_btl_openib_component.ib_max_inline_data = (int32_t) ival;
|
||||
|
||||
CHECK(reg_string("pkey", "ib_pkey_val",
|
||||
CHECK(reg_string("pkey", "ib_pkey_val",
|
||||
"OpenFabrics partition key (pkey) value. "
|
||||
"Unsigned integer decimal or hex values are allowed (e.g., \"3\" or \"0x3f\") and will be masked against the maximum allowable IB partition key value (0x7fff)",
|
||||
"0", &pkey, 0));
|
||||
mca_btl_openib_component.ib_pkey_val =
|
||||
mca_btl_openib_component.ib_pkey_val =
|
||||
ompi_btl_openib_ini_intify(pkey) & MCA_BTL_IB_PKEY_MASK;
|
||||
free(pkey);
|
||||
|
||||
@ -278,7 +278,7 @@ int btl_openib_register_mca_params(void)
|
||||
0, &ival, REGINT_GE_ZERO));
|
||||
mca_btl_openib_component.ib_psn = (uint32_t) ival;
|
||||
|
||||
CHECK(reg_int("ib_qp_ous_rd_atom", NULL,
|
||||
CHECK(reg_int("ib_qp_ous_rd_atom", NULL,
|
||||
"InfiniBand outstanding atomic reads "
|
||||
"(must be >= 0)",
|
||||
4, &ival, REGINT_GE_ZERO));
|
||||
@ -402,7 +402,7 @@ int btl_openib_register_mca_params(void)
|
||||
CHECK(reg_int("ib_path_record_service_level", NULL,
|
||||
"Enable getting InfiniBand service level from PathRecord "
|
||||
"(must be >= 0, 0 = disabled, positive = try to get the "
|
||||
"service level from PathRecord)",
|
||||
"service level from PathRecord)",
|
||||
0, &ival, REGINT_GE_ZERO));
|
||||
mca_btl_openib_component.ib_path_record_service_level = (uint32_t) ival;
|
||||
#endif
|
||||
@ -582,11 +582,11 @@ int btl_openib_register_mca_params(void)
|
||||
|
||||
CHECK(reg_string("receive_queues", NULL,
|
||||
"Colon-delimited, comma-delimited list of receive queues: P,4096,8,6,4:P,32768,8,6,4",
|
||||
default_qps, &mca_btl_openib_component.receive_queues,
|
||||
default_qps, &mca_btl_openib_component.receive_queues,
|
||||
0));
|
||||
mca_btl_openib_component.receive_queues_source =
|
||||
(0 == strcmp(default_qps,
|
||||
mca_btl_openib_component.receive_queues)) ?
|
||||
mca_btl_openib_component.receive_queues_source =
|
||||
(0 == strcmp(default_qps,
|
||||
mca_btl_openib_component.receive_queues)) ?
|
||||
BTL_OPENIB_RQ_SOURCE_DEFAULT : BTL_OPENIB_RQ_SOURCE_MCA;
|
||||
|
||||
CHECK(reg_string("if_include", NULL,
|
||||
|
@ -172,7 +172,7 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
BTL_VERBOSE(("unpack: %d btls", module_proc->proc_port_count));
|
||||
if (module_proc->proc_port_count > 0) {
|
||||
module_proc->proc_ports = (mca_btl_openib_proc_modex_t *)
|
||||
malloc(sizeof(mca_btl_openib_proc_modex_t) *
|
||||
malloc(sizeof(mca_btl_openib_proc_modex_t) *
|
||||
module_proc->proc_port_count);
|
||||
} else {
|
||||
module_proc->proc_ports = NULL;
|
||||
@ -194,7 +194,7 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
/* Unpack the number of CPCs that follow */
|
||||
unpack8(&offset, &(module_proc->proc_ports[i].pm_cpc_data_count));
|
||||
BTL_VERBOSE(("unpacked btl %d: number of cpcs to follow %d (offset now %d)",
|
||||
i, module_proc->proc_ports[i].pm_cpc_data_count,
|
||||
i, module_proc->proc_ports[i].pm_cpc_data_count,
|
||||
(int)(offset-((char*)message))));
|
||||
module_proc->proc_ports[i].pm_cpc_data = (ompi_btl_openib_connect_base_module_data_t *)
|
||||
calloc(module_proc->proc_ports[i].pm_cpc_data_count,
|
||||
@ -211,15 +211,15 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
unpack8(&offset, &u8);
|
||||
BTL_VERBOSE(("unpacked btl %d: cpc %d: index %d (offset now %d)",
|
||||
i, j, u8, (int)(offset-(char*)message)));
|
||||
cpcd->cbm_component =
|
||||
cpcd->cbm_component =
|
||||
ompi_btl_openib_connect_base_get_cpc_byindex(u8);
|
||||
BTL_VERBOSE(("unpacked btl %d: cpc %d: component %s",
|
||||
i, j, cpcd->cbm_component->cbc_name));
|
||||
|
||||
|
||||
unpack8(&offset, &cpcd->cbm_priority);
|
||||
unpack8(&offset, &cpcd->cbm_modex_message_len);
|
||||
BTL_VERBOSE(("unpacked btl %d: cpc %d: priority %d, msg len %d (offset now %d)",
|
||||
i, j, cpcd->cbm_priority,
|
||||
i, j, cpcd->cbm_priority,
|
||||
cpcd->cbm_modex_message_len,
|
||||
(int)(offset-(char*)message)));
|
||||
if (cpcd->cbm_modex_message_len > 0) {
|
||||
@ -228,7 +228,7 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
BTL_ERROR(("Failed to malloc"));
|
||||
return NULL;
|
||||
}
|
||||
memcpy(cpcd->cbm_modex_message, offset,
|
||||
memcpy(cpcd->cbm_modex_message, offset,
|
||||
cpcd->cbm_modex_message_len);
|
||||
offset += cpcd->cbm_modex_message_len;
|
||||
BTL_VERBOSE(("unpacked btl %d: cpc %d: blob unpacked %d %x (offset now %d)",
|
||||
@ -244,7 +244,7 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
|
||||
module_proc->proc_endpoints = NULL;
|
||||
} else {
|
||||
module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
|
||||
malloc(module_proc->proc_port_count *
|
||||
malloc(module_proc->proc_port_count *
|
||||
sizeof(mca_btl_base_endpoint_t*));
|
||||
}
|
||||
if (NULL == module_proc->proc_endpoints) {
|
||||
|
@ -47,7 +47,7 @@ typedef struct mca_btl_openib_proc_modex_t {
|
||||
|
||||
/** Array of the peer's CPCs available on this port */
|
||||
ompi_btl_openib_connect_base_module_data_t *pm_cpc_data;
|
||||
|
||||
|
||||
/** Length of the pm_cpc_data array */
|
||||
uint8_t pm_cpc_data_count;
|
||||
} mca_btl_openib_proc_modex_t;
|
||||
|
@ -6,7 +6,7 @@
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
@ -14,9 +14,9 @@
|
||||
# Copyright (c) 2008 Mellanox Technologies. All rights reserved.
|
||||
# Copyright (c) 2011 Oracle and/or its affiliates. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
@ -29,7 +29,7 @@ AC_DEFUN([MCA_ompi_btl_openib_POST_CONFIG], [
|
||||
])
|
||||
|
||||
|
||||
# MCA_btl_openib_CONFIG([action-if-can-compile],
|
||||
# MCA_btl_openib_CONFIG([action-if-can-compile],
|
||||
# [action-if-cant-compile])
|
||||
# ------------------------------------------------
|
||||
AC_DEFUN([MCA_ompi_btl_openib_CONFIG],[
|
||||
|
@ -3,9 +3,9 @@
|
||||
* Copyright (c) 2009 Mellanox Technologies. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -74,13 +74,13 @@ int ompi_btl_openib_connect_base_get_cpc_index
|
||||
ompi_btl_openib_connect_base_component_t *
|
||||
ompi_btl_openib_connect_base_get_cpc_byindex(uint8_t index);
|
||||
|
||||
/*
|
||||
/*
|
||||
* Allocate a CTS frag
|
||||
*/
|
||||
int ompi_btl_openib_connect_base_alloc_cts(
|
||||
struct mca_btl_base_endpoint_t *endpoint);
|
||||
|
||||
/*
|
||||
/*
|
||||
* Free a CTS frag
|
||||
*/
|
||||
int ompi_btl_openib_connect_base_free_cts(
|
||||
|
@ -3,9 +3,9 @@
|
||||
* Copyright (c) 2007 Mellanox Technologies, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
#include "ompi_config.h"
|
||||
@ -88,7 +88,7 @@ int ompi_btl_openib_connect_base_register(void)
|
||||
all_cpc_names);
|
||||
|
||||
mca_base_param_reg_string(&mca_btl_openib_component.super.btl_version,
|
||||
"cpc_include", string, false, false,
|
||||
"cpc_include", string, false, false,
|
||||
NULL, &cpc_include);
|
||||
free(string);
|
||||
|
||||
@ -97,7 +97,7 @@ int ompi_btl_openib_connect_base_register(void)
|
||||
all_cpc_names);
|
||||
|
||||
mca_base_param_reg_string(&mca_btl_openib_component.super.btl_version,
|
||||
"cpc_exclude", string, false, false,
|
||||
"cpc_exclude", string, false, false,
|
||||
NULL, &cpc_exclude);
|
||||
free(string);
|
||||
|
||||
@ -112,7 +112,7 @@ int ompi_btl_openib_connect_base_register(void)
|
||||
temp = opal_argv_split(cpc_include, ',');
|
||||
for (save = j = 0; NULL != temp[j]; ++j) {
|
||||
for (i = 0; NULL != all[i]; ++i) {
|
||||
if (0 == strcmp(temp[j], all[i]->cbc_name)) {
|
||||
if (0 == strcmp(temp[j], all[i]->cbc_name)) {
|
||||
opal_output(-1, "include: saving %s", all[i]->cbc_name);
|
||||
available[save++] = all[i];
|
||||
++num_available;
|
||||
@ -123,7 +123,7 @@ int ompi_btl_openib_connect_base_register(void)
|
||||
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
|
||||
"cpc name not found", true,
|
||||
"include", orte_process_info.nodename,
|
||||
"include", cpc_include, temp[j],
|
||||
"include", cpc_include, temp[j],
|
||||
all_cpc_names);
|
||||
opal_argv_free(temp);
|
||||
free(all_cpc_names);
|
||||
@ -141,7 +141,7 @@ int ompi_btl_openib_connect_base_register(void)
|
||||
/* First: error check -- ensure that all the names are valid */
|
||||
for (j = 0; NULL != temp[j]; ++j) {
|
||||
for (i = 0; NULL != all[i]; ++i) {
|
||||
if (0 == strcmp(temp[j], all[i]->cbc_name)) {
|
||||
if (0 == strcmp(temp[j], all[i]->cbc_name)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -149,7 +149,7 @@ int ompi_btl_openib_connect_base_register(void)
|
||||
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
|
||||
"cpc name not found", true,
|
||||
"exclude", orte_process_info.nodename,
|
||||
"exclude", cpc_exclude, temp[j],
|
||||
"exclude", cpc_exclude, temp[j],
|
||||
all_cpc_names);
|
||||
opal_argv_free(temp);
|
||||
free(all_cpc_names);
|
||||
@ -171,13 +171,13 @@ int ompi_btl_openib_connect_base_register(void)
|
||||
}
|
||||
}
|
||||
opal_argv_free(temp);
|
||||
}
|
||||
}
|
||||
|
||||
/* If there's no include/exclude list, copy all[] into available[] */
|
||||
else {
|
||||
opal_output(-1, "no include or exclude: saving all");
|
||||
memcpy(available, all, sizeof(all));
|
||||
num_available = (sizeof(all) /
|
||||
num_available = (sizeof(all) /
|
||||
sizeof(ompi_btl_openib_connect_base_module_t *)) - 1;
|
||||
}
|
||||
|
||||
@ -241,7 +241,7 @@ int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t *
|
||||
int i, rc, cpc_index, len;
|
||||
ompi_btl_openib_connect_base_module_t **cpcs;
|
||||
|
||||
cpcs = (ompi_btl_openib_connect_base_module_t **) calloc(num_available,
|
||||
cpcs = (ompi_btl_openib_connect_base_module_t **) calloc(num_available,
|
||||
sizeof(ompi_btl_openib_connect_base_module_t *));
|
||||
if (NULL == cpcs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -311,7 +311,7 @@ int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t *
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* This function is invoked when determining whether we have a CPC in
|
||||
* common with a specific remote port. We already know that the
|
||||
* subnet ID is the same between a specific local port and the target
|
||||
@ -398,7 +398,7 @@ int ompi_btl_openib_connect_base_get_cpc_index(ompi_btl_openib_connect_base_comp
|
||||
ompi_btl_openib_connect_base_component_t *
|
||||
ompi_btl_openib_connect_base_get_cpc_byindex(uint8_t index)
|
||||
{
|
||||
return (index >= (sizeof(all) /
|
||||
return (index >= (sizeof(all) /
|
||||
sizeof(ompi_btl_openib_connect_base_module_t *))) ?
|
||||
NULL : all[index];
|
||||
}
|
||||
@ -421,8 +421,8 @@ int ompi_btl_openib_connect_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
endpoint->endpoint_cts_mr =
|
||||
ibv_reg_mr(endpoint->endpoint_btl->device->ib_pd,
|
||||
endpoint->endpoint_cts_mr =
|
||||
ibv_reg_mr(endpoint->endpoint_btl->device->ib_pd,
|
||||
fli->ptr, length,
|
||||
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
|
||||
IBV_ACCESS_REMOTE_READ);
|
||||
@ -438,14 +438,14 @@ int ompi_btl_openib_connect_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
|
||||
from underneath us. */
|
||||
|
||||
/* Copy the lkey where it needs to go */
|
||||
endpoint->endpoint_cts_frag.super.sg_entry.lkey =
|
||||
endpoint->endpoint_cts_frag.super.super.segment.seg_key.key32[0] =
|
||||
endpoint->endpoint_cts_frag.super.sg_entry.lkey =
|
||||
endpoint->endpoint_cts_frag.super.super.segment.seg_key.key32[0] =
|
||||
endpoint->endpoint_cts_mr->lkey;
|
||||
endpoint->endpoint_cts_frag.super.sg_entry.length = length;
|
||||
|
||||
/* Construct the rest of the recv_frag_t */
|
||||
OBJ_CONSTRUCT(&(endpoint->endpoint_cts_frag), mca_btl_openib_recv_frag_t);
|
||||
endpoint->endpoint_cts_frag.super.super.base.order =
|
||||
endpoint->endpoint_cts_frag.super.super.base.order =
|
||||
mca_btl_openib_component.credits_qp;
|
||||
endpoint->endpoint_cts_frag.super.endpoint = endpoint;
|
||||
OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
|
||||
|
@ -2,9 +2,9 @@
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
|
||||
static void empty_component_register(void);
|
||||
static int empty_component_init(void);
|
||||
static int empty_component_query(mca_btl_openib_module_t *btl,
|
||||
static int empty_component_query(mca_btl_openib_module_t *btl,
|
||||
ompi_btl_openib_connect_base_module_t **cpc);
|
||||
|
||||
ompi_btl_openib_connect_base_component_t ompi_btl_openib_connect_empty = {
|
||||
@ -38,7 +38,7 @@ static int empty_component_init(void)
|
||||
return OMPI_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
static int empty_component_query(mca_btl_openib_module_t *btl,
|
||||
static int empty_component_query(mca_btl_openib_module_t *btl,
|
||||
ompi_btl_openib_connect_base_module_t **cpc)
|
||||
{
|
||||
/* Never let this CPC run */
|
||||
|
@ -2,9 +2,9 @@
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
@ -4,9 +4,9 @@
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
* TO-DO:
|
||||
*
|
||||
* - audit control values passed to req_send()
|
||||
* - More show_help() throughout
|
||||
* - More show_help() throughout
|
||||
* - error handling in case of broken connection is not good; need to
|
||||
* notify btl module safely
|
||||
*/
|
||||
@ -91,7 +91,7 @@
|
||||
* tell IBCM (among other things) when the first message arrives on
|
||||
* a QP when the RTU has not yet been received. This can happen, of
|
||||
* course, since IBCM traffic is UD.
|
||||
* - Also, note that IBCM "listener" IDs are per DEVICE, not per port.
|
||||
* - Also, note that IBCM "listener" IDs are per DEVICE, not per port.
|
||||
* - CM ID's are persistent throughout the life of a QP. If you
|
||||
* destroy a CM ID (ib_cm_destroy_id), the IBCM system will tear
|
||||
* down the connection. So the CM ID you get when receiving a
|
||||
@ -179,7 +179,7 @@
|
||||
*
|
||||
* 1. The "wrong" process will send a single IBCM connection request
|
||||
* to its peer on a bogus QP that was created just for this
|
||||
* request.
|
||||
* request.
|
||||
* 2. The receiver will get the request, detect that it came
|
||||
* in from the "wrong" direction, and reject it (IBCM has an
|
||||
* explicit provision for rejecting incoming connections).
|
||||
@ -366,7 +366,7 @@ typedef struct {
|
||||
|
||||
static void ibcm_listen_cm_id_constructor(ibcm_listen_cm_id_t *h);
|
||||
static void ibcm_listen_cm_id_destructor(ibcm_listen_cm_id_t *h);
|
||||
static OBJ_CLASS_INSTANCE(ibcm_listen_cm_id_t, opal_list_item_t,
|
||||
static OBJ_CLASS_INSTANCE(ibcm_listen_cm_id_t, opal_list_item_t,
|
||||
ibcm_listen_cm_id_constructor,
|
||||
ibcm_listen_cm_id_destructor);
|
||||
|
||||
@ -452,7 +452,7 @@ typedef struct {
|
||||
ibcm_module_t *ibcm_module;
|
||||
} ibcm_module_list_item_t;
|
||||
|
||||
static OBJ_CLASS_INSTANCE(ibcm_module_list_item_t, opal_list_item_t,
|
||||
static OBJ_CLASS_INSTANCE(ibcm_module_list_item_t, opal_list_item_t,
|
||||
NULL, NULL);
|
||||
|
||||
/*
|
||||
@ -506,7 +506,7 @@ typedef struct {
|
||||
/*--------------------------------------------------------------------*/
|
||||
|
||||
static void ibcm_component_register(void);
|
||||
static int ibcm_component_query(mca_btl_openib_module_t *btl,
|
||||
static int ibcm_component_query(mca_btl_openib_module_t *btl,
|
||||
ompi_btl_openib_connect_base_module_t **cpc);
|
||||
static int ibcm_component_finalize(void);
|
||||
|
||||
@ -544,13 +544,13 @@ ompi_btl_openib_connect_base_component_t ompi_btl_openib_connect_ibcm = {
|
||||
|
||||
#define ENABLE_TIMERS (OPAL_ENABLE_DEBUG && 0)
|
||||
|
||||
#if ENABLE_TIMERS
|
||||
#if ENABLE_TIMERS
|
||||
#include MCA_timer_IMPLEMENTATION_HEADER
|
||||
|
||||
enum {
|
||||
QUERY,
|
||||
START_CONNECT,
|
||||
QP_TO_RTR,
|
||||
QP_TO_RTR,
|
||||
QP_TO_RTS,
|
||||
REQUEST_RECEIVED,
|
||||
REPLY_RECEIVED,
|
||||
@ -616,7 +616,7 @@ static void ibcm_component_register(void)
|
||||
mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version,
|
||||
"connect_ibcm_gid_index",
|
||||
"GID table index to use to obtain each port's GUID",
|
||||
false, false, ibcm_gid_table_index,
|
||||
false, false, ibcm_gid_table_index,
|
||||
&ibcm_gid_table_index);
|
||||
if (ibcm_gid_table_index < 0) {
|
||||
ibcm_gid_table_index = 0;
|
||||
@ -625,15 +625,15 @@ static void ibcm_component_register(void)
|
||||
|
||||
/*--------------------------------------------------------------------*/
|
||||
|
||||
/* The IB_CM_ASSIGN_SERVICE_ID value passed to ib_cm_listen function asks,
|
||||
/* The IB_CM_ASSIGN_SERVICE_ID value passed to ib_cm_listen function asks,
|
||||
* from IBCM , to assign service_id.
|
||||
* The value was taken from IBCM kernel level
|
||||
* The value was taken from IBCM kernel level
|
||||
*/
|
||||
#ifndef IB_CM_ASSIGN_SERVICE_ID
|
||||
#define IB_CM_ASSIGN_SERVICE_ID hton64(0x0200000000000000ULL)
|
||||
#endif
|
||||
|
||||
static int ibcm_component_query(mca_btl_openib_module_t *btl,
|
||||
static int ibcm_component_query(mca_btl_openib_module_t *btl,
|
||||
ompi_btl_openib_connect_base_module_t **cpc)
|
||||
{
|
||||
int rc;
|
||||
@ -760,11 +760,11 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
|
||||
goto error;
|
||||
}
|
||||
OPAL_OUTPUT((-1, "opened ibcm device 0x%" PRIx64 " (%s:%d)",
|
||||
(uint64_t) cmh->cm_device,
|
||||
(uint64_t) cmh->cm_device,
|
||||
ibv_get_device_name(cmh->ib_context->device),
|
||||
openib_btl->port_num));
|
||||
|
||||
if (0 != (rc = ib_cm_create_id(cmh->cm_device,
|
||||
if (0 != (rc = ib_cm_create_id(cmh->cm_device,
|
||||
&cmh->listen_cm_id, NULL))) {
|
||||
/* Same rationale as above */
|
||||
OBJ_RELEASE(cmh);
|
||||
@ -774,12 +774,12 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
|
||||
}
|
||||
|
||||
if (0 != (rc = ib_cm_listen(cmh->listen_cm_id, IB_CM_ASSIGN_SERVICE_ID, 0))) {
|
||||
/* Same rationale as above */
|
||||
OBJ_RELEASE(cmh);
|
||||
BTL_ERROR(("failed to ib_cm_listen : rc=%d, errno=%d", rc, errno));
|
||||
rc = OMPI_ERR_NOT_SUPPORTED;
|
||||
goto error;
|
||||
}
|
||||
/* Same rationale as above */
|
||||
OBJ_RELEASE(cmh);
|
||||
BTL_ERROR(("failed to ib_cm_listen : rc=%d, errno=%d", rc, errno));
|
||||
rc = OMPI_ERR_NOT_SUPPORTED;
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (0 != (rc = ib_cm_attr_id(cmh->listen_cm_id, &(cmh->param)))) {
|
||||
OBJ_RELEASE(cmh);
|
||||
@ -822,7 +822,7 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
|
||||
rc = OMPI_ERR_UNREACH;
|
||||
goto error;
|
||||
}
|
||||
rc = ibv_query_gid(btl->device->ib_dev_context, btl->port_num, ibcm_gid_table_index,
|
||||
rc = ibv_query_gid(btl->device->ib_dev_context, btl->port_num, ibcm_gid_table_index,
|
||||
&gid);
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("system error (ibv_query_gid failed)"));
|
||||
@ -863,8 +863,8 @@ static int ibcm_component_query(mca_btl_openib_module_t *btl,
|
||||
btl->port_num));
|
||||
} else {
|
||||
BTL_VERBOSE(("unavailable for use on %s:%d; fatal error %d (%s)",
|
||||
ibv_get_device_name(btl->device->ib_dev),
|
||||
btl->port_num, rc,
|
||||
ibv_get_device_name(btl->device->ib_dev),
|
||||
btl->port_num, rc,
|
||||
opal_strerror(rc)));
|
||||
}
|
||||
return rc;
|
||||
@ -895,8 +895,8 @@ static uint32_t max_inline_size(int qp, mca_btl_openib_device_t *device)
|
||||
* Create the local side of one qp. The remote side will be connected
|
||||
* later.
|
||||
*/
|
||||
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
struct ibv_srq *srq, uint32_t max_recv_wr,
|
||||
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
struct ibv_srq *srq, uint32_t max_recv_wr,
|
||||
uint32_t max_send_wr)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
|
||||
@ -910,7 +910,7 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ];
|
||||
init_attr.recv_cq = openib_btl->device->ib_cq[qp_cq_prio(qp)];
|
||||
init_attr.srq = srq;
|
||||
init_attr.cap.max_inline_data = req_inline =
|
||||
init_attr.cap.max_inline_data = req_inline =
|
||||
max_inline_size(qp, openib_btl->device);
|
||||
init_attr.cap.max_send_sge = 1;
|
||||
init_attr.cap.max_recv_sge = 1; /* we do not use SG list */
|
||||
@ -922,10 +922,10 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
}
|
||||
init_attr.cap.max_send_wr = max_send_wr;
|
||||
|
||||
my_qp = ibv_create_qp(openib_btl->device->ib_pd, &init_attr);
|
||||
if (NULL == my_qp) {
|
||||
BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
my_qp = ibv_create_qp(openib_btl->device->ib_pd, &init_attr);
|
||||
if (NULL == my_qp) {
|
||||
BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
endpoint->qps[qp].qp->lcl_qp = my_qp;
|
||||
if (init_attr.cap.max_inline_data < req_inline) {
|
||||
@ -971,7 +971,7 @@ static int qp_create_all(mca_btl_base_endpoint_t* endpoint,
|
||||
pp_qp_num = 1;
|
||||
}
|
||||
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
|
||||
struct ibv_srq *srq = NULL;
|
||||
uint32_t max_recv_wr, max_send_wr;
|
||||
int32_t rd_rsv, rd_num_credits;
|
||||
@ -985,7 +985,7 @@ static int qp_create_all(mca_btl_base_endpoint_t* endpoint,
|
||||
}
|
||||
|
||||
if (BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
max_recv_wr = mca_btl_openib_component.qp_infos[qp].rd_num +
|
||||
max_recv_wr = mca_btl_openib_component.qp_infos[qp].rd_num +
|
||||
rd_rsv;
|
||||
max_send_wr = mca_btl_openib_component.qp_infos[qp].rd_num +
|
||||
rd_num_credits;
|
||||
@ -1019,14 +1019,14 @@ static int fill_path_record(ibcm_module_t *m,
|
||||
mca_btl_base_endpoint_t *endpoint,
|
||||
struct ibv_sa_path_rec *path_rec)
|
||||
{
|
||||
modex_msg_t *remote_msg =
|
||||
modex_msg_t *remote_msg =
|
||||
(modex_msg_t*) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
|
||||
modex_msg_t *local_msg =
|
||||
modex_msg_t *local_msg =
|
||||
(modex_msg_t*) m->cpc.data.cbm_modex_message;
|
||||
|
||||
/* Global attributes */
|
||||
path_rec->dgid.global.subnet_prefix =
|
||||
path_rec->sgid.global.subnet_prefix =
|
||||
path_rec->dgid.global.subnet_prefix =
|
||||
path_rec->sgid.global.subnet_prefix =
|
||||
hton64(m->btl->port_info.subnet_id);
|
||||
path_rec->dgid.global.interface_id = hton64(remote_msg->mm_port_guid);
|
||||
path_rec->sgid.global.interface_id = hton64(local_msg->mm_port_guid);
|
||||
@ -1064,7 +1064,7 @@ static int fill_path_record(ibcm_module_t *m,
|
||||
path_rec->pkey = mca_btl_openib_component.ib_pkey_val;
|
||||
if (0 == path_rec->pkey) {
|
||||
uint16_t pkey;
|
||||
ibv_query_pkey(endpoint->endpoint_btl->device->ib_dev_context,
|
||||
ibv_query_pkey(endpoint->endpoint_btl->device->ib_dev_context,
|
||||
endpoint->endpoint_btl->port_num, 0, &pkey);
|
||||
path_rec->pkey = ntohs(pkey);
|
||||
}
|
||||
@ -1103,10 +1103,10 @@ static int fill_path_record(ibcm_module_t *m,
|
||||
BTL_VERBOSE(("Got src/dest subnet id: 0x%" PRIx64 " / 0x%" PRIx64,
|
||||
path_rec->sgid.global.subnet_prefix,
|
||||
path_rec->dgid.global.subnet_prefix));
|
||||
BTL_VERBOSE(("Got src/dest interface id: 0x%" PRIx64 " / 0x%" PRIx64,
|
||||
BTL_VERBOSE(("Got src/dest interface id: 0x%" PRIx64 " / 0x%" PRIx64,
|
||||
path_rec->sgid.global.interface_id,
|
||||
path_rec->dgid.global.interface_id));
|
||||
BTL_VERBOSE(("Got src/dest lid: 0x%x / 0x%x",
|
||||
BTL_VERBOSE(("Got src/dest lid: 0x%x / 0x%x",
|
||||
path_rec->slid, path_rec->dlid));
|
||||
BTL_VERBOSE(("Got raw_traffic: %d", path_rec->raw_traffic));
|
||||
|
||||
@ -1131,7 +1131,7 @@ static int fill_path_record(ibcm_module_t *m,
|
||||
|
||||
static int ibcm_endpoint_init(struct mca_btl_base_endpoint_t *endpoint)
|
||||
{
|
||||
ibcm_endpoint_t *ie = endpoint->endpoint_local_cpc_data =
|
||||
ibcm_endpoint_t *ie = endpoint->endpoint_local_cpc_data =
|
||||
calloc(1, sizeof(ibcm_endpoint_t));
|
||||
if (NULL == ie) {
|
||||
BTL_ERROR(("malloc failed!"));
|
||||
@ -1141,7 +1141,7 @@ static int ibcm_endpoint_init(struct mca_btl_base_endpoint_t *endpoint)
|
||||
BTL_VERBOSE(("endpoint %p / %p", (void*)endpoint, (void*)ie));
|
||||
ie->ie_cpc = endpoint->endpoint_local_cpc;
|
||||
ie->ie_endpoint = endpoint;
|
||||
ie->ie_qps_created =
|
||||
ie->ie_qps_created =
|
||||
ie->ie_recv_buffers_posted = false;
|
||||
ie->ie_qps_to_connect = mca_btl_openib_component.num_qps;
|
||||
|
||||
@ -1159,12 +1159,12 @@ static int ibcm_endpoint_init(struct mca_btl_base_endpoint_t *endpoint)
|
||||
static bool i_initiate(ibcm_module_t *m,
|
||||
mca_btl_openib_endpoint_t *endpoint)
|
||||
{
|
||||
modex_msg_t *msg =
|
||||
modex_msg_t *msg =
|
||||
(modex_msg_t*) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
|
||||
uint64_t my_port_guid = ntoh64(m->btl->device->ib_dev_attr.node_guid) +
|
||||
uint64_t my_port_guid = ntoh64(m->btl->device->ib_dev_attr.node_guid) +
|
||||
m->btl->port_num;
|
||||
uint64_t service_id = m->cmh->param.service_id;
|
||||
|
||||
|
||||
BTL_VERBOSE(("i_initiate: my guid (%0" PRIx64 "), msg guid (%0" PRIx64 ")",
|
||||
my_port_guid, msg->mm_port_guid));
|
||||
BTL_VERBOSE(("i_initiate: my service id (%d), msg service id (%d)",
|
||||
@ -1172,7 +1172,7 @@ static bool i_initiate(ibcm_module_t *m,
|
||||
|
||||
return
|
||||
(my_port_guid == msg->mm_port_guid &&
|
||||
service_id < msg->mm_service_id) ? true :
|
||||
service_id < msg->mm_service_id) ? true :
|
||||
(my_port_guid < msg->mm_port_guid) ? true : false;
|
||||
}
|
||||
|
||||
@ -1187,11 +1187,11 @@ static ibcm_request_t *alloc_request(ibcm_module_t *m, modex_msg_t *msg,
|
||||
struct ib_cm_req_param *cm_req;
|
||||
ibcm_request_t *req = OBJ_NEW(ibcm_request_t);
|
||||
BTL_VERBOSE(("allocated cached req id: 0x%" PRIx64, (void*)req));
|
||||
|
||||
|
||||
if (NULL == req) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Create this CM ID */
|
||||
if (0 != ib_cm_create_id(m->cmh->cm_device,
|
||||
&(req->super.cm_id),
|
||||
@ -1201,11 +1201,11 @@ static ibcm_request_t *alloc_request(ibcm_module_t *m, modex_msg_t *msg,
|
||||
return NULL;
|
||||
}
|
||||
BTL_VERBOSE(("created CM ID 0x%" PRIx64, &(req->super.cm_id)));
|
||||
|
||||
|
||||
/* This data is constant for all the QP's */
|
||||
req->path_rec = *path_rec;
|
||||
req->endpoint = endpoint;
|
||||
|
||||
|
||||
cm_req = &(req->cm_req);
|
||||
cm_req->qp_type = IBV_QPT_RC;
|
||||
cm_req->alternate_path = NULL;
|
||||
@ -1220,7 +1220,7 @@ static ibcm_request_t *alloc_request(ibcm_module_t *m, modex_msg_t *msg,
|
||||
cm_req->remote_cm_response_timeout = 20;
|
||||
cm_req->local_cm_response_timeout = 20;
|
||||
cm_req->max_cm_retries = 5;
|
||||
|
||||
|
||||
req->private_data.ireqd_pid = m->cmh->param.service_id;
|
||||
req->private_data.ireqd_ep_index = endpoint->index;
|
||||
|
||||
@ -1249,21 +1249,21 @@ static void print_req(struct ib_cm_req_param *cm_req)
|
||||
BTL_VERBOSE(("cm_req->max_cm_retries: %d", cm_req->max_cm_retries));
|
||||
BTL_VERBOSE(("cm_req->srq: %d", cm_req->srq));
|
||||
}
|
||||
|
||||
|
||||
static int ibcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
|
||||
mca_btl_base_endpoint_t *endpoint)
|
||||
{
|
||||
int i, rc;
|
||||
ibcm_module_t *m = (ibcm_module_t *) cpc;
|
||||
ibcm_endpoint_t *ie =
|
||||
ibcm_endpoint_t *ie =
|
||||
(ibcm_endpoint_t *) endpoint->endpoint_local_cpc_data;
|
||||
modex_msg_t *msg =
|
||||
modex_msg_t *msg =
|
||||
(modex_msg_t*) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
|
||||
struct ibv_sa_path_rec path_rec;
|
||||
bool do_initiate;
|
||||
|
||||
TIMER_START(START_CONNECT);
|
||||
BTL_VERBOSE(("endpoint %p (lid %d, ep index %d)",
|
||||
BTL_VERBOSE(("endpoint %p (lid %d, ep index %d)",
|
||||
(void*)endpoint, endpoint->endpoint_btl->port_info.lid,
|
||||
endpoint->index));
|
||||
|
||||
@ -1291,7 +1291,7 @@ static int ibcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
|
||||
rc = OMPI_ERR_NOT_FOUND;
|
||||
goto err;
|
||||
}
|
||||
|
||||
|
||||
/* If we're not the initiator, make a bogus QP (must be done
|
||||
before we make all the other QPs) */
|
||||
|
||||
@ -1303,14 +1303,14 @@ static int ibcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
|
||||
}
|
||||
ie->ie_bogus_qp = endpoint->qps[0].qp->lcl_qp;
|
||||
}
|
||||
|
||||
|
||||
/* Make the local side of all the QP's */
|
||||
if (OMPI_SUCCESS != (rc = qp_create_all(endpoint, m))) {
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* Check initiation direction (see comment above i_initiate()
|
||||
function):
|
||||
function):
|
||||
|
||||
- if this is the side that is not supposed to initiate, then
|
||||
send a single bogus request that we expect to be rejected.
|
||||
@ -1365,12 +1365,12 @@ static int ibcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
|
||||
cm_req->srq = BTL_OPENIB_QP_TYPE_SRQ(i);
|
||||
cm_req->qp_num = endpoint->qps[i].qp->lcl_qp->qp_num;
|
||||
cm_req->starting_psn = endpoint->qps[i].qp->lcl_psn;
|
||||
BTL_VERBOSE(("sending my qpn %d, psn %d",
|
||||
BTL_VERBOSE(("sending my qpn %d, psn %d",
|
||||
cm_req->qp_num, cm_req->starting_psn));
|
||||
|
||||
|
||||
req->private_data.ireqd_request = req;
|
||||
req->private_data.ireqd_qp_index = i;
|
||||
|
||||
|
||||
/* Send the request */
|
||||
BTL_VERBOSE(("sending connect request %d of %d (id %p)",
|
||||
i, mca_btl_openib_component.num_qps,
|
||||
@ -1409,7 +1409,7 @@ static int ibcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
|
||||
cm_req->srq = 0;
|
||||
cm_req->qp_num = ie->ie_bogus_qp->qp_num;
|
||||
cm_req->starting_psn = 0;
|
||||
BTL_VERBOSE(("sending BOGUS qpn %d, psn %d (id %p)",
|
||||
BTL_VERBOSE(("sending BOGUS qpn %d, psn %d (id %p)",
|
||||
cm_req->qp_num, cm_req->starting_psn,
|
||||
(void*)req->super.cm_id));
|
||||
|
||||
@ -1472,7 +1472,7 @@ static void ibcm_listen_cm_id_destructor(ibcm_listen_cm_id_t *cmh)
|
||||
|
||||
/* Remove all the ibcm module items */
|
||||
for (item = opal_list_remove_first(&(cmh->ibcm_modules));
|
||||
NULL != item;
|
||||
NULL != item;
|
||||
item = opal_list_remove_first(&(cmh->ibcm_modules))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
@ -1494,7 +1494,7 @@ static void ibcm_listen_cm_id_destructor(ibcm_listen_cm_id_t *cmh)
|
||||
|
||||
/* Stop monitoring the cm_device's fd (wait for it to be
|
||||
released from the monitoring entity) */
|
||||
ompi_btl_openib_fd_unmonitor(cmh->cm_device->fd,
|
||||
ompi_btl_openib_fd_unmonitor(cmh->cm_device->fd,
|
||||
callback_unlock,
|
||||
|
||||
(void*) &barrier);
|
||||
@ -1527,7 +1527,7 @@ static void ibcm_listen_cm_id_destructor(ibcm_listen_cm_id_t *cmh)
|
||||
/* Close the CM device */
|
||||
if (NULL != cmh->cm_device) {
|
||||
OPAL_OUTPUT((-1, "closing ibcm device 0x%" PRIx64 " (%s)",
|
||||
(uint64_t) cmh->cm_device,
|
||||
(uint64_t) cmh->cm_device,
|
||||
ibv_get_device_name(cmh->ib_context->device)));
|
||||
ib_cm_close_device(cmh->cm_device);
|
||||
}
|
||||
@ -1565,7 +1565,7 @@ static int ibcm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
|
||||
ibcm_endpoint_t *ie =
|
||||
(ibcm_endpoint_t *) endpoint->endpoint_local_cpc_data;
|
||||
BTL_VERBOSE(("endpoint %p", (void*)endpoint));
|
||||
|
||||
|
||||
/* Free the stuff we allocated in ibcm_module_init */
|
||||
if (NULL != ie) {
|
||||
int i;
|
||||
@ -1603,7 +1603,7 @@ static int ibcm_module_finalize(mca_btl_openib_module_t *btl,
|
||||
if (NULL != m && NULL != m->cmh) {
|
||||
OBJ_RELEASE(m->cmh);
|
||||
}
|
||||
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
@ -1655,7 +1655,7 @@ static int qp_to_rtr(int qp_index, struct ib_cm_id *cm_id,
|
||||
(mtu == IBV_MTU_2048) ? "2048" :
|
||||
(mtu == IBV_MTU_4096) ? "4096" :
|
||||
"unknown (!)"));
|
||||
|
||||
|
||||
/* Move the QP into the INIT state */
|
||||
memset(&attr, 0, sizeof(attr));
|
||||
attr.qp_state = IBV_QPS_INIT;
|
||||
@ -1665,9 +1665,9 @@ static int qp_to_rtr(int qp_index, struct ib_cm_id *cm_id,
|
||||
}
|
||||
|
||||
if (0 != ibv_modify_qp(qp, &attr, attr_mask)) {
|
||||
BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno)));
|
||||
BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* Move the QP into the RTR state */
|
||||
attr.qp_state = IBV_QPS_RTR;
|
||||
@ -1687,7 +1687,7 @@ static int qp_to_rtr(int qp_index, struct ib_cm_id *cm_id,
|
||||
/* IBM CM does not set these values for us */
|
||||
attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
|
||||
attr.min_rnr_timer = mca_btl_openib_component.ib_min_rnr_timer;
|
||||
|
||||
|
||||
if (0 != ibv_modify_qp(qp, &attr,
|
||||
attr_mask |
|
||||
IBV_QP_PATH_MTU |
|
||||
@ -1696,9 +1696,9 @@ static int qp_to_rtr(int qp_index, struct ib_cm_id *cm_id,
|
||||
)) {
|
||||
BTL_ERROR(("error modifing QP to RTR errno says %s",
|
||||
strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
||||
/* All done */
|
||||
TIMER_STOP(QP_TO_RTR);
|
||||
return OMPI_SUCCESS;
|
||||
@ -1727,9 +1727,9 @@ static int qp_to_rts(int qp_index, struct ib_cm_id *cm_id,
|
||||
if (0 != (rc = ibv_modify_qp(qp, &attr, attr_mask))) {
|
||||
BTL_ERROR(("error modifing QP (index %d) to RTS errno says %s; rc=%d, errno=%d",
|
||||
qp_index, strerror(errno), rc, errno));
|
||||
return OMPI_ERROR;
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
||||
/* All done */
|
||||
BTL_VERBOSE(("successfully set RTS"));
|
||||
TIMER_STOP(QP_TO_RTS);
|
||||
@ -1742,7 +1742,7 @@ static int qp_to_rts(int qp_index, struct ib_cm_id *cm_id,
|
||||
*/
|
||||
static void *callback_start_connect(void *context)
|
||||
{
|
||||
callback_start_connect_data_t *cbdata =
|
||||
callback_start_connect_data_t *cbdata =
|
||||
(callback_start_connect_data_t *) context;
|
||||
|
||||
BTL_VERBOSE(("ibcm scheduled callback: calling start_connect()"));
|
||||
@ -1763,7 +1763,7 @@ static void *callback_start_connect(void *context)
|
||||
/*
|
||||
* Passive has received a connection request from a active
|
||||
*/
|
||||
static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
struct ib_cm_event *event)
|
||||
{
|
||||
int i, rc = OMPI_ERROR;
|
||||
@ -1794,7 +1794,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
device event, and have to find the ibcm_module_t (i.e., local
|
||||
port/openib BTL module ) that corresponds to it. */
|
||||
BTL_VERBOSE(("looking for ibcm module -- source port guid: 0x%" PRIx64 " (%p)",
|
||||
ntoh64(req->primary_path->sgid.global.interface_id),
|
||||
ntoh64(req->primary_path->sgid.global.interface_id),
|
||||
(void*)cmh));
|
||||
for (item = opal_list_get_first(&(cmh->ibcm_modules));
|
||||
item != opal_list_get_end(&(cmh->ibcm_modules));
|
||||
@ -1820,7 +1820,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
|
||||
for (found = false, ib_proc = (mca_btl_openib_proc_t*)
|
||||
opal_list_get_first(&mca_btl_openib_component.ib_procs);
|
||||
!found &&
|
||||
!found &&
|
||||
ib_proc != (mca_btl_openib_proc_t*)
|
||||
opal_list_get_end(&mca_btl_openib_component.ib_procs);
|
||||
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
|
||||
@ -1828,7 +1828,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
/* Now cycle through all the endpoints on that proc */
|
||||
for (i = 0; !found && i < (int) ib_proc->proc_endpoint_count; ++i) {
|
||||
BTL_VERBOSE(("checking endpoint %d of %d (ep %p, cpc data %p)",
|
||||
i, (int) ib_proc->proc_endpoint_count,
|
||||
i, (int) ib_proc->proc_endpoint_count,
|
||||
(void*)ib_proc->proc_endpoints[i],
|
||||
(void*)ib_proc->proc_endpoints[i]->endpoint_remote_cpc_data));
|
||||
if (NULL == ib_proc->proc_endpoints[i]->endpoint_remote_cpc_data) {
|
||||
@ -1841,7 +1841,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
BTL_VERBOSE(("my LID %d, remote LID %d",
|
||||
msg->mm_lid,
|
||||
ntohs(req->primary_path->dlid)));
|
||||
if (msg->mm_port_guid ==
|
||||
if (msg->mm_port_guid ==
|
||||
ntoh64(req->primary_path->dgid.global.interface_id) &&
|
||||
msg->mm_service_id == active_private_data->ireqd_pid &&
|
||||
msg->mm_port_num == req->port &&
|
||||
@ -1885,7 +1885,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
rc = OMPI_SUCCESS;
|
||||
} else if (ie->ie_connection_flags & CFLAGS_ONGOING) {
|
||||
/* See if the request for this QP already arrived */
|
||||
if (ie->ie_qps_created &&
|
||||
if (ie->ie_qps_created &&
|
||||
IBV_QPS_RESET != endpoint->qps[qp_index].qp->lcl_qp->state) {
|
||||
BTL_VERBOSE(("this QP (%d) already connected",
|
||||
qp_index));
|
||||
@ -1934,7 +1934,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
are filled in during qp_to_rtr (because we don't get them until
|
||||
we call ib_cm_attr_init()). We already have the remote LID,
|
||||
subnet ID, and MTU from the port's modex message. */
|
||||
endpoint->rem_info.rem_qps[qp_index].rem_psn =
|
||||
endpoint->rem_info.rem_qps[qp_index].rem_psn =
|
||||
event->param.req_rcvd.starting_psn;
|
||||
endpoint->rem_info.rem_index = active_private_data->ireqd_ep_index;
|
||||
|
||||
@ -2030,14 +2030,14 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
rep->cm_rep.qp_num = endpoint->qps[qp_index].qp->lcl_qp->qp_num;
|
||||
rep->cm_rep.srq = BTL_OPENIB_QP_TYPE_SRQ(qp_index);
|
||||
rep->cm_rep.starting_psn = endpoint->qps[qp_index].qp->lcl_psn;
|
||||
BTL_VERBOSE(("setting reply psn %d",
|
||||
BTL_VERBOSE(("setting reply psn %d",
|
||||
rep->cm_rep.starting_psn));
|
||||
rep->cm_rep.responder_resources = req->responder_resources;
|
||||
rep->cm_rep.initiator_depth = req->initiator_depth;
|
||||
rep->cm_rep.target_ack_delay = 20;
|
||||
rep->cm_rep.flow_control = req->flow_control;
|
||||
rep->cm_rep.rnr_retry_count = req->rnr_retry_count;
|
||||
|
||||
|
||||
rep->private_data.irepd_request = active_private_data->ireqd_request;
|
||||
rep->private_data.irepd_reply = rep;
|
||||
rep->private_data.irepd_qp_index = qp_index;
|
||||
@ -2050,7 +2050,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
goto reject;
|
||||
}
|
||||
opal_list_append(&ibcm_pending_replies, &(rep->super.super));
|
||||
|
||||
|
||||
TIMER_STOP(REQUEST_RECEIVED);
|
||||
BTL_VERBOSE(("sent reply for qp index %d", qp_index));
|
||||
return OMPI_SUCCESS;
|
||||
@ -2058,21 +2058,21 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
reject:
|
||||
/* Reject the request */
|
||||
BTL_VERBOSE(("rejecting request"));
|
||||
ib_cm_send_rej(event->cm_id, IB_CM_REJ_CONSUMER_DEFINED,
|
||||
ib_cm_send_rej(event->cm_id, IB_CM_REJ_CONSUMER_DEFINED,
|
||||
&rej_reason, sizeof(rej_reason),
|
||||
event->private_data, sizeof(ibcm_req_data_t));
|
||||
|
||||
|
||||
/* If we rejected because of the wrong direction, then initiate a
|
||||
connection going the other direction. */
|
||||
if (REJ_WRONG_DIRECTION == rej_reason) {
|
||||
callback_start_connect_data_t *cbdata = malloc(sizeof(*cbdata));
|
||||
if (NULL != cbdata) {
|
||||
cbdata->cscd_cpc =
|
||||
cbdata->cscd_cpc =
|
||||
(ompi_btl_openib_connect_base_module_t *) imodule;
|
||||
cbdata->cscd_endpoint = endpoint;
|
||||
BTL_VERBOSE(("starting connect in other direction"));
|
||||
ompi_btl_openib_fd_run_in_main(callback_start_connect, cbdata);
|
||||
|
||||
|
||||
TIMER_STOP(REQUEST_RECEIVED);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -2085,7 +2085,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
||||
endpoint);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Callback (from main thread) when the endpoint has been connected
|
||||
*/
|
||||
@ -2102,8 +2102,8 @@ static void *callback_set_endpoint_cpc_complete(void *context)
|
||||
|
||||
/*
|
||||
* Helper function to find a cached CM ID in a list
|
||||
*/
|
||||
static ibcm_base_cm_id_t *find_cm_id(struct ib_cm_id *cm_id,
|
||||
*/
|
||||
static ibcm_base_cm_id_t *find_cm_id(struct ib_cm_id *cm_id,
|
||||
opal_list_t *list)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
@ -2139,7 +2139,7 @@ static int reply_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
|
||||
p->irepd_qp_index, (void*) endpoint));
|
||||
|
||||
ie = (ibcm_endpoint_t*) endpoint->endpoint_local_cpc_data;
|
||||
endpoint->rem_info.rem_qps[p->irepd_qp_index].rem_psn =
|
||||
endpoint->rem_info.rem_qps[p->irepd_qp_index].rem_psn =
|
||||
event->param.rep_rcvd.starting_psn;
|
||||
endpoint->rem_info.rem_index = p->irepd_ep_index;
|
||||
|
||||
@ -2253,7 +2253,7 @@ static int ready_to_use_received(ibcm_listen_cm_id_t *h,
|
||||
static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
|
||||
{
|
||||
enum ib_cm_rej_reason reason = event->param.rej_rcvd.reason;
|
||||
ibcm_reject_reason_t *rej_reason =
|
||||
ibcm_reject_reason_t *rej_reason =
|
||||
(ibcm_reject_reason_t *) event->param.rej_rcvd.ari;
|
||||
|
||||
TIMER_START(REJECT_RECEIVED);
|
||||
@ -2268,7 +2268,7 @@ static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
|
||||
(ibcm_req_data_t*) event->private_data;
|
||||
ibcm_request_t *request = my_private_data->ireqd_request;
|
||||
mca_btl_openib_endpoint_t *endpoint = request->endpoint;
|
||||
ibcm_endpoint_t *ie = (ibcm_endpoint_t*)
|
||||
ibcm_endpoint_t *ie = (ibcm_endpoint_t*)
|
||||
endpoint->endpoint_local_cpc_data;
|
||||
|
||||
BTL_VERBOSE(("got WRONG_DIRECTION reject, endpoint: %p, pid %d, ep_index %d, qp_index %d",
|
||||
@ -2283,9 +2283,9 @@ static int reject_received(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
|
||||
/* Remove from the global pending_requests list because we
|
||||
no longer need to handle errors for it */
|
||||
BTL_VERBOSE(("reply received cm id %p -- original cached req %p",
|
||||
(void*)cmh->listen_cm_id,
|
||||
(void*)cmh->listen_cm_id,
|
||||
(void*)request));
|
||||
opal_list_remove_item(&ibcm_pending_requests,
|
||||
opal_list_remove_item(&ibcm_pending_requests,
|
||||
&(request->super.super));
|
||||
|
||||
/* We ack the event and then destroy the CM ID (you *must*
|
||||
@ -2326,13 +2326,13 @@ static int request_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
|
||||
if (IBV_WC_RESP_TIMEOUT_ERR != event->param.send_status) {
|
||||
orte_show_help("help-mpi-btl-openib-cpc-ibcm.txt",
|
||||
"unhandled error", true,
|
||||
"request", orte_process_info.nodename,
|
||||
"request", orte_process_info.nodename,
|
||||
event->param.send_status);
|
||||
} else {
|
||||
ibcm_request_t *req;
|
||||
BTL_ERROR(("Got timeout in IBCM request (CM ID: %p)",
|
||||
BTL_ERROR(("Got timeout in IBCM request (CM ID: %p)",
|
||||
(void*)event->cm_id));
|
||||
req = (ibcm_request_t*) find_cm_id(event->cm_id,
|
||||
req = (ibcm_request_t*) find_cm_id(event->cm_id,
|
||||
&ibcm_pending_requests);
|
||||
if (NULL == req) {
|
||||
orte_show_help("help-mpi-btl-openib-cpc-ibcm.txt",
|
||||
@ -2345,7 +2345,7 @@ static int request_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
|
||||
|
||||
/* Communicate to the upper layer that the connection on this
|
||||
endpoint has failed */
|
||||
ompi_btl_openib_fd_run_in_main(mca_btl_openib_endpoint_invoke_error,
|
||||
ompi_btl_openib_fd_run_in_main(mca_btl_openib_endpoint_invoke_error,
|
||||
endpoint);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -2358,13 +2358,13 @@ static int reply_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
|
||||
if (IBV_WC_RESP_TIMEOUT_ERR != event->param.send_status) {
|
||||
orte_show_help("help-mpi-btl-openib-cpc-ibcm.txt",
|
||||
"unhandled error", true,
|
||||
"reply", orte_process_info.nodename,
|
||||
"reply", orte_process_info.nodename,
|
||||
event->param.send_status);
|
||||
} else {
|
||||
ibcm_reply_t *rep;
|
||||
BTL_ERROR(("Got timeout in IBCM reply (id: %p)",
|
||||
(void*)event->cm_id));
|
||||
rep = (ibcm_reply_t*) find_cm_id(event->cm_id,
|
||||
rep = (ibcm_reply_t*) find_cm_id(event->cm_id,
|
||||
&ibcm_pending_replies);
|
||||
if (NULL == rep) {
|
||||
orte_show_help("help-mpi-btl-openib-cpc-ibcm.txt",
|
||||
@ -2377,7 +2377,7 @@ static int reply_error(ibcm_listen_cm_id_t *cmh, struct ib_cm_event *event)
|
||||
|
||||
/* Communicate to the upper layer that the connection on this
|
||||
endpoint has failed */
|
||||
ompi_btl_openib_fd_run_in_main(mca_btl_openib_endpoint_invoke_error,
|
||||
ompi_btl_openib_fd_run_in_main(mca_btl_openib_endpoint_invoke_error,
|
||||
endpoint);
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
@ -2390,7 +2390,7 @@ static void *ibcm_event_dispatch(int fd, int flags, void *context)
|
||||
ibcm_listen_cm_id_t *cmh = (ibcm_listen_cm_id_t*) context;
|
||||
struct ib_cm_event *e = NULL;
|
||||
|
||||
OPAL_OUTPUT((-1, "ibcm dispatch: on device 0x%" PRIx64", fd %d",
|
||||
OPAL_OUTPUT((-1, "ibcm dispatch: on device 0x%" PRIx64", fd %d",
|
||||
(uint64_t) cmh->cm_device, fd));
|
||||
TIMER_START(CM_GET_EVENT);
|
||||
/* Blocks until next event, which should be immediately (because
|
||||
@ -2415,19 +2415,19 @@ static void *ibcm_event_dispatch(int fd, int flags, void *context)
|
||||
/* Incoming request */
|
||||
rc = request_received(cmh, e);
|
||||
break;
|
||||
|
||||
|
||||
case IB_CM_REP_RECEIVED:
|
||||
OPAL_OUTPUT((-1, "ibcm dispatch: reply received on fd %d", fd));
|
||||
/* Reply received */
|
||||
rc = reply_received(cmh, e);
|
||||
break;
|
||||
|
||||
|
||||
case IB_CM_RTU_RECEIVED:
|
||||
OPAL_OUTPUT((-1, "ibcm dispatch: RTU received on fd %d", fd));
|
||||
/* Ready to use! */
|
||||
rc = ready_to_use_received(cmh, e);
|
||||
break;
|
||||
|
||||
|
||||
case IB_CM_REJ_RECEIVED:
|
||||
OPAL_OUTPUT((-1, "ibcm dispatch: reject received on fd %d", fd));
|
||||
/* Rejected connection */
|
||||
@ -2436,19 +2436,19 @@ static void *ibcm_event_dispatch(int fd, int flags, void *context)
|
||||
ID could be freed */
|
||||
want_ack = false;
|
||||
break;
|
||||
|
||||
|
||||
case IB_CM_REQ_ERROR:
|
||||
OPAL_OUTPUT((-1, "ibcm dispatch: request error received on fd %d", fd));
|
||||
/* Request error */
|
||||
rc = request_error(cmh, e);
|
||||
break;
|
||||
|
||||
|
||||
case IB_CM_REP_ERROR:
|
||||
OPAL_OUTPUT((-1, "ibcm dispatch: reply error received on fd %d", fd));
|
||||
/* Reply error */
|
||||
rc = reply_error(cmh, e);
|
||||
break;
|
||||
|
||||
|
||||
case IB_CM_DREQ_RECEIVED:
|
||||
case IB_CM_DREP_RECEIVED:
|
||||
case IB_CM_DREQ_ERROR:
|
||||
@ -2459,7 +2459,7 @@ static void *ibcm_event_dispatch(int fd, int flags, void *context)
|
||||
/* We don't care */
|
||||
rc = OMPI_SUCCESS;
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
/* This would be odd */
|
||||
OPAL_OUTPUT((-1, "ibcm dispatch: unhandled event received on fd %d", fd));
|
||||
|
@ -2,9 +2,9 @@
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
@ -5,21 +5,21 @@
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* reserved.
|
||||
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -39,7 +39,7 @@
|
||||
#include "ompi/mca/dpm/dpm.h"
|
||||
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "btl_openib_endpoint.h"
|
||||
#include "btl_openib_proc.h"
|
||||
#include "connect/connect.h"
|
||||
#include "orte/util/show_help.h"
|
||||
@ -88,7 +88,7 @@ static int oob_priority = 50;
|
||||
static bool rml_recv_posted = false;
|
||||
|
||||
static void oob_component_register(void);
|
||||
static int oob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
static int oob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
ompi_btl_openib_connect_base_module_t **cpc);
|
||||
static int oob_component_finalize(void);
|
||||
|
||||
@ -102,14 +102,14 @@ static int qp_connect_all(mca_btl_base_endpoint_t* endpoint);
|
||||
static int qp_create_all(mca_btl_base_endpoint_t* endpoint);
|
||||
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
struct ibv_srq *srq, uint32_t max_recv_wr, uint32_t max_send_wr);
|
||||
static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
uint8_t message_type);
|
||||
|
||||
static void rml_send_cb(int status, orte_process_name_t* endpoint,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
static void rml_send_cb(int status, orte_process_name_t* endpoint,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata);
|
||||
|
||||
#if (ENABLE_DYNAMIC_SL)
|
||||
@ -172,7 +172,7 @@ static void oob_component_register(void)
|
||||
* Init function. Post non-blocking RML receive to accept incoming
|
||||
* connection requests.
|
||||
*/
|
||||
static int oob_component_query(mca_btl_openib_module_t *btl,
|
||||
static int oob_component_query(mca_btl_openib_module_t *btl,
|
||||
ompi_btl_openib_connect_base_module_t **cpc)
|
||||
{
|
||||
int rc;
|
||||
@ -180,7 +180,7 @@ static int oob_component_query(mca_btl_openib_module_t *btl,
|
||||
/* If we have the transport_type member, check to ensure we're on
|
||||
IB (this CPC will not work with iWarp). If we do not have the
|
||||
transport_type member, then we must be < OFED v1.2, and
|
||||
therefore we must be IB. */
|
||||
therefore we must be IB. */
|
||||
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
|
||||
if (BTL_OPENIB_CONNECT_BASE_CHECK_IF_NOT_IB(btl)) {
|
||||
opal_output_verbose(5, mca_btl_base_output,
|
||||
@ -202,7 +202,7 @@ static int oob_component_query(mca_btl_openib_module_t *btl,
|
||||
ensure to only post it *once*, because another btl may have
|
||||
come in before this and already posted it. */
|
||||
if (!rml_recv_posted) {
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
OMPI_RML_TAG_OPENIB,
|
||||
ORTE_RML_PERSISTENT,
|
||||
rml_recv_cb,
|
||||
@ -261,7 +261,7 @@ static int oob_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
|
||||
endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
|
||||
if (OMPI_SUCCESS !=
|
||||
(rc = send_connect_data(endpoint, ENDPOINT_CONNECT_REQUEST))) {
|
||||
BTL_ERROR(("error sending connect request, error code %d", rc));
|
||||
BTL_ERROR(("error sending connect request, error code %d", rc));
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -330,7 +330,7 @@ static int reply_start_connect(mca_btl_openib_endpoint_t *endpoint,
|
||||
|
||||
/* Set the remote side info */
|
||||
set_remote_info(endpoint, rem_info);
|
||||
|
||||
|
||||
/* Connect to remote endpoint qp's */
|
||||
if (OMPI_SUCCESS != (rc = qp_connect_all(endpoint))) {
|
||||
return rc;
|
||||
@ -352,14 +352,14 @@ static int set_remote_info(mca_btl_base_endpoint_t* endpoint,
|
||||
mca_btl_openib_rem_info_t* rem_info)
|
||||
{
|
||||
/* copy the rem_info stuff */
|
||||
memcpy(&((mca_btl_openib_endpoint_t*) endpoint)->rem_info,
|
||||
rem_info, sizeof(mca_btl_openib_rem_info_t));
|
||||
|
||||
memcpy(&((mca_btl_openib_endpoint_t*) endpoint)->rem_info,
|
||||
rem_info, sizeof(mca_btl_openib_rem_info_t));
|
||||
|
||||
/* copy over the rem qp info */
|
||||
memcpy(endpoint->rem_info.rem_qps,
|
||||
rem_info->rem_qps, sizeof(mca_btl_openib_rem_qp_info_t) *
|
||||
rem_info->rem_qps, sizeof(mca_btl_openib_rem_qp_info_t) *
|
||||
mca_btl_openib_component.num_qps);
|
||||
|
||||
|
||||
BTL_VERBOSE(("Setting QP info, LID = %d", endpoint->rem_info.rem_lid));
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
@ -433,7 +433,7 @@ static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint)
|
||||
IBV_QP_MIN_RNR_TIMER)) {
|
||||
BTL_ERROR(("error modifing QP to RTR errno says %s",
|
||||
strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
attr.qp_state = IBV_QPS_RTS;
|
||||
attr.timeout = mca_btl_openib_component.ib_timeout;
|
||||
@ -482,7 +482,7 @@ static int qp_create_all(mca_btl_base_endpoint_t* endpoint)
|
||||
if(0 == pp_qp_num && true == endpoint->use_eager_rdma)
|
||||
pp_qp_num = 1;
|
||||
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
|
||||
struct ibv_srq *srq = NULL;
|
||||
uint32_t max_recv_wr, max_send_wr;
|
||||
int32_t rd_rsv, rd_num_credits;
|
||||
@ -540,7 +540,7 @@ static uint32_t max_inline_size(int qp, mca_btl_openib_device_t *device)
|
||||
* Create the local side of one qp. The remote side will be connected
|
||||
* later.
|
||||
*/
|
||||
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
struct ibv_srq *srq, uint32_t max_recv_wr, uint32_t max_send_wr)
|
||||
{
|
||||
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
|
||||
@ -556,7 +556,7 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ];
|
||||
init_attr.recv_cq = openib_btl->device->ib_cq[qp_cq_prio(qp)];
|
||||
init_attr.srq = srq;
|
||||
init_attr.cap.max_inline_data = req_inline =
|
||||
init_attr.cap.max_inline_data = req_inline =
|
||||
max_inline_size(qp, openib_btl->device);
|
||||
init_attr.cap.max_send_sge = 1;
|
||||
init_attr.cap.max_recv_sge = 1; /* we do not use SG list */
|
||||
@ -567,11 +567,11 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
}
|
||||
init_attr.cap.max_send_wr = max_send_wr;
|
||||
|
||||
my_qp = ibv_create_qp(openib_btl->device->ib_pd, &init_attr);
|
||||
|
||||
if (NULL == my_qp) {
|
||||
BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
my_qp = ibv_create_qp(openib_btl->device->ib_pd, &init_attr);
|
||||
|
||||
if (NULL == my_qp) {
|
||||
BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
endpoint->qps[qp].qp->lcl_qp = my_qp;
|
||||
|
||||
@ -585,21 +585,21 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
} else {
|
||||
endpoint->qps[qp].ib_inline_max = req_inline;
|
||||
}
|
||||
|
||||
|
||||
attr.qp_state = IBV_QPS_INIT;
|
||||
attr.pkey_index = openib_btl->pkey_index;
|
||||
attr.port_num = openib_btl->port_num;
|
||||
attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
|
||||
|
||||
if (ibv_modify_qp(endpoint->qps[qp].qp->lcl_qp,
|
||||
&attr,
|
||||
IBV_QP_STATE |
|
||||
IBV_QP_PKEY_INDEX |
|
||||
IBV_QP_PORT |
|
||||
IBV_QP_ACCESS_FLAGS )) {
|
||||
BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
if (ibv_modify_qp(endpoint->qps[qp].qp->lcl_qp,
|
||||
&attr,
|
||||
IBV_QP_STATE |
|
||||
IBV_QP_PKEY_INDEX |
|
||||
IBV_QP_PORT |
|
||||
IBV_QP_ACCESS_FLAGS )) {
|
||||
BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno)));
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/* Setup meta data on the endpoint */
|
||||
endpoint->qps[qp].qp->lcl_psn = lrand48() & 0xffffff;
|
||||
@ -612,18 +612,18 @@ static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
|
||||
/*
|
||||
* RML send connect information to remote endpoint
|
||||
*/
|
||||
static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
uint8_t message_type)
|
||||
{
|
||||
opal_buffer_t* buffer = OBJ_NEW(opal_buffer_t);
|
||||
int rc;
|
||||
|
||||
|
||||
if (NULL == buffer) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/* pack the info in the send buffer */
|
||||
/* pack the info in the send buffer */
|
||||
BTL_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT8));
|
||||
rc = opal_dss.pack(buffer, &message_type, 1, OPAL_UINT8);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -659,7 +659,7 @@ static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
if (message_type != ENDPOINT_CONNECT_ACK) {
|
||||
int qp;
|
||||
/* stuff all the QP info into the buffer */
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
BTL_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32));
|
||||
rc = opal_dss.pack(buffer, &endpoint->qps[qp].qp->lcl_qp->qp_num,
|
||||
1, OPAL_UINT32);
|
||||
@ -669,13 +669,13 @@ static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
}
|
||||
BTL_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32));
|
||||
rc = opal_dss.pack(buffer, &endpoint->qps[qp].qp->lcl_psn, 1,
|
||||
OPAL_UINT32);
|
||||
OPAL_UINT32);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
BTL_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT16));
|
||||
rc = opal_dss.pack(buffer, &endpoint->endpoint_btl->lid, 1, OPAL_UINT16);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -698,7 +698,7 @@ static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
}
|
||||
|
||||
/* send to remote endpoint */
|
||||
rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_ompi->proc_name,
|
||||
rc = orte_rml.send_buffer_nb(&endpoint->endpoint_proc->proc_ompi->proc_name,
|
||||
buffer, OMPI_RML_TAG_OPENIB, 0,
|
||||
rml_send_cb, NULL);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -706,7 +706,7 @@ static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
return rc;
|
||||
}
|
||||
BTL_VERBOSE(("Sent QP Info, LID = %d, SUBNET = %" PRIx64 "\n",
|
||||
endpoint->endpoint_btl->lid,
|
||||
endpoint->endpoint_btl->lid,
|
||||
endpoint->subnet_id));
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
@ -717,8 +717,8 @@ static int send_connect_data(mca_btl_base_endpoint_t* endpoint,
|
||||
* Callback when we have finished RML sending the connect data to a
|
||||
* remote peer
|
||||
*/
|
||||
static void rml_send_cb(int status, orte_process_name_t* endpoint,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
static void rml_send_cb(int status, orte_process_name_t* endpoint,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
OBJ_RELEASE(buffer);
|
||||
@ -730,8 +730,8 @@ static void rml_send_cb(int status, orte_process_name_t* endpoint,
|
||||
* and if this endpoint is trying to connect, reply with our QP info,
|
||||
* otherwise try to modify QP's and establish reliable connection
|
||||
*/
|
||||
static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
opal_buffer_t* buffer, orte_rml_tag_t tag,
|
||||
void* cbdata)
|
||||
{
|
||||
mca_btl_openib_proc_t *ib_proc;
|
||||
@ -744,9 +744,9 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
mca_btl_openib_rem_info_t rem_info;
|
||||
uint8_t message_type;
|
||||
bool master;
|
||||
|
||||
/* start by unpacking data first so we know who is knocking at
|
||||
our door */
|
||||
|
||||
/* start by unpacking data first so we know who is knocking at
|
||||
our door */
|
||||
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8));
|
||||
rc = opal_dss.unpack(buffer, &message_type, &cnt, OPAL_UINT8);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -754,7 +754,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
mca_btl_openib_endpoint_invoke_error(NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT64));
|
||||
rc = opal_dss.unpack(buffer, &rem_info.rem_subnet_id, &cnt, OPAL_UINT64);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -762,7 +762,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
mca_btl_openib_endpoint_invoke_error(NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if (ENDPOINT_CONNECT_REQUEST != message_type) {
|
||||
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
|
||||
rc = opal_dss.unpack(buffer, &lcl_qp, &cnt, OPAL_UINT32);
|
||||
@ -780,14 +780,14 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
}
|
||||
}
|
||||
if (ENDPOINT_CONNECT_ACK != message_type) {
|
||||
int qp;
|
||||
int qp;
|
||||
/* get ready for the data */
|
||||
rem_info.rem_qps =
|
||||
(mca_btl_openib_rem_qp_info_t*) malloc(sizeof(mca_btl_openib_rem_qp_info_t) *
|
||||
rem_info.rem_qps =
|
||||
(mca_btl_openib_rem_qp_info_t*) malloc(sizeof(mca_btl_openib_rem_qp_info_t) *
|
||||
mca_btl_openib_component.num_qps);
|
||||
|
||||
|
||||
/* unpack all the qp info */
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
|
||||
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
|
||||
rc = opal_dss.unpack(buffer, &rem_info.rem_qps[qp].rem_qp_num, &cnt,
|
||||
OPAL_UINT32);
|
||||
@ -805,7 +805,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16));
|
||||
rc = opal_dss.unpack(buffer, &rem_info.rem_lid, &cnt, OPAL_UINT16);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
@ -828,14 +828,14 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
BTL_VERBOSE(("Received QP Info, LID = %d, SUBNET = %" PRIx64 "\n",
|
||||
rem_info.rem_lid,
|
||||
rem_info.rem_lid,
|
||||
rem_info.rem_subnet_id));
|
||||
|
||||
|
||||
master = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME,
|
||||
process_name) > 0 ? true : false;
|
||||
|
||||
|
||||
/* Need to protect the ib_procs list */
|
||||
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
|
||||
|
||||
@ -845,16 +845,16 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
opal_list_get_end(&mca_btl_openib_component.ib_procs);
|
||||
ib_proc = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
|
||||
bool found = false;
|
||||
|
||||
|
||||
if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
|
||||
&ib_proc->proc_ompi->proc_name, process_name) != OPAL_EQUAL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if (ENDPOINT_CONNECT_REQUEST != message_type) {
|
||||
/* This is a reply message. Try to get the endpoint
|
||||
instance the reply belongs to */
|
||||
for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
|
||||
for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if (ib_endpoint->qps[0].qp->lcl_qp != NULL &&
|
||||
lcl_lid == ib_endpoint->endpoint_btl->lid &&
|
||||
@ -872,7 +872,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
mca_btl_openib_endpoint_t *ib_endpoint_found = NULL;
|
||||
int master_first_closed = -1;
|
||||
|
||||
for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
|
||||
for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
|
||||
ib_endpoint = ib_proc->proc_endpoints[i];
|
||||
if (ib_endpoint->subnet_id != rem_info.rem_subnet_id ||
|
||||
(ib_endpoint->endpoint_state != MCA_BTL_IB_CONNECTING
|
||||
@ -894,7 +894,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
break; /* Found one. No point to continue */
|
||||
}
|
||||
ib_endpoint = ib_endpoint_found;
|
||||
|
||||
|
||||
if (found && master &&
|
||||
MCA_BTL_IB_CLOSED == ib_endpoint->endpoint_state ) {
|
||||
/* since this is master and no endpoints found in
|
||||
@ -912,17 +912,17 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!found) {
|
||||
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
||||
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
||||
mca_btl_openib_endpoint_invoke_error(NULL);
|
||||
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
|
||||
return;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
|
||||
endpoint_state = ib_endpoint->endpoint_state;
|
||||
|
||||
|
||||
/* Update status */
|
||||
switch (endpoint_state) {
|
||||
case MCA_BTL_IB_CLOSED :
|
||||
@ -933,17 +933,17 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
if (master) {
|
||||
rc = reply_start_connect(ib_endpoint, &rem_info);
|
||||
} else {
|
||||
rc = oob_module_start_connect(ib_endpoint->endpoint_local_cpc,
|
||||
rc = oob_module_start_connect(ib_endpoint->endpoint_local_cpc,
|
||||
ib_endpoint);
|
||||
}
|
||||
|
||||
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
BTL_ERROR(("error in endpoint reply start connect"));
|
||||
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
/* As long as we expect a message from the peer (in order
|
||||
to set up the connection) let the event engine poll the
|
||||
RML events. Note: we increment it once peer active
|
||||
@ -951,16 +951,16 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
opal_progress_event_users_increment();
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
|
||||
|
||||
case MCA_BTL_IB_CONNECTING :
|
||||
set_remote_info(ib_endpoint, &rem_info);
|
||||
if (OMPI_SUCCESS != (rc = qp_connect_all(ib_endpoint))) {
|
||||
BTL_ERROR(("endpoint connect error: %d", rc));
|
||||
BTL_ERROR(("endpoint connect error: %d", rc));
|
||||
mca_btl_openib_endpoint_invoke_error(ib_endpoint);
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
if (master) {
|
||||
ib_endpoint->endpoint_state = MCA_BTL_IB_WAITING_ACK;
|
||||
|
||||
@ -974,20 +974,20 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
/* cpc complete unlock the endpoint */
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case MCA_BTL_IB_WAITING_ACK:
|
||||
/* Tell main BTL that we're done */
|
||||
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
|
||||
/* cpc complete unlock the endpoint */
|
||||
break;
|
||||
|
||||
|
||||
case MCA_BTL_IB_CONNECT_ACK:
|
||||
send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
|
||||
/* Tell main BTL that we're done */
|
||||
mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
|
||||
/* cpc complete unlock the endpoint */
|
||||
break;
|
||||
|
||||
|
||||
case MCA_BTL_IB_CONNECTED:
|
||||
OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
|
||||
break;
|
||||
|
@ -2,9 +2,9 @@
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
@ -108,7 +108,7 @@ typedef struct {
|
||||
|
||||
static void rdmacm_contents_constructor(rdmacm_contents_t *contents);
|
||||
static void rdmacm_contents_destructor(rdmacm_contents_t *contents);
|
||||
OBJ_CLASS_INSTANCE(rdmacm_contents_t, opal_list_item_t,
|
||||
OBJ_CLASS_INSTANCE(rdmacm_contents_t, opal_list_item_t,
|
||||
rdmacm_contents_constructor,
|
||||
rdmacm_contents_destructor);
|
||||
|
||||
@ -142,7 +142,7 @@ typedef struct {
|
||||
|
||||
static void id_context_constructor(id_context_t *context);
|
||||
static void id_context_destructor(id_context_t *context);
|
||||
OBJ_CLASS_INSTANCE(id_context_t, opal_list_item_t,
|
||||
OBJ_CLASS_INSTANCE(id_context_t, opal_list_item_t,
|
||||
id_context_constructor,
|
||||
id_context_destructor);
|
||||
|
||||
@ -225,7 +225,7 @@ static void rdmacm_contents_destructor(rdmacm_contents_t *contents)
|
||||
/*
|
||||
* Invoked by main thread
|
||||
*
|
||||
* Sets up any rdma_cm specific commandline params
|
||||
* Sets up any rdma_cm specific commandline params
|
||||
*/
|
||||
static void rdmacm_component_register(void)
|
||||
{
|
||||
@ -288,7 +288,7 @@ static void rdmacm_component_register(void)
|
||||
static char *stringify(uint32_t addr)
|
||||
{
|
||||
char *line = (char *) malloc(64);
|
||||
asprintf(&line, "%d.%d.%d.%d (0x%x)",
|
||||
asprintf(&line, "%d.%d.%d.%d (0x%x)",
|
||||
#if defined(WORDS_BIGENDIAN)
|
||||
(addr >> 24),
|
||||
(addr >> 16) & 0xff,
|
||||
@ -306,7 +306,7 @@ static char *stringify(uint32_t addr)
|
||||
|
||||
/*
|
||||
* Invoked by service thread
|
||||
*
|
||||
*
|
||||
* This function traverses the list of endpoints associated with the
|
||||
* device and determines which of them the remote side is attempting
|
||||
* to connect to. This is determined based on the local endpoint's
|
||||
@ -360,7 +360,7 @@ static mca_btl_openib_endpoint_t *rdmacm_find_endpoint(rdmacm_contents_t *conten
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns max inline size for qp #N
|
||||
* Returns max inline size for qp #N
|
||||
*/
|
||||
static uint32_t max_inline_size(int qp, mca_btl_openib_device_t *device)
|
||||
{
|
||||
@ -425,7 +425,7 @@ static int rdmacm_setup_qp(rdmacm_contents_t *contents,
|
||||
attr.cap.max_recv_wr = 0;
|
||||
}
|
||||
attr.cap.max_send_wr = max_send_wr;
|
||||
attr.cap.max_inline_data = req_inline =
|
||||
attr.cap.max_inline_data = req_inline =
|
||||
max_inline_size(qpnum, contents->openib_btl->device);
|
||||
attr.cap.max_send_sge = 1;
|
||||
attr.cap.max_recv_sge = 1; /* we do not use SG list */
|
||||
@ -473,7 +473,7 @@ out:
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
/*
|
||||
* Invoked by both main and service threads
|
||||
*
|
||||
* To avoid all kinds of nasty race conditions, we only allow
|
||||
@ -490,7 +490,7 @@ static bool i_initiate(uint32_t local_ipaddr, uint16_t local_port,
|
||||
char *a = stringify(local_ipaddr);
|
||||
char *b = stringify(remote_ipaddr);
|
||||
#endif
|
||||
|
||||
|
||||
if (local_ipaddr > remote_ipaddr ||
|
||||
(local_ipaddr == remote_ipaddr && local_port < remote_port)) {
|
||||
OPAL_OUTPUT((-1, "i_initiate (I WIN): local ipaddr %s, remote ipaddr %s",
|
||||
@ -564,10 +564,10 @@ static int rdmacm_client_connect_one(rdmacm_contents_t *contents,
|
||||
* RDMA_CM_EVENT_ADDR_RESOLVED event will occur on the local event
|
||||
* handler.
|
||||
*/
|
||||
OPAL_OUTPUT((-1, "MAIN Resolving id: from IP %s:%d to IP %s:%d",
|
||||
a = stringify(contents->ipaddr),
|
||||
OPAL_OUTPUT((-1, "MAIN Resolving id: from IP %s:%d to IP %s:%d",
|
||||
a = stringify(contents->ipaddr),
|
||||
contents->tcp_port,
|
||||
b = stringify(message->ipaddr),
|
||||
b = stringify(message->ipaddr),
|
||||
message->tcp_port));
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
free(a);
|
||||
@ -629,7 +629,7 @@ out:
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* Invoked by main thread
|
||||
*
|
||||
* Connect method called by the upper layers to connect the local
|
||||
@ -651,7 +651,7 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
|
||||
invoked from the event_handler (to initiate connections in the
|
||||
Right direction), where we don't have the CPC, so it'll be
|
||||
NULL. */
|
||||
local_message =
|
||||
local_message =
|
||||
(modex_message_t *) endpoint->endpoint_local_cpc->data.cbm_modex_message;
|
||||
message = (modex_message_t *)
|
||||
endpoint->endpoint_remote_cpc_data->cbm_modex_message;
|
||||
@ -698,8 +698,8 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
|
||||
|
||||
/* Are we the initiator? Or do we expect this connect request to
|
||||
be rejected? */
|
||||
endpoint->endpoint_initiator =
|
||||
i_initiate(contents->ipaddr, contents->tcp_port,
|
||||
endpoint->endpoint_initiator =
|
||||
i_initiate(contents->ipaddr, contents->tcp_port,
|
||||
message->ipaddr, message->tcp_port);
|
||||
OPAL_OUTPUT((-1, "MAIN Start connect; ep=%p (%p), I %s the initiator to %s",
|
||||
(void*) endpoint,
|
||||
@ -711,7 +711,7 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
|
||||
if (contents->endpoint->endpoint_initiator) {
|
||||
/* Initiator needs a CTS frag (non-initiator will have a CTS
|
||||
frag allocated later) */
|
||||
if (OMPI_SUCCESS !=
|
||||
if (OMPI_SUCCESS !=
|
||||
(rc = ompi_btl_openib_connect_base_alloc_cts(contents->endpoint))) {
|
||||
BTL_ERROR(("Failed to alloc CTS frag"));
|
||||
goto out;
|
||||
@ -720,7 +720,7 @@ static int rdmacm_module_start_connect(ompi_btl_openib_connect_base_module_t *cp
|
||||
for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
rc = rdmacm_client_connect_one(contents, message, qp);
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
BTL_ERROR(("rdmacm_client_connect_one error (real QP %d)",
|
||||
BTL_ERROR(("rdmacm_client_connect_one error (real QP %d)",
|
||||
qp));
|
||||
goto out;
|
||||
}
|
||||
@ -750,7 +750,7 @@ out:
|
||||
static void *show_help_cant_find_endpoint(void *context)
|
||||
{
|
||||
char *msg;
|
||||
cant_find_endpoint_context_t *c =
|
||||
cant_find_endpoint_context_t *c =
|
||||
(cant_find_endpoint_context_t*) context;
|
||||
|
||||
if (NULL != c) {
|
||||
@ -774,7 +774,7 @@ static void *show_help_cant_find_endpoint(void *context)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* Invoked by service thread
|
||||
*
|
||||
* The server thread will handle the incoming connection requests and
|
||||
@ -822,12 +822,12 @@ static int handle_connect_request(struct rdma_cm_event *event)
|
||||
}
|
||||
|
||||
message = (modex_message_t *) endpoint->endpoint_remote_cpc_data->cbm_modex_message;
|
||||
endpoint->endpoint_initiator =
|
||||
endpoint->endpoint_initiator =
|
||||
i_initiate(contents->ipaddr, contents->tcp_port,
|
||||
message->ipaddr, rem_port);
|
||||
|
||||
BTL_VERBOSE(("ep state = %d, local ipaddr = %x, remote ipaddr = %x, local port = %d, remote port = %d",
|
||||
endpoint->endpoint_state, contents->ipaddr, message->ipaddr,
|
||||
endpoint->endpoint_state, contents->ipaddr, message->ipaddr,
|
||||
contents->tcp_port, rem_port));
|
||||
|
||||
OPAL_OUTPUT((-1, "SERVICE in handle_connect_request; ep=%p (%p), I still %s the initiator to %s",
|
||||
@ -879,7 +879,7 @@ static int handle_connect_request(struct rdma_cm_event *event)
|
||||
if (mca_btl_openib_component.credits_qp == qpnum) {
|
||||
struct ibv_recv_wr *bad_wr, *wr;
|
||||
|
||||
if (OMPI_SUCCESS !=
|
||||
if (OMPI_SUCCESS !=
|
||||
ompi_btl_openib_connect_base_alloc_cts(endpoint)) {
|
||||
BTL_ERROR(("Failed to alloc CTS frag"));
|
||||
goto out1;
|
||||
@ -888,7 +888,7 @@ static int handle_connect_request(struct rdma_cm_event *event)
|
||||
assert(NULL != wr);
|
||||
wr->next = NULL;
|
||||
|
||||
if (0 != ibv_post_recv(endpoint->qps[qpnum].qp->lcl_qp,
|
||||
if (0 != ibv_post_recv(endpoint->qps[qpnum].qp->lcl_qp,
|
||||
wr, &bad_wr)) {
|
||||
BTL_ERROR(("failed to post CTS recv buffer"));
|
||||
goto out1;
|
||||
@ -923,10 +923,10 @@ static int handle_connect_request(struct rdma_cm_event *event)
|
||||
/* See rdma_connect(3) for a description of these 2 values. We
|
||||
ensure to pass these values around via the modex so that we can
|
||||
compute the values properly. */
|
||||
conn_param.responder_resources =
|
||||
conn_param.responder_resources =
|
||||
mymin(contents->openib_btl->device->ib_dev_attr.max_qp_rd_atom,
|
||||
message->device_max_qp_init_rd_atom);
|
||||
conn_param.initiator_depth =
|
||||
conn_param.initiator_depth =
|
||||
mymin(contents->openib_btl->device->ib_dev_attr.max_qp_init_rd_atom,
|
||||
message->device_max_qp_rd_atom);
|
||||
conn_param.retry_count = mca_btl_openib_component.ib_retry_count;
|
||||
@ -1001,9 +1001,9 @@ static void *call_disconnect_callback(void *v)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* Invoked by main thread
|
||||
*
|
||||
*
|
||||
* Runs *while* the progress thread is running. We can't stop the
|
||||
* progress thread because this function may be invoked to kill a
|
||||
* specific endpoint that was the result of MPI-2 dynamics (i.e., this
|
||||
@ -1033,12 +1033,12 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
|
||||
opal_mutex_lock(&client_list_lock);
|
||||
num_to_wait_for = disconnect_callbacks = 0;
|
||||
for (item = opal_list_get_first(&client_list);
|
||||
item != opal_list_get_end(&client_list);
|
||||
item != opal_list_get_end(&client_list);
|
||||
item = opal_list_get_next(item)) {
|
||||
rdmacm_contents_t *contents = (rdmacm_contents_t *) item;
|
||||
|
||||
if (endpoint == contents->endpoint) {
|
||||
while (NULL !=
|
||||
while (NULL !=
|
||||
(item2 = opal_list_remove_first(&(contents->ids)))) {
|
||||
/* Fun race condition: we cannot call
|
||||
rdma_disconnect() here in the main thread, because
|
||||
@ -1131,7 +1131,7 @@ static int rdmacm_connect_endpoint(id_context_t *context,
|
||||
BTL_ERROR(("Can't find endpoint"));
|
||||
return OMPI_ERR_NOT_FOUND;
|
||||
}
|
||||
data =
|
||||
data =
|
||||
(rdmacm_endpoint_local_cpc_data_t *)endpoint->endpoint_local_cpc_data;
|
||||
|
||||
/* Only notify the upper layers after the last QP has been
|
||||
@ -1210,7 +1210,7 @@ static int rdmacm_rejected(id_context_t *context, struct rdma_cm_event *event)
|
||||
/* Why were we rejected? */
|
||||
switch (*((reject_reason_t*) event->param.conn.private_data)) {
|
||||
case REJECT_WRONG_DIRECTION:
|
||||
OPAL_OUTPUT((-1, "SERVICE A good reject! for qp %d, id 0x%p",
|
||||
OPAL_OUTPUT((-1, "SERVICE A good reject! for qp %d, id 0x%p",
|
||||
context->qpnum, (void*) context->id));
|
||||
rdmacm_destroy_dummy_qp(context);
|
||||
break;
|
||||
@ -1261,10 +1261,10 @@ out:
|
||||
/*
|
||||
* Runs in service thread
|
||||
*/
|
||||
static int create_dummy_cq(rdmacm_contents_t *contents,
|
||||
static int create_dummy_cq(rdmacm_contents_t *contents,
|
||||
mca_btl_openib_module_t *openib_btl)
|
||||
{
|
||||
contents->dummy_cq =
|
||||
contents->dummy_cq =
|
||||
ibv_create_cq(openib_btl->device->ib_dev_context, 1, NULL, NULL, 0);
|
||||
if (NULL == contents->dummy_cq) {
|
||||
BTL_ERROR(("dummy_cq not created"));
|
||||
@ -1279,7 +1279,7 @@ out:
|
||||
/*
|
||||
* Runs in service thread
|
||||
*/
|
||||
static int create_dummy_qp(rdmacm_contents_t *contents,
|
||||
static int create_dummy_qp(rdmacm_contents_t *contents,
|
||||
struct rdma_cm_id *id, int qpnum)
|
||||
{
|
||||
struct ibv_qp_init_attr attr;
|
||||
@ -1347,7 +1347,7 @@ static int finish_connect(id_context_t *context)
|
||||
/* If we're the initiator, then setup the QP's and post the CTS
|
||||
message buffer */
|
||||
if (contents->endpoint->endpoint_initiator) {
|
||||
rc = rdmacm_setup_qp(contents, contents->endpoint,
|
||||
rc = rdmacm_setup_qp(contents, contents->endpoint,
|
||||
context->id, context->qpnum);
|
||||
if (0 != rc) {
|
||||
BTL_ERROR(("rdmacm_setup_qp error %d", rc));
|
||||
@ -1357,14 +1357,14 @@ static int finish_connect(id_context_t *context)
|
||||
if (mca_btl_openib_component.credits_qp == context->qpnum) {
|
||||
/* Post a single receive buffer on the smallest QP for the CTS
|
||||
protocol */
|
||||
|
||||
|
||||
struct ibv_recv_wr *bad_wr, *wr;
|
||||
assert(NULL != contents->endpoint->endpoint_cts_frag.super.super.base.super.ptr);
|
||||
wr = &(contents->endpoint->endpoint_cts_frag.rd_desc);
|
||||
assert(NULL != wr);
|
||||
wr->next = NULL;
|
||||
|
||||
if (0 != ibv_post_recv(contents->endpoint->qps[context->qpnum].qp->lcl_qp,
|
||||
|
||||
if (0 != ibv_post_recv(contents->endpoint->qps[context->qpnum].qp->lcl_qp,
|
||||
wr, &bad_wr)) {
|
||||
BTL_ERROR(("failed to post CTS recv buffer"));
|
||||
goto out1;
|
||||
@ -1399,10 +1399,10 @@ static int finish_connect(id_context_t *context)
|
||||
|
||||
memset(&conn_param, 0, sizeof(conn_param));
|
||||
/* See above comment about rdma_connect(3) and these two values. */
|
||||
conn_param.responder_resources =
|
||||
conn_param.responder_resources =
|
||||
mymin(contents->openib_btl->device->ib_dev_attr.max_qp_rd_atom,
|
||||
message->device_max_qp_init_rd_atom);
|
||||
conn_param.initiator_depth =
|
||||
conn_param.initiator_depth =
|
||||
mymin(contents->openib_btl->device->ib_dev_attr.max_qp_init_rd_atom,
|
||||
message->device_max_qp_rd_atom);
|
||||
conn_param.flow_control = 0;
|
||||
@ -1456,7 +1456,7 @@ out:
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* Runs in main thread
|
||||
*/
|
||||
static void *show_help_rdmacm_event_error(void *c)
|
||||
@ -1566,14 +1566,14 @@ static int event_handler(struct rdma_cm_event *event)
|
||||
found = false;
|
||||
if (OMPI_SUCCESS == ompi_btl_openib_ini_query(attr->vendor_id,
|
||||
attr->vendor_part_id,
|
||||
&ini) &&
|
||||
&ini) &&
|
||||
ini.rdmacm_reject_causes_connect_error) {
|
||||
found = true;
|
||||
}
|
||||
if (rdmacm_reject_causes_connect_error) {
|
||||
found = true;
|
||||
}
|
||||
|
||||
|
||||
if (found) {
|
||||
OPAL_OUTPUT((-1, "SERVICE Got CONNECT_ERROR, but ignored: %p", (void*) event->id));
|
||||
rc = rdmacm_destroy_dummy_qp(context);
|
||||
@ -1628,7 +1628,7 @@ static inline void rdmamcm_event_error(struct rdma_cm_event *event)
|
||||
endpoint = ((id_context_t *)event->id->context)->contents->endpoint;
|
||||
}
|
||||
|
||||
ompi_btl_openib_fd_run_in_main(mca_btl_openib_endpoint_invoke_error,
|
||||
ompi_btl_openib_fd_run_in_main(mca_btl_openib_endpoint_invoke_error,
|
||||
endpoint);
|
||||
}
|
||||
|
||||
@ -1684,7 +1684,7 @@ static void *rdmacm_event_dispatch(int fd, int flags, void *context)
|
||||
/*
|
||||
* Runs in main thread
|
||||
*
|
||||
* CPC init function - Setup all globals here
|
||||
* CPC init function - Setup all globals here
|
||||
*/
|
||||
static int rdmacm_init(mca_btl_openib_endpoint_t *endpoint)
|
||||
{
|
||||
@ -1716,14 +1716,14 @@ static int ipaddrcheck(id_context_t *context,
|
||||
* up). Unfortunately, the subnet and IP address look up needs to match or
|
||||
* there could be a mismatch if IP Aliases are being used. For more
|
||||
* information on this, please read comment above
|
||||
* mca_btl_openib_get_ip_subnet_id in btl_openib_ip.c
|
||||
* mca_btl_openib_get_ip_subnet_id in btl_openib_ip.c
|
||||
*/
|
||||
ipaddr =
|
||||
mca_btl_openib_rdma_get_ipv4addr(openib_btl->device->ib_dev_context,
|
||||
ipaddr =
|
||||
mca_btl_openib_rdma_get_ipv4addr(openib_btl->device->ib_dev_context,
|
||||
openib_btl->port_num);
|
||||
if (0 == ipaddr) {
|
||||
BTL_VERBOSE(("*** Could not find IP address for %s:%d -- is there an IP address configured for this device?",
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
openib_btl->port_num));
|
||||
return OMPI_ERR_NOT_FOUND;
|
||||
}
|
||||
@ -1735,16 +1735,16 @@ static int ipaddrcheck(id_context_t *context,
|
||||
|
||||
/* Ok, we found the IP address of this device/port. Have we
|
||||
already seen this IP address/TCP port before? */
|
||||
for (item = opal_list_get_first(&server_listener_list);
|
||||
item != opal_list_get_end(&server_listener_list);
|
||||
for (item = opal_list_get_first(&server_listener_list);
|
||||
item != opal_list_get_end(&server_listener_list);
|
||||
item = opal_list_get_next(item)) {
|
||||
rdmacm_contents_t *contents = (rdmacm_contents_t *)item;
|
||||
BTL_VERBOSE(("paddr = %x, ipaddr addr = %x",
|
||||
BTL_VERBOSE(("paddr = %x, ipaddr addr = %x",
|
||||
contents->ipaddr, ipaddr));
|
||||
if (contents->ipaddr == ipaddr &&
|
||||
contents->tcp_port == server_tcp_port) {
|
||||
str = stringify(ipaddr);
|
||||
BTL_VERBOSE(("server already listening on %s:%d",
|
||||
BTL_VERBOSE(("server already listening on %s:%d",
|
||||
str, server_tcp_port));
|
||||
free(str);
|
||||
already_exists = true;
|
||||
@ -1755,7 +1755,7 @@ static int ipaddrcheck(id_context_t *context,
|
||||
/* If we haven't seen it before, save it */
|
||||
if (!already_exists) {
|
||||
str = stringify(ipaddr);
|
||||
BTL_VERBOSE(("creating new server to listen on %s:%d",
|
||||
BTL_VERBOSE(("creating new server to listen on %s:%d",
|
||||
str, server_tcp_port));
|
||||
free(str);
|
||||
server->ipaddr = ipaddr;
|
||||
@ -1765,8 +1765,8 @@ static int ipaddrcheck(id_context_t *context,
|
||||
return already_exists ? OMPI_ERROR : OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
static int create_message(rdmacm_contents_t *server,
|
||||
mca_btl_openib_module_t *openib_btl,
|
||||
static int create_message(rdmacm_contents_t *server,
|
||||
mca_btl_openib_module_t *openib_btl,
|
||||
ompi_btl_openib_connect_base_module_data_t *data)
|
||||
{
|
||||
modex_message_t *message;
|
||||
@ -1780,14 +1780,14 @@ static int create_message(rdmacm_contents_t *server,
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
message->device_max_qp_rd_atom =
|
||||
message->device_max_qp_rd_atom =
|
||||
openib_btl->device->ib_dev_attr.max_qp_rd_atom;
|
||||
message->device_max_qp_init_rd_atom =
|
||||
message->device_max_qp_init_rd_atom =
|
||||
openib_btl->device->ib_dev_attr.max_qp_init_rd_atom;
|
||||
message->ipaddr = server->ipaddr;
|
||||
message->tcp_port = server->tcp_port;
|
||||
|
||||
OPAL_OUTPUT((-1, "Message IP address is %s, port %d",
|
||||
OPAL_OUTPUT((-1, "Message IP address is %s, port %d",
|
||||
a = stringify(message->ipaddr), message->tcp_port));
|
||||
#if OPAL_ENABLE_DEBUG
|
||||
free(a);
|
||||
@ -1798,7 +1798,7 @@ static int create_message(rdmacm_contents_t *server,
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
/*
|
||||
* Runs in main thread
|
||||
*
|
||||
* This function determines if the RDMACM is a possible cpc method and
|
||||
@ -1943,7 +1943,7 @@ out:
|
||||
} else {
|
||||
opal_output_verbose(5, mca_btl_base_output,
|
||||
"openib BTL: rmacm CPC unavailable for use on %s:%d; fatal error %d (%s)",
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
openib_btl->port_num, rc,
|
||||
opal_strerror(rc));
|
||||
}
|
||||
@ -1971,7 +1971,7 @@ static int rdmacm_component_finalize(void)
|
||||
|
||||
if (NULL != event_channel) {
|
||||
#ifndef __WINDOWS__
|
||||
rc = ompi_btl_openib_fd_unmonitor(event_channel->fd,
|
||||
rc = ompi_btl_openib_fd_unmonitor(event_channel->fd,
|
||||
rdmacm_unmonitor, (void*) &barrier);
|
||||
#endif
|
||||
if (OMPI_SUCCESS != rc) {
|
||||
@ -2048,7 +2048,7 @@ static int rdmacm_component_init(void)
|
||||
ompi_btl_openib_fd_monitor(event_channel->fd, OPAL_EV_READ,
|
||||
rdmacm_event_dispatch, NULL);
|
||||
#endif
|
||||
|
||||
|
||||
rdmacm_component_initialized = true;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -2,9 +2,9 @@
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
@ -36,7 +36,7 @@
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
static void xoob_component_register(void);
|
||||
static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
ompi_btl_openib_connect_base_module_t **cpc);
|
||||
static int xoob_component_finalize(void);
|
||||
|
||||
@ -812,7 +812,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
requested_lid, message_type);
|
||||
if ( NULL == ib_endpoint) {
|
||||
BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_REQUEST."
|
||||
" Failed to find endpoint with subnet %" PRIx64
|
||||
" Failed to find endpoint with subnet %" PRIx64
|
||||
" and LID %d",
|
||||
rem_info.rem_subnet_id,requested_lid));
|
||||
mca_btl_openib_endpoint_invoke_error(NULL);
|
||||
@ -899,7 +899,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
/* update ib_addr with remote qp number */
|
||||
ib_endpoint->ib_addr->remote_xrc_rcv_qp_num =
|
||||
ib_endpoint->rem_info.rem_qps->rem_qp_num;
|
||||
BTL_VERBOSE(("rem_info: lid %d, sid %" PRIx64
|
||||
BTL_VERBOSE(("rem_info: lid %d, sid %" PRIx64
|
||||
" ep %d %" PRIx64 "\n",
|
||||
rem_info.rem_lid,
|
||||
rem_info.rem_subnet_id,
|
||||
@ -962,7 +962,7 @@ static void xoob_rml_recv_cb(int status, orte_process_name_t* process_name,
|
||||
*/
|
||||
|
||||
/* Query for the XOOB priority - will be highest in XRC case */
|
||||
static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
ompi_btl_openib_connect_base_module_t **cpc)
|
||||
{
|
||||
int rc;
|
||||
@ -986,7 +986,7 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
ensure to only post it *once*, because another btl may have
|
||||
come in before this and already posted it. */
|
||||
if (!rml_recv_posted) {
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
|
||||
OMPI_RML_TAG_XOPENIB,
|
||||
ORTE_RML_PERSISTENT,
|
||||
xoob_rml_recv_cb,
|
||||
@ -999,7 +999,7 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
}
|
||||
rml_recv_posted = true;
|
||||
}
|
||||
|
||||
|
||||
(*cpc)->data.cbm_component = &ompi_btl_openib_connect_xoob;
|
||||
(*cpc)->data.cbm_priority = xoob_priority;
|
||||
(*cpc)->data.cbm_modex_message = NULL;
|
||||
|
@ -3,9 +3,9 @@
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
|
@ -2,9 +2,9 @@
|
||||
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
@ -30,26 +30,26 @@
|
||||
*
|
||||
* - a BTL module represents a network port (in the case of the openib
|
||||
* BTL, a LID)
|
||||
* - a CPC module represents one way to make connections to a BTL module
|
||||
* - a CPC module represents one way to make connections to a BTL module
|
||||
* - hence, a BTL module has potentially multiple CPC modules
|
||||
* associated with it
|
||||
* - an endpoint represents a connection between a local BTL module and
|
||||
* a remote BTL module (in the openib BTL, because of BSRQ, an
|
||||
* endpoint can contain multiple QPs)
|
||||
* - an endpoint represents a connection between a local BTL module and
|
||||
* a remote BTL module (in the openib BTL, because of BSRQ, an
|
||||
* endpoint can contain multiple QPs)
|
||||
* - when an endpoint is created, one of the CPC modules associated
|
||||
* with the local BTL is selected and associated with the endpoint
|
||||
* with the local BTL is selected and associated with the endpoint
|
||||
* (obviously, it is a CPC module that is common between the local
|
||||
* and remote BTL modules)
|
||||
* - endpoints may be created and destroyed during the MPI job
|
||||
* - endpoints are created lazily, during the first communication
|
||||
* between two peers
|
||||
* - endpoints are destroyed when two MPI processes become
|
||||
* disconnected (e.g., MPI-2 dynamics or MPI_FINALIZE)
|
||||
* - hence, BTL modules and CPC modules outlive endpoints.
|
||||
* Specifically, BTL modules and CPC modules live from MPI_INIT to
|
||||
* MPI_FINALIZE. endpoints come and go as MPI semantics demand it.
|
||||
* - therefore, CPC modules need to cache information on endpoints that
|
||||
* are specific to that connection.
|
||||
* and remote BTL modules)
|
||||
* - endpoints may be created and destroyed during the MPI job
|
||||
* - endpoints are created lazily, during the first communication
|
||||
* between two peers
|
||||
* - endpoints are destroyed when two MPI processes become
|
||||
* disconnected (e.g., MPI-2 dynamics or MPI_FINALIZE)
|
||||
* - hence, BTL modules and CPC modules outlive endpoints.
|
||||
* Specifically, BTL modules and CPC modules live from MPI_INIT to
|
||||
* MPI_FINALIZE. endpoints come and go as MPI semantics demand it.
|
||||
* - therefore, CPC modules need to cache information on endpoints that
|
||||
* are specific to that connection.
|
||||
*
|
||||
* Component interface:
|
||||
*
|
||||
@ -57,7 +57,7 @@
|
||||
* calls the connect_base_register() function, which scans all
|
||||
* compiled-in CPC's. If they have component_register() functions,
|
||||
* they are called (component_register() functions are only allowed to
|
||||
* register MCA parameters).
|
||||
* register MCA parameters).
|
||||
*
|
||||
* NOTE: The connect_base_register() function will process the
|
||||
* btl_openib_cpc_include and btl_openib_cpc_exclude MCA parameters
|
||||
@ -230,7 +230,7 @@ typedef int (*ompi_btl_openib_connect_base_component_init_fn_t)(void);
|
||||
* - Other OMPI_ERR_* code: an error occurred.
|
||||
*/
|
||||
typedef int (*ompi_btl_openib_connect_base_func_component_query_t)
|
||||
(struct mca_btl_openib_module_t *btl,
|
||||
(struct mca_btl_openib_module_t *btl,
|
||||
struct ompi_btl_openib_connect_base_module_t **cpc);
|
||||
|
||||
/**
|
||||
|
@ -6,7 +6,7 @@
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
@ -14,9 +14,9 @@
|
||||
# Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||
# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI's OpenFabrics support
|
||||
@ -151,9 +151,9 @@ sent.
|
||||
|
||||
This error usually means one of two things:
|
||||
|
||||
1. There is something awry within the network fabric itself.
|
||||
1. There is something awry within the network fabric itself.
|
||||
2. A bug in Open MPI has caused flow control to malfunction.
|
||||
|
||||
|
||||
#1 is usually more likely. You should note the hosts on which this
|
||||
error has occurred; it has been observed that rebooting or removing a
|
||||
particular host from the job can sometimes resolve this issue.
|
||||
@ -200,7 +200,7 @@ exceeded. "Retry count" is defined in the InfiniBand spec 1.2
|
||||
This error typically means that there is something awry within the
|
||||
InfiniBand fabric itself. You should note the hosts on which this
|
||||
error has occurred; it has been observed that rebooting or removing a
|
||||
particular host from the job can sometimes resolve this issue.
|
||||
particular host from the job can sometimes resolve this issue.
|
||||
|
||||
Two MCA parameters can be used to control Open MPI's behavior with
|
||||
respect to the retry count:
|
||||
@ -280,7 +280,7 @@ Deactivating the OpenFabrics BTL.
|
||||
Wrong buffer alignment %d configured on host '%s'. Should be bigger
|
||||
than zero and power of two. Use default %d instead.
|
||||
#
|
||||
[of error event]
|
||||
[of error event]
|
||||
The OpenFabrics stack has reported a network error event. Open MPI
|
||||
will try to continue, but your job may end up failing.
|
||||
|
||||
@ -591,7 +591,7 @@ conflict:
|
||||
WARNING: The openib BTL was directed to use "eager RDMA" for short
|
||||
messages, but the openib BTL was compiled with progress threads
|
||||
support. Short eager RDMA is not yet supported with progress threads;
|
||||
its use has been disabled in this job.
|
||||
its use has been disabled in this job.
|
||||
|
||||
This is a warning only; your job will attempt to continue.
|
||||
#
|
||||
@ -644,7 +644,7 @@ be able to run successfully.
|
||||
Local host: %s
|
||||
Local adapter: %s (vendor 0x%x, part ID %d)
|
||||
Local queues: %s
|
||||
|
||||
|
||||
Remote host: %s
|
||||
Remote adapter: (vendor 0x%x, part ID %d)
|
||||
Remote queues: %s
|
||||
@ -656,7 +656,7 @@ Such mixed network trasport configuration is not supported by Open MPI.
|
||||
Local host: %s
|
||||
Local adapter: %s (vendor 0x%x, part ID %d)
|
||||
Local transport type: %s
|
||||
|
||||
|
||||
Remote host: %s
|
||||
Remote Adapter: (vendor 0x%x, part ID %d)
|
||||
Remote transport type: %s
|
||||
|
@ -2,7 +2,7 @@
|
||||
# Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user