
This commit brings in two major things:

1. Galen's fine-grain control of queue pair resources in the openib
   BTL.
1. Pasha's new implementation of asynchronous HCA event handling.

Pasha's new implementation doesn't take much explanation, but the new
"multifrag" stuff does.  

Note that "svn merge" was not used to bring this new code from the
/tmp/ib_multifrag branch -- something Bad happened in the periodic
trunk pulls on that branch making an actual merge back to the trunk
effectively impossible (i.e., lots and lots of arbitrary conflicts and
artificial changes).  :-(

== Fine-grain control of queue pair resources ==

This commit adds Galen's fine-grain control of queue pair resources
to the OpenIB BTL (thanks to Gleb for fixing broken code and
providing additional functionality, to Pasha for finding broken code,
and to Jeff for doing all the svn work and regression testing).

Prior to this commit, the OpenIB BTL created two queue pairs: one for
eager size fragments and one for max send size fragments.  When the
use of the shared receive queue (SRQ) was specified (via "-mca
btl_openib_use_srq 1"), these QPs would use a shared receive queue for
receive buffers instead of the default per-peer (PP) receive queues
and buffers.  One consequence of this design is that receive buffer
utilization (the size of the data received as a percentage of the
receive buffer used for the data) was quite poor for a number of
applications; for example, a 128 byte message received into a 64 KB
buffer uses less than 0.2% of it.

The new design allows multiple QPs to be specified at runtime.  Each
QP can be set up to use PP or SRQ receive buffers and gives
fine-grained control over the receive buffer size, the number of
receive buffers to post, and when to replenish the receive queue (the
low water mark); for SRQ QPs, the number of outstanding sends can
also be specified.  The following is an example of the syntax used to
describe QPs to the OpenIB BTL via the new MCA parameter
btl_openib_receive_queues:

{{{
-mca btl_openib_receive_queues \
     "P,128,16,4;S,1024,256,128,32;S,4096,256,128,32;S,65536,256,128,32"
}}}

Each QP description is delimited by ";" (semicolon) with individual
fields of the QP description delimited by "," (comma).  The above
example therefore describes 4 QPs.
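
For illustration only -- this is not the actual parser in the openib
BTL's MCA parameter code -- such a specification string can be split
with two nested strtok_r() passes, first on ";" and then on ",":

{{{
/* Illustrative sketch only: split a receive_queues specification into
 * QP descriptions (";") and then into fields (",").  Not the real
 * btl_openib_mca.c implementation. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void parse_receive_queues(const char *spec)
{
    char *copy = strdup(spec), *qp_save = NULL;
    for (char *qp = strtok_r(copy, ";", &qp_save); NULL != qp;
         qp = strtok_r(NULL, ";", &qp_save)) {
        char *field_save = NULL;
        char *type = strtok_r(qp, ",", &field_save);   /* "P" or "S" */
        printf("QP type %s:", type);
        for (char *f = strtok_r(NULL, ",", &field_save); NULL != f;
             f = strtok_r(NULL, ",", &field_save)) {
            printf(" %ld", strtol(f, NULL, 10));
        }
        printf("\n");
    }
    free(copy);
}
}}}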

The first QP is:

    P,128,16,4

Meaning: per-peer receive buffer QPs are indicated by a starting field
of "P"; the first QP (shown above) is therefore a per-peer based QP.
The second field indicates the size of the receive buffer in bytes
(128 bytes).  The third field indicates the number of receive buffers
to allocate to the QP (16).  The fourth field indicates the low
watermark for receive buffers at which time the BTL will repost
receive buffers to the QP (4).
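
The four per-peer fields map naturally onto a small parameter
structure; the sketch below is for illustration (it mirrors the
mca_btl_openib_qp_info_t structure added later in this commit, but
the names and layout here are simplified, not the BTL's actual ones):

{{{
/* Sketch of how "P,128,16,4" maps onto QP parameters (illustrative). */
struct qp_params {
    char   type;    /* 'P' = per-peer, 'S' = shared receive queue */
    size_t size;    /* receive buffer size in bytes         (128) */
    int    rd_num;  /* number of receive buffers to post     (16) */
    int    rd_low;  /* low water mark for reposting           (4) */
};

static const struct qp_params example_pp_qp = { 'P', 128, 16, 4 };
}}}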

The second QP is:

    S,1024,256,128,32

Shared receive queue based QPs are indicated by a starting field of
"S"; the second QP (shown above) is therefore a shared receive queue
based QP.  The second, third and fourth fields are the same as in the
per-peer based QP.  The fifth field is the number of outstanding sends
that are allowed at a given time on the QP (32).  This provides a
"good enough" mechanism of flow control for some regular communication
patterns.
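
A minimal sketch of that flow-control idea -- a down-counter of send
credits, analogous to the per-QP sd_wqe/sd_max accounting visible in
the diff below -- assuming hypothetical queue_pending_frag() and
post_send() helpers:

{{{
/* Illustrative only: cap the number of outstanding sends on an SRQ QP
 * with a simple credit down-counter.  The real BTL does this with
 * OPAL_THREAD_ADD32 on per-QP counters and pending-fragment lists. */
static int try_send_on_srq_qp(int *sd_credits, void *frag)
{
    if (--(*sd_credits) < 0) {     /* no send slot available        */
        ++(*sd_credits);           /* give the credit back          */
        queue_pending_frag(frag);  /* hypothetical: retry later     */
        return 0;                  /* send deferred                 */
    }
    post_send(frag);               /* hypothetical: post to the QP  */
    return 1;                      /* send posted                   */
}
}}}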

QPs MUST be specified in ascending receive buffer size order.  This
requirement may be removed prior to the 1.3 release.
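
The ascending-size requirement is presumably what lets fragment
allocation pick a QP with a simple first-fit scan (compare the new
MCA_BTL_IB_FRAG_ALLOC_BY_SIZE usage in the diff below); an
illustrative sketch, reusing the qp_params structure from above:

{{{
/* Illustrative first-fit QP selection: because QPs are listed in
 * ascending receive buffer size, the first QP whose buffer is large
 * enough is also the tightest fit. */
static int pick_qp(const struct qp_params *qps, int num_qps, size_t frag_size)
{
    int i;
    for (i = 0; i < num_qps; i++) {
        if (qps[i].size >= frag_size) {
            return i;          /* smallest QP that can hold the frag */
        }
    }
    return -1;                 /* larger than any QP's buffers       */
}
}}}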

This commit was SVN r15474.
Jeff Squyres 2007-07-18 01:15:59 +00:00
parent e3ad495e7b
commit 8ace07efed
21 changed files with 2362 additions and 1465 deletions


@ -54,6 +54,13 @@ AC_DEFUN([OMPI_CHECK_OPENIB],[
AC_MSG_WARN([Not building component.])
ompi_check_openib_happy="no"])])
AS_IF([test "$ompi_check_openib_happy" = "yes"],
[AC_CHECK_HEADERS(
fcntl.h sys/poll.h,
[],
[AC_MSG_WARN([fcntl.h sys/poll.h not found. Can not build component.])
ompi_check_openib_happy="no"])])
AS_IF([test "$ompi_check_openib_happy" = "yes"],
[OMPI_CHECK_PACKAGE([$1],
[infiniband/verbs.h],


@ -87,6 +87,10 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl)
OBJ_DESTRUCT(&fl->fl_allocations);
OBJ_DESTRUCT(&fl->fl_condition);
OBJ_DESTRUCT(&fl->fl_lock);
if(fl->ctx) {
free(fl->ctx);
}
}
int ompi_free_list_init_ex(
@ -97,7 +101,9 @@ int ompi_free_list_init_ex(
int num_elements_to_alloc,
int max_elements_to_alloc,
int num_elements_per_alloc,
mca_mpool_base_module_t* mpool)
mca_mpool_base_module_t* mpool,
ompi_free_list_item_init_fn_t item_init,
void* ctx)
{
/* alignment must be more than zero and power of two */
if(alignment <= 1 || (alignment & (alignment - 1)))
@ -112,6 +118,8 @@ int ompi_free_list_init_ex(
flist->fl_num_per_alloc = num_elements_per_alloc;
flist->fl_mpool = mpool;
flist->fl_alignment = alignment;
flist->item_init = item_init;
flist->ctx = ctx;
if(num_elements_to_alloc)
return ompi_free_list_grow(flist, num_elements_to_alloc);
return OMPI_SUCCESS;
@ -176,12 +184,17 @@ int ompi_free_list_grow(ompi_free_list_t* flist, size_t num_elements)
item->ptr = mpool_alloc_ptr;
OBJ_CONSTRUCT_INTERNAL(item, flist->fl_elem_class);
/* run the initialize function if present */
if(flist->item_init) {
flist->item_init(item, flist->ctx);
}
opal_atomic_lifo_push(&(flist->super), &(item->super));
ptr += head_size;
mpool_alloc_ptr += elem_size;
}
flist->fl_num_allocated += num_elements;
return OMPI_SUCCESS;
}


@ -31,6 +31,11 @@ extern "C" {
#endif
struct mca_mem_pool_t;
struct ompi_free_list_item_t;
typedef void (*ompi_free_list_item_init_fn_t) (
struct ompi_free_list_item_t*,
void* ctx);
struct ompi_free_list_t
{
@ -46,6 +51,8 @@ struct ompi_free_list_t
opal_mutex_t fl_lock;
opal_condition_t fl_condition;
opal_list_t fl_allocations;
ompi_free_list_item_init_fn_t item_init;
void* ctx;
};
typedef struct ompi_free_list_t ompi_free_list_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_free_list_t);
@ -61,6 +68,7 @@ typedef struct ompi_free_list_item_t ompi_free_list_item_t;
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_free_list_item_t);
/**
* Initialize a free list.
*
@ -81,7 +89,10 @@ OMPI_DECLSPEC int ompi_free_list_init_ex(
int num_elements_to_alloc,
int max_elements_to_alloc,
int num_elements_per_alloc,
struct mca_mpool_base_module_t*);
struct mca_mpool_base_module_t*,
ompi_free_list_item_init_fn_t item_init,
void *ctx
);
static inline int ompi_free_list_init(
ompi_free_list_t *free_list,
@ -94,7 +105,7 @@ static inline int ompi_free_list_init(
{
return ompi_free_list_init_ex(free_list, element_size, CACHE_LINE_SIZE,
element_class, num_elements_to_alloc, max_elements_to_alloc,
num_elements_per_alloc, mpool);
num_elements_per_alloc, mpool, NULL, NULL);
}


@ -376,14 +376,15 @@ static inline void mca_bml_base_prepare_dst(mca_bml_base_btl_t* bml_btl,
} \
} while(0)
#else
#define MCA_BML_BASE_BTL_DES_ALLOC(bml_btl, des, order, \
#define MCA_BML_BASE_BTL_DES_ALLOC(bml_btl, des, _order, \
alloc_size, seg_size) \
do { \
if( MCA_BTL_NO_ORDER == order && \
if( MCA_BTL_NO_ORDER == _order && \
NULL != (des = bml_btl->btl_cache) ) { \
bml_btl->btl_cache = NULL; \
des->order = MCA_BTL_NO_ORDER; \
} else { \
des = bml_btl->btl_alloc(bml_btl->btl, order, alloc_size); \
des = bml_btl->btl_alloc(bml_btl->btl, _order, alloc_size); \
} \
if( OPAL_LIKELY(des != NULL) ) { \
des->des_src->seg_len = seg_size; \


@ -46,7 +46,9 @@ sources = \
btl_openib_mca.c \
btl_openib_mca.h \
btl_openib_ini.c \
btl_openib_ini.h
btl_openib_ini.h \
btl_openib_async.c \
btl_openib_async.h
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la


@ -140,7 +140,9 @@ int mca_btl_openib_add_procs(
int rem_subnet_id_port_cnt;
int lcl_subnet_id_port_cnt = 0;
int btl_rank = 0;
mca_btl_base_endpoint_t* endpoint;
int ep_index;
for(j=0; j < mca_btl_openib_component.ib_num_btls; j++){
if(mca_btl_openib_component.openib_btls[j]->port_info.subnet_id
== openib_btl->port_info.subnet_id) {
@ -154,7 +156,7 @@ int mca_btl_openib_add_procs(
for(i = 0; i < (int) nprocs; i++) {
struct ompi_proc_t* ompi_proc = ompi_procs[i];
mca_btl_openib_proc_t* ib_proc;
mca_btl_base_endpoint_t* endpoint;
/* mca_btl_base_endpoint_t* endpoint; */
if(NULL == (ib_proc = mca_btl_openib_proc_create(ompi_proc))) {
return OMPI_ERR_OUT_OF_RESOURCE;
@ -197,6 +199,7 @@ int mca_btl_openib_add_procs(
* Cache the peer instance on the btl_proc.
*/
endpoint = OBJ_NEW(mca_btl_openib_endpoint_t);
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
if(NULL == endpoint) {
OPAL_THREAD_UNLOCK(&ib_proc->proc_lock);
return OMPI_ERR_OUT_OF_RESOURCE;
@ -220,75 +223,84 @@ int mca_btl_openib_add_procs(
peers[i] = endpoint;
}
for(ep_index=0;
ep_index < orte_pointer_array_get_size(openib_btl->endpoints);
ep_index++) {
endpoint=orte_pointer_array_get_item(openib_btl->endpoints,ep_index);
if(0xffffffff == (uint64_t) endpoint) {
opal_output(0, "WTF?\n");
abort();
}
}
return mca_btl_openib_size_queues(openib_btl, nprocs);
}
static int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size_t nprocs)
{
int min_cq_size;
int min_hp_cq_size = 0, min_lp_cq_size = 0;
int first_time = (0 == openib_btl->num_peers);
int rc;
openib_btl->num_peers += nprocs;
if(mca_btl_openib_component.use_srq) {
openib_btl->rd_num = mca_btl_openib_component.rd_num + log2(nprocs) * mca_btl_openib_component.srq_rd_per_peer;
if(openib_btl->rd_num > mca_btl_openib_component.srq_rd_max)
openib_btl->rd_num = mca_btl_openib_component.srq_rd_max;
openib_btl->rd_low = openib_btl->rd_num - 1;
min_cq_size = openib_btl->rd_num * 2 * openib_btl->num_peers;
if(!first_time) {
struct ibv_srq_attr srq_attr;
srq_attr.max_wr = openib_btl->rd_num;
rc = ibv_modify_srq(openib_btl->srq[BTL_OPENIB_HP_QP],
&srq_attr, IBV_SRQ_MAX_WR);
if(rc) {
BTL_ERROR(("cannot resize high priority shared receive queue, error: %d", rc));
return OMPI_ERROR;
}
rc = ibv_modify_srq(openib_btl->srq[BTL_OPENIB_LP_QP],
&srq_attr, IBV_SRQ_MAX_WR);
if(rc) {
BTL_ERROR(("cannot resize low priority shared receive queue, error: %d", rc));
return OMPI_ERROR;
}
}
} else
{
min_cq_size = ( mca_btl_openib_component.rd_num > (int32_t) mca_btl_openib_component.eager_rdma_num ?
mca_btl_openib_component.rd_num : (int32_t) mca_btl_openib_component.eager_rdma_num ) *
2 * openib_btl->num_peers;
}
if(min_cq_size > (int32_t) mca_btl_openib_component.ib_cq_size) {
mca_btl_openib_component.ib_cq_size = min_cq_size > openib_btl->hca->ib_dev_attr.max_cq ?
openib_btl->hca->ib_dev_attr.max_cq : min_cq_size;
#ifdef HAVE_IBV_RESIZE_CQ
if(!first_time) {
rc = ibv_resize_cq(openib_btl->ib_cq[BTL_OPENIB_LP_QP], mca_btl_openib_component.ib_cq_size);
if(rc) {
BTL_ERROR(("cannot resize low priority completion queue, error: %d", rc));
return OMPI_ERROR;
}
rc = ibv_resize_cq(openib_btl->ib_cq[BTL_OPENIB_HP_QP],
mca_btl_openib_component.ib_cq_size);
if(rc) {
BTL_ERROR(("cannot resize high priority completion queue, error: %d", rc));
return OMPI_ERROR;
int qp;
openib_btl->num_peers += nprocs;
/* figure out reasonable sizes for completion queues */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if(MCA_BTL_OPENIB_SRQ_QP == mca_btl_openib_component.qp_infos[qp].type) {
if(mca_btl_openib_component.qp_infos[qp].size <=
mca_btl_openib_component.eager_limit){
min_hp_cq_size += mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
} else {
min_lp_cq_size += mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
}
}
else {
if(mca_btl_openib_component.qp_infos[qp].size <=
mca_btl_openib_component.eager_limit){
min_hp_cq_size += mca_btl_openib_component.qp_infos[qp].rd_num
* 2 * openib_btl->num_peers;
} else {
min_lp_cq_size += mca_btl_openib_component.qp_infos[qp].rd_num
* 2 * openib_btl->num_peers;
}
}
#endif
}
/* make sure we don't exceed the maximum CQ size and that we
* don't size the queue smaller than otherwise requested
*/
if(min_lp_cq_size > (int32_t) mca_btl_openib_component.ib_lp_cq_size) {
mca_btl_openib_component.ib_lp_cq_size =
min_lp_cq_size > openib_btl->hca->ib_dev_attr.max_cq ?
openib_btl->hca->ib_dev_attr.max_cq : min_lp_cq_size;
}
if(min_hp_cq_size > (int32_t) mca_btl_openib_component.ib_hp_cq_size) {
mca_btl_openib_component.ib_hp_cq_size =
min_hp_cq_size > openib_btl->hca->ib_dev_attr.max_cq ?
openib_btl->hca->ib_dev_attr.max_cq : min_hp_cq_size;
}
#ifdef HAVE_IBV_RESIZE_CQ
if(!first_time) {
rc = ibv_resize_cq(openib_btl->ib_cq[BTL_OPENIB_LP_CQ],
mca_btl_openib_component.ib_lp_cq_size);
if(rc) {
BTL_ERROR(("cannot resize low priority completion queue, error: %d", rc));
return OMPI_ERROR;
}
rc = ibv_resize_cq(openib_btl->ib_cq[BTL_OPENIB_HP_CQ],
mca_btl_openib_component.ib_hp_cq_size);
if(rc) {
BTL_ERROR(("cannot resize high priority completion queue, error: %d", rc));
return OMPI_ERROR;
}
}
#endif
if(first_time) {
/* never been here before, setup cq and srq */
mca_btl_openib_component.ib_cq_size = (int) mca_btl_openib_component.ib_cq_size >
openib_btl->hca->ib_dev_attr.max_cq ?
openib_btl->hca->ib_dev_attr.max_cq :
(int) mca_btl_openib_component.ib_cq_size;
return mca_btl_openib_create_cq_srq(openib_btl);
}
return OMPI_SUCCESS;
@ -303,24 +315,44 @@ int mca_btl_openib_del_procs(struct mca_btl_base_module_t* btl,
{
int i,ep_index;
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl;
mca_btl_openib_endpoint_t* endpoint;
/* opal_output(0, "del_procs called!\n"); */
for (i=0 ; i < (int) nprocs ; i++) {
mca_btl_base_endpoint_t* del_endpoint = peers[i];
for(ep_index=0;
ep_index < orte_pointer_array_get_size(openib_btl->endpoints);
ep_index++) {
mca_btl_openib_endpoint_t* endpoint =
ep_index < orte_pointer_array_get_size(openib_btl->endpoints);
ep_index++) {
endpoint =
orte_pointer_array_get_item(openib_btl->endpoints,ep_index);
if(!endpoint) {
continue;
}
if (endpoint == del_endpoint) {
opal_output(mca_btl_base_output,"in del_procs %d, setting another endpoint to null\n",
ep_index);
orte_pointer_array_set_item(openib_btl->endpoints,ep_index,NULL);
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
OBJ_RELEASE(endpoint);
} else if(0xffffffff == (uint64_t) endpoint) {
opal_output(0, "WTF?");
abort();
}
}
}
for(ep_index=0;
ep_index < orte_pointer_array_get_size(openib_btl->endpoints);
ep_index++) {
endpoint=orte_pointer_array_get_item(openib_btl->endpoints,ep_index);
if(0xffffffff == (uint64_t) endpoint) {
opal_output(0, "WTF?\n");
abort();
}
}
return OMPI_SUCCESS;
}
/*
*Register callback function to support send/recv semantics
*/
@ -333,7 +365,6 @@ int mca_btl_openib_register(
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl;
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
openib_btl->ib_reg[tag].cbfunc = cbfunc;
openib_btl->ib_reg[tag].cbdata = cbdata;
@ -374,30 +405,17 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
mca_btl_openib_module_t* openib_btl;
int rc;
openib_btl = (mca_btl_openib_module_t*) btl;
if(size <= mca_btl_openib_component.eager_limit){
MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
if(order == MCA_BTL_NO_ORDER) {
order = BTL_OPENIB_HP_QP;
}
} else if(size <= mca_btl_openib_component.max_send_size) {
if(order == MCA_BTL_NO_ORDER) {
order = BTL_OPENIB_LP_QP;
} else if(order != BTL_OPENIB_LP_QP) {
return NULL;
}
MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
}
MCA_BTL_IB_FRAG_ALLOC_BY_SIZE(btl, frag, size, rc);
if(NULL == frag)
return NULL;
frag->segment.seg_len =
size <= openib_btl->super.btl_eager_limit ? size : openib_btl->super.btl_eager_limit;
frag->base.order = order;
frag->base.des_flags = 0;
/* GMS is this necessary anymore ? */
frag->segment.seg_len = size;
frag->base.order = order;
frag->base.des_flags = 0;
assert(frag->qp_idx <= order);
return (mca_btl_base_descriptor_t*)frag;
}
@ -412,15 +430,17 @@ int mca_btl_openib_free(
mca_btl_base_descriptor_t* des)
{
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)des;
if(((MCA_BTL_OPENIB_SEND_FRAG_FRAG == frag->type) ||
(MCA_BTL_OPENIB_RECV_FRAG_FRAG == frag->type))
&& frag->registration != NULL) {
/* is this fragment pointing at user memory? */
if(((MCA_BTL_OPENIB_FRAG_SEND_USER == frag->type) ||
(MCA_BTL_OPENIB_FRAG_RECV_USER == frag->type))
&& frag->registration != NULL) {
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
(mca_mpool_base_registration_t*)
frag->registration);
(mca_mpool_base_registration_t*)
frag->registration);
frag->registration = NULL;
}
MCA_BTL_IB_FRAG_RETURN(((mca_btl_openib_module_t*) btl), frag);
return OMPI_SUCCESS;
@ -471,17 +491,21 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
openib_btl = (mca_btl_openib_module_t*)btl;
if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) {
/* GMS bloody HACK! */
if(registration != NULL || max_data > btl->btl_max_send_size) {
MCA_BTL_IB_FRAG_ALLOC_SEND_FRAG(btl, frag, rc);
opal_output(mca_btl_base_output, "prepare_src called on endpoint %p\n",
(void*) endpoint);
MCA_BTL_IB_FRAG_ALLOC_SEND_USER(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = NULL;
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
*size = max_data;
if(NULL == registration) {
@ -496,6 +520,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
}
openib_reg = (mca_btl_openib_reg_t*)registration;
frag->base.order = order;
frag->base.des_flags = 0;
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
@ -511,11 +536,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
frag->segment.seg_addr.pval = iov.iov_base;
frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
if(MCA_BTL_NO_ORDER == order) {
frag->base.order = BTL_OPENIB_LP_QP;
} else {
frag->base.order = order;
}
assert(MCA_BTL_NO_ORDER == order);
BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
"frag->segment.seg_key.key32[0] = %lu",
@ -525,54 +546,32 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
return &frag->base;
}
}
if(max_data + reserve <= btl->btl_eager_limit) {
/* the data is small enough to fit in the eager frag and
* memory is not prepinned */
MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc);
if(MCA_BTL_NO_ORDER == order) {
order = BTL_OPENIB_LP_QP;
}
}
if(NULL == frag) {
/* the data doesn't fit into eager frag or eager frag is
* not available */
if(MCA_BTL_NO_ORDER == order) {
order = BTL_OPENIB_LP_QP;
} else if(BTL_OPENIB_HP_QP == order){
return NULL;
}
MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
if(max_data + reserve > btl->btl_max_send_size) {
max_data = btl->btl_max_send_size - reserve;
}
assert(MCA_BTL_NO_ORDER == order);
if(max_data + reserve > btl->btl_max_send_size) {
max_data = btl->btl_max_send_size - reserve;
}
MCA_BTL_IB_FRAG_ALLOC_BY_SIZE(btl, frag, max_data + reserve, rc);
if(NULL == frag)
return NULL;
iov.iov_len = max_data;
iov.iov_base = (unsigned char*)frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data);
if(rc < 0) {
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
return NULL;
}
*size = max_data;
frag->segment.seg_len = max_data + reserve;
frag->segment.seg_key.key32[0] = (uint32_t)frag->sg_entry.lkey;
frag->base.order = order;
/* frag->base.order = order; */
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0;
frag->base.order = order;
return &frag->base;
}
@ -605,9 +604,12 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
int rc;
openib_btl = (mca_btl_openib_module_t*)btl;
MCA_BTL_IB_FRAG_ALLOC_RECV_FRAG(btl, frag, rc);
opal_output(mca_btl_base_output, "prepare_dst called on endpoint %p\n",
(void*) endpoint);
MCA_BTL_IB_FRAG_ALLOC_RECV_USER(btl, frag, rc);
if(NULL == frag) {
abort();
return NULL;
}
@ -621,6 +623,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
frag->segment.seg_addr.pval, *size, 0, &registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
abort();
return NULL;
}
/* keep track of the registration we did */
@ -635,19 +638,13 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
frag->segment.seg_len = *size;
frag->segment.seg_key.key32[0] = openib_reg->mr->rkey;
frag->base.order = order;
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->base.des_flags = 0;
if(MCA_BTL_NO_ORDER == order) {
frag->base.order = BTL_OPENIB_LP_QP;
} else {
frag->base.order = order;
}
BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
"frag->segment.seg_key.key32[0] = %lu",
frag->sg_entry.lkey, frag->sg_entry.addr,
@ -658,6 +655,8 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
static int mca_btl_finalize_hca(struct mca_btl_openib_hca_t *hca)
{
#if OMPI_HAVE_THREADS
int hca_to_remove;
#if OMPI_ENABLE_PROGRESS_THREADS == 1
if(hca->progress) {
hca->progress = false;
@ -671,27 +670,37 @@ static int mca_btl_finalize_hca(struct mca_btl_openib_hca_t *hca)
return OMPI_ERROR;
}
#endif
#if OMPI_HAVE_POSIX_THREADS
if (pthread_cancel(hca->async_thread)) {
BTL_ERROR(("Failed to cancel OpenIB async thread"));
/* signaling to async_tread to stop poll for this hca */
if (mca_btl_openib_component.use_async_event_thread) {
hca_to_remove=-(hca->ib_dev_context->async_fd);
if (write(mca_btl_openib_component.async_pipe[1],
&hca_to_remove,sizeof(int))<0){
BTL_ERROR(("Failed to write to pipe"));
return OMPI_ERROR;
}
}
pthread_join(hca->async_thread, NULL);
#endif
if (OMPI_SUCCESS != mca_mpool_base_module_destroy(hca->mpool)) {
BTL_VERBOSE(("Failed to release mpool"));
return OMPI_ERROR;
}
if (ibv_dealloc_pd(hca->ib_pd)) {
BTL_VERBOSE(("Failed to release PD"));
return OMPI_ERROR;
if (ompi_mpi_leave_pinned || ompi_mpi_leave_pinned_pipeline) {
BTL_VERBOSE(("Warning! Failed to release PD"));
return OMPI_SUCCESS;
} else {
BTL_ERROR(("Error! Failed to release PD"));
return OMPI_ERROR;
}
}
if (ibv_close_device(hca->ib_dev_context)) {
if (ompi_mpi_leave_pinned || ompi_mpi_leave_pinned_pipeline) {
BTL_VERBOSE(("Warrning! Failed to close HCA"));
BTL_VERBOSE(("Warning! Failed to close HCA"));
return OMPI_SUCCESS;
} else {
BTL_ERROR(("Error! Failed to close HCA"));
return OMPI_ERROR;
}
return OMPI_ERROR;
}
OBJ_DESTRUCT(&hca->hca_lock);
free(hca);
@ -703,7 +712,10 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
mca_btl_openib_module_t* openib_btl;
mca_btl_openib_endpoint_t* endpoint;
int ep_index, rdma_index, i;
int qp;
/* return OMPI_SUCCESS; */
openib_btl = (mca_btl_openib_module_t*) btl;
/* Remove the btl from component list */
@ -712,7 +724,7 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
if (mca_btl_openib_component.openib_btls[i] == openib_btl){
mca_btl_openib_component.openib_btls[i] =
mca_btl_openib_component.openib_btls[mca_btl_openib_component.ib_num_btls-1];
break;
break;
}
}
}
@ -721,7 +733,7 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
/* Release eager RDMAs */
for(rdma_index=0;
rdma_index < orte_pointer_array_get_size(openib_btl->eager_rdma_buffers);
rdma_index++) {
rdma_index++) {
endpoint=orte_pointer_array_get_item(openib_btl->eager_rdma_buffers,rdma_index);
if(!endpoint) {
continue;
@ -734,44 +746,52 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
ep_index++) {
endpoint=orte_pointer_array_get_item(openib_btl->endpoints,ep_index);
if(!endpoint) {
BTL_VERBOSE(("In finalize, got another null endpoint\n"));
continue;
} else if(0xffffffff == (uint64_t) endpoint) {
opal_output(0,"Got brocken pointer to endpoint. Internal error");
abort();
}
OBJ_RELEASE(endpoint);
}
/* Release SRQ */
if(mca_btl_openib_component.use_srq) {
if (ibv_destroy_srq(openib_btl->srq[BTL_OPENIB_HP_QP])) {
BTL_VERBOSE(("Failed to close HP SRQ"));
return OMPI_ERROR;
}
if (ibv_destroy_srq(openib_btl->srq[BTL_OPENIB_LP_QP])) {
BTL_VERBOSE(("Failed to close LP SRQ"));
return OMPI_ERROR;
/* Release SRQ resources */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if(MCA_BTL_OPENIB_SRQ_QP == mca_btl_openib_component.qp_infos[qp].type){
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(openib_btl,
&openib_btl->qps[qp].u.srq_qp.pending_frags);
if (ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)){
BTL_VERBOSE(("Failed to close SRQ %d", qp));
return OMPI_ERROR;
}
/* Destroy free lists */
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags);
OBJ_DESTRUCT(&openib_btl->qps[qp].send_free);
OBJ_DESTRUCT(&openib_btl->qps[qp].recv_free);
} else {
/* Destroy free lists */
OBJ_DESTRUCT(&openib_btl->qps[qp].send_free);
OBJ_DESTRUCT(&openib_btl->qps[qp].recv_free);
}
}
OBJ_DESTRUCT(&openib_btl->send_free_control);
OBJ_DESTRUCT(&openib_btl->send_user_free);
OBJ_DESTRUCT(&openib_btl->recv_user_free);
/* Release CQs */
if (ibv_destroy_cq(openib_btl->ib_cq[BTL_OPENIB_HP_QP])) {
BTL_VERBOSE(("Failed to close HP CQ %p",openib_btl->ib_cq[BTL_OPENIB_HP_QP]));
if (ibv_destroy_cq(openib_btl->ib_cq[BTL_OPENIB_HP_CQ])) {
BTL_VERBOSE(("Failed to close HP CQ"));
return OMPI_ERROR;
}
if (ibv_destroy_cq(openib_btl->ib_cq[BTL_OPENIB_LP_QP])) {
if (ibv_destroy_cq(openib_btl->ib_cq[BTL_OPENIB_LP_CQ])) {
BTL_VERBOSE(("Failed to close LP CQ"));
return OMPI_ERROR;
}
/* Release pending lists */
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(openib_btl,
&openib_btl->pending_frags[BTL_OPENIB_HP_QP]);
OBJ_DESTRUCT(&openib_btl->pending_frags[BTL_OPENIB_HP_QP]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(openib_btl,
&openib_btl->pending_frags[BTL_OPENIB_LP_QP]);
OBJ_DESTRUCT(&openib_btl->pending_frags[BTL_OPENIB_LP_QP]);
/* Destroy free lists */
OBJ_DESTRUCT(&openib_btl->send_free[BTL_OPENIB_HP_QP]);
OBJ_DESTRUCT(&openib_btl->send_free[BTL_OPENIB_LP_QP]);
OBJ_DESTRUCT(&openib_btl->send_free_frag);
OBJ_DESTRUCT(&openib_btl->send_free_control);
OBJ_DESTRUCT(&openib_btl->recv_free[BTL_OPENIB_HP_QP]);
OBJ_DESTRUCT(&openib_btl->recv_free[BTL_OPENIB_LP_QP]);
if (!(--openib_btl->hca->btls)) {
/* All btls for the HCA were closed
* Now we can close the HCA
@ -781,45 +801,26 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
return OMPI_ERROR;
}
}
#if OMPI_HAVE_THREADS
if (mca_btl_openib_component.use_async_event_thread &&
! mca_btl_openib_component.ib_num_btls) {
/* signaling to async_tread to stop */
int async_command=0;
if (write(mca_btl_openib_component.async_pipe[1],
&async_command,sizeof(int))<0){
BTL_ERROR(("Failed to write to pipe"));
return OMPI_ERROR;
}
if (pthread_join(mca_btl_openib_component.async_thread, NULL)) {
BTL_ERROR(("Failed to stop OpenIB async event thread"));
return OMPI_ERROR;
}
}
#endif
OBJ_DESTRUCT(&openib_btl->ib_lock);
free(openib_btl);
BTL_VERBOSE(("Success to close BTL resources"));
#if 0
if(openib_btl->send_free_eager.fl_num_allocated !=
openib_btl->send_free_eager.super.opal_list_length){
opal_output(0, "btl ib send_free_eager frags: %d allocated %d returned \n",
openib_btl->send_free_eager.fl_num_allocated,
openib_btl->send_free_eager.super.opal_list_length);
}
if(openib_btl->send_free_max.fl_num_allocated !=
openib_btl->send_free_max.super.opal_list_length){
opal_output(0, "btl ib send_free_max frags: %d allocated %d returned \n",
openib_btl->send_free_max.fl_num_allocated,
openib_btl->send_free_max.super.opal_list_length);
}
if(openib_btl->send_free_frag.fl_num_allocated !=
openib_btl->send_free_frag.super.opal_list_length){
opal_output(0, "btl ib send_free_frag frags: %d allocated %d returned \n",
openib_btl->send_free_frag.fl_num_allocated,
openib_btl->send_free_frag.super.opal_list_length);
}
if(openib_btl->recv_free_eager.fl_num_allocated !=
openib_btl->recv_free_eager.super.opal_list_length){
opal_output(0, "btl ib recv_free_eager frags: %d allocated %d returned \n",
openib_btl->recv_free_eager.fl_num_allocated,
openib_btl->recv_free_eager.super.opal_list_length);
}
if(openib_btl->recv_free_max.fl_num_allocated !=
openib_btl->recv_free_max.super.opal_list_length){
opal_output(0, "btl ib recv_free_max frags: %d allocated %d returned \n",
openib_btl->recv_free_max.fl_num_allocated,
openib_btl->recv_free_max.super.opal_list_length);
}
#endif
BTL_VERBOSE(("Success in closing BTL resources"));
return OMPI_SUCCESS;
}
@ -837,6 +838,8 @@ int mca_btl_openib_send(
{
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*)descriptor;
assert(frag->type == MCA_BTL_OPENIB_FRAG_SEND);
frag->endpoint = endpoint;
frag->hdr->tag = tag;
frag->wr_desc.sr_desc.opcode = IBV_WR_SEND;
@ -854,16 +857,21 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
int rc = OMPI_SUCCESS;
struct ibv_send_wr* bad_wr;
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor;
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl;
/* mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; */
int qp = frag->base.order;
if(MCA_BTL_NO_ORDER == qp)
qp = mca_btl_openib_component.rdma_qp;
/* setup for queued requests */
frag->endpoint = endpoint;
frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_WRITE;
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe[BTL_OPENIB_LP_QP],-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe[BTL_OPENIB_LP_QP],1);
if (OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,-1) < 0) {
opal_output(mca_btl_base_output, "can't get sd_wqe for put on qp %d endpoint %p\n",
qp, (void*) endpoint);
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_put_frags, (opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
@ -871,26 +879,28 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl,
/* post descriptor */
} else {
int ib_rc;
opal_output(mca_btl_base_output, "schedule put on qp %d endpoint %p\n",
qp, (void*) endpoint);
frag->wr_desc.sr_desc.send_flags = IBV_SEND_SIGNALED;
frag->wr_desc.sr_desc.wr.rdma.remote_addr = frag->base.des_dst->seg_addr.lval;
frag->wr_desc.sr_desc.wr.rdma.rkey = frag->base.des_dst->seg_key.key32[0];
frag->sg_entry.addr = (unsigned long) frag->base.des_src->seg_addr.pval;
frag->sg_entry.length = frag->base.des_src->seg_len;
if(ibv_post_send(endpoint->lcl_qp[BTL_OPENIB_LP_QP],
&frag->wr_desc.sr_desc,
&bad_wr)){
frag->base.order = qp;
ib_rc = ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->wr_desc.sr_desc, &bad_wr);
if(ib_rc){
opal_output(0, "got error: %d : %s \n", ib_rc, strerror(ib_rc));
abort();
rc = OMPI_ERROR;
}
if(mca_btl_openib_component.use_srq) {
mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_HP_QP);
mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_LP_QP);
} else {
btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_HP_QP);
btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_LP_QP);
}
/* mca_btl_openib_post_srr_all(openib_btl, 1); */
/* mca_btl_openib_endpoint_post_rr_all(endpoint, 1); */
}
return rc;
}
@ -907,14 +917,18 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl,
int rc;
struct ibv_send_wr* bad_wr;
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) descriptor;
mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl;
/* mca_btl_openib_module_t* openib_btl = (mca_btl_openib_module_t*) btl; */
int qp = frag->base.order;
frag->endpoint = endpoint;
frag->wr_desc.sr_desc.opcode = IBV_WR_RDMA_READ;
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->sd_wqe[BTL_OPENIB_LP_QP],-1) < 0) {
if(MCA_BTL_NO_ORDER == qp)
qp = mca_btl_openib_component.rdma_qp;
OPAL_THREAD_ADD32(&endpoint->sd_wqe[BTL_OPENIB_LP_QP],1);
/* check for a send wqe */
if (OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
@ -923,7 +937,7 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl,
/* check for a get token */
} else if(OPAL_THREAD_ADD32(&endpoint->get_tokens,-1) < 0) {
OPAL_THREAD_ADD32(&endpoint->sd_wqe[BTL_OPENIB_LP_QP],1);
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe,1);
OPAL_THREAD_ADD32(&endpoint->get_tokens,1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->pending_get_frags, (opal_list_item_t*)frag);
@ -938,131 +952,105 @@ int mca_btl_openib_get( mca_btl_base_module_t* btl,
frag->sg_entry.addr = (unsigned long) frag->base.des_dst->seg_addr.pval;
frag->sg_entry.length = frag->base.des_dst->seg_len;
if(ibv_post_send(endpoint->lcl_qp[BTL_OPENIB_LP_QP],
&frag->wr_desc.sr_desc,
&bad_wr)){
BTL_ERROR(("error posting send request errno (%d) says %s", errno, strerror(errno)));
frag->base.order = qp;
if(ibv_post_send(endpoint->qps[qp].lcl_qp, &frag->wr_desc.sr_desc, &bad_wr)){
BTL_ERROR(("error posting send request errno (%d) says %s",
errno, strerror(errno)));
rc = ORTE_ERROR;
} else {
rc = ORTE_SUCCESS;
}
if(mca_btl_openib_component.use_srq) {
mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_HP_QP);
mca_btl_openib_post_srr(openib_btl, 1, BTL_OPENIB_LP_QP);
} else {
btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_HP_QP);
btl_openib_endpoint_post_rr(endpoint, 1, BTL_OPENIB_LP_QP);
}
#if 0
mca_btl_openib_post_srr_all(openib_btl, 1);
mca_btl_openib_endpoint_post_rr_all(endpoint, 1);
#endif
}
return rc;
}
static inline struct ibv_cq *ibv_create_cq_compat(struct ibv_context *context,
int cqe, void *cq_context, struct ibv_comp_channel *channel,
int comp_vector)
{
#if OMPI_IBV_CREATE_CQ_ARGS == 3
return ibv_create_cq(context, cqe, channel);
#else
return ibv_create_cq(context, cqe, cq_context, channel, comp_vector);
#endif
}
/*
* create both the high and low priority completion queues
* and the shared receive queue (if requested)
*/
int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
{
/* Allocate Protection Domain */
int qp;
openib_btl->poll_cq = false;
if (mca_btl_openib_component.use_srq) {
/* create the SRQ's */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
struct ibv_srq_init_attr attr;
attr.attr.max_wr = mca_btl_openib_component.srq_rd_max;
attr.attr.max_sge = mca_btl_openib_component.ib_sg_list_size;
openib_btl->srd_posted[BTL_OPENIB_HP_QP] = 0;
openib_btl->srd_posted[BTL_OPENIB_LP_QP] = 0;
openib_btl->srq[BTL_OPENIB_HP_QP] =
ibv_create_srq(openib_btl->hca->ib_pd, &attr);
if (NULL == openib_btl->srq[BTL_OPENIB_HP_QP]) {
show_init_error(__FILE__, __LINE__, "ibv_create_srq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;
if(MCA_BTL_OPENIB_SRQ_QP == openib_btl->qps[qp].type) {
attr.attr.max_wr = mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
attr.attr.max_sge = mca_btl_openib_component.ib_sg_list_size;
openib_btl->qps[qp].u.srq_qp.rd_posted = 0;
openib_btl->qps[qp].u.srq_qp.srq =
ibv_create_srq(openib_btl->hca->ib_pd, &attr);
if (NULL == openib_btl->qps[qp].u.srq_qp.srq) {
abort();
show_init_error(__FILE__, __LINE__, "ibv_create_srq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;
}
}
openib_btl->srq[BTL_OPENIB_LP_QP] =
ibv_create_srq(openib_btl->hca->ib_pd, &attr);
if (NULL == openib_btl->srq[BTL_OPENIB_LP_QP]) {
show_init_error(__FILE__, __LINE__, "ibv_create_srq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;
}
} else {
openib_btl->srq[BTL_OPENIB_HP_QP] = NULL;
openib_btl->srq[BTL_OPENIB_LP_QP] = NULL;
}
}
/* Create the low and high priority queue pairs */
/* Create the CQs, one HP, one LP */
openib_btl->ib_cq[BTL_OPENIB_LP_CQ] =
ibv_create_cq_compat(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_lp_cq_size,
#if OMPI_ENABLE_PROGRESS_THREADS == 1
#if OMPI_IBV_CREATE_CQ_ARGS == 3
openib_btl->ib_cq[BTL_OPENIB_LP_QP] =
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, openib_btl->hca->ib_channel);
openib_btl, openib_btl->hca->ib_channel,
#else
openib_btl->ib_cq[BTL_OPENIB_LP_QP] =
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, openib_btl, openib_btl->hca->ib_channel, 0);
NULL, NULL,
#endif
#else /* OMPI_ENABLE_PROGRESS_THREADS DISABLED */
#if OMPI_IBV_CREATE_CQ_ARGS == 3
openib_btl->ib_cq[BTL_OPENIB_LP_QP] =
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, NULL);
#else
openib_btl->ib_cq[BTL_OPENIB_LP_QP] =
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, NULL, NULL, 0);
#endif
#endif /* OMPI_ENABLE_PROGRESS_THREADS */
0);
if (NULL == openib_btl->ib_cq[BTL_OPENIB_LP_QP]) {
if (NULL == openib_btl->ib_cq[BTL_OPENIB_LP_CQ]) {
show_init_error(__FILE__, __LINE__, "ibv_create_cq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;
}
openib_btl->ib_cq[BTL_OPENIB_HP_CQ] =
ibv_create_cq_compat(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_hp_cq_size,
#if OMPI_ENABLE_PROGRESS_THREADS == 1
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_LP_QP], 0)) {
openib_btl, openib_btl->hca->ib_channel,
#else
NULL, NULL,
#endif
0);
if(NULL == openib_btl->ib_cq[BTL_OPENIB_HP_CQ]) {
show_init_error(__FILE__, __LINE__, "ibv_create_cq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;
}
openib_btl->cq_users[BTL_OPENIB_HP_CQ] = 0;
openib_btl->cq_users[BTL_OPENIB_LP_CQ] = 0;
#if OMPI_ENABLE_PROGRESS_THREADS == 1
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_LP_CQ], 0)) {
show_init_error(__FILE__, __LINE__, "ibv_req_notify_cq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;
}
#if OMPI_IBV_CREATE_CQ_ARGS == 3
openib_btl->ib_cq[BTL_OPENIB_HP_QP] =
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, openib_btl->hca->ib_channel);
#else
openib_btl->ib_cq[BTL_OPENIB_HP_QP] =
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, openib_btl, openib_btl->hca->ib_channel, 0);
#endif
#else /* OMPI_ENABLE_PROGRESS_THREADS DISABLED */
#if OMPI_IBV_CREATE_CQ_ARGS == 3
openib_btl->ib_cq[BTL_OPENIB_HP_QP] =
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, NULL);
#else
openib_btl->ib_cq[BTL_OPENIB_HP_QP] =
ibv_create_cq(openib_btl->hca->ib_dev_context,
mca_btl_openib_component.ib_cq_size, NULL, NULL, 0);
#endif
#endif /* OMPI_ENABLE_PROGRESS_THREADS */
if(NULL == openib_btl->ib_cq[BTL_OPENIB_HP_QP]) {
show_init_error(__FILE__, __LINE__, "ibv_create_cq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;
}
#if OMPI_ENABLE_PROGRESS_THREADS == 1
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_HP_QP], 0)) {
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_HP_CQ], 0)) {
show_init_error(__FILE__, __LINE__, "ibv_req_notify_cq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR;


@ -49,10 +49,37 @@ BEGIN_C_DECLS
#define MCA_BTL_IB_LEAVE_PINNED 1
#define IB_DEFAULT_GID_PREFIX 0xfe80000000000000ll
/**
* Infiniband (IB) BTL component.
*/
typedef enum {
MCA_BTL_OPENIB_PP_QP,
MCA_BTL_OPENIB_SRQ_QP
} mca_btl_openib_qp_type_t;
struct mca_btl_openib_pp_qp_info_t {
int32_t rd_win;
int32_t rd_rsv;
}; typedef struct mca_btl_openib_pp_qp_info_t mca_btl_openib_pp_qp_info_t;
struct mca_btl_openib_srq_qp_info_t {
int32_t sd_max;
}; typedef struct mca_btl_openib_srq_qp_info_t mca_btl_openib_srq_qp_info_t;
struct mca_btl_openib_qp_info_t {
size_t size;
int32_t rd_num;
int32_t rd_low;
mca_btl_openib_qp_type_t type;
union {
mca_btl_openib_pp_qp_info_t pp_qp;
mca_btl_openib_srq_qp_info_t srq_qp;
} u;
}; typedef struct mca_btl_openib_qp_info_t mca_btl_openib_qp_info_t;
struct mca_btl_openib_component_t {
mca_btl_base_component_1_0_1_t super; /**< base BTL component */
@ -88,22 +115,21 @@ struct mca_btl_openib_component_t {
char* ib_mpool_name;
/**< name of ib memory pool */
int32_t rd_num; /**< the number of receive descriptors to post to each queue pair */
int32_t rd_low; /**< low water mark to reach before posting additional receive descriptors */
int32_t rd_win; /**< ack credits when window size exceeded */
int32_t rd_rsv; /**< descriptors held in reserve for control messages */
int32_t srq_rd_max; /**< maximum number of receive descriptors posted on shared receive queue */
int32_t srq_rd_per_peer; /**< number of receive descriptors to post per log2(peers) in SRQ mode */
int32_t srq_sd_max; /**< maximum number of send descriptors posted in use SRQ mode */
uint8_t num_pp_qps; /**< number of pp qp's */
uint8_t num_srq_qps; /**< number of srq qp's */
uint8_t num_qps; /**< total number of qp's */
mca_btl_openib_qp_info_t* qp_infos;
size_t eager_limit; /**< Eager send limit of first fragment, in Bytes */
size_t max_send_size; /**< Maximum send size, in Bytes */
uint32_t reg_mru_len; /**< Length of the registration cache most recently used list */
uint32_t use_srq; /**< Use the Shared Receive Queue (SRQ mode) */
uint32_t ib_cq_size; /**< Max outstanding CQE on the CQ */
uint32_t ib_lp_cq_size; /**< Max outstanding CQE on the CQ */
uint32_t ib_hp_cq_size; /**< Max outstanding CQE on the CQ */
uint32_t ib_sg_list_size; /**< Max scatter/gather descriptor entries on the WQ */
uint32_t ib_pkey_ix; /**< InfiniBand pkey index */
uint32_t ib_pkey_val;
@ -124,8 +150,11 @@ struct mca_btl_openib_component_t {
uint32_t btls_per_lid;
uint32_t max_lmc;
uint32_t buffer_alignment; /**< Preferred communication buffer alignment in Bytes (must be power of two) */
#if OMPI_HAVE_POSIX_THREADS
int32_t fatal_counter; /**< Counts number on fatal events that we got on all hcas */
#if OMPI_HAVE_THREADS
int32_t fatal_counter; /**< Counts number on fatal events that we got on all hcas */
int async_pipe[2]; /**< Pipe for comunication with async event thread */
pthread_t async_thread; /**< Async thread that will handle fatal errors */
uint32_t use_async_event_thread; /**< Use the async event handler */
#endif
char *if_include;
char **if_include_list;
@ -155,6 +184,9 @@ struct mca_btl_openib_component_t {
/** Whether we want fork support or not */
int want_fork_support;
#endif
int rdma_qp;
int eager_rdma_qp;
}; typedef struct mca_btl_openib_component_t mca_btl_openib_component_t;
OMPI_MODULE_DECLSPEC extern mca_btl_openib_component_t mca_btl_openib_component;
@ -198,14 +230,36 @@ struct mca_btl_openib_hca_t {
/* Whether this HCA supports eager RDMA */
uint8_t use_eager_rdma;
uint8_t btls; /** < number of btls using this HCA */
#if OMPI_HAVE_POSIX_THREADS
/* Support for fatal event handling */
pthread_t async_thread; /* Async thread that will handle fatal errors */
#if OMPI_HAVE_THREADS
volatile bool got_fatal_event;
#endif
bool got_fatal_event;
};
typedef struct mca_btl_openib_hca_t mca_btl_openib_hca_t;
struct mca_btl_openib_module_pp_qp_t {
int32_t dummy;
}; typedef struct mca_btl_openib_module_pp_qp_t mca_btl_openib_module_pp_qp_t;
struct mca_btl_openib_module_srq_qp_t {
struct ibv_srq *srq;
int32_t rd_posted;
int32_t sd_credits; /* the max number of outstanding sends on a QP when using SRQ */
/* i.e. the number of frags that can be outstanding (down counter) */
opal_list_t pending_frags; /**< list of pending frags */
}; typedef struct mca_btl_openib_module_srq_qp_t mca_btl_openib_module_srq_qp_t;
struct mca_btl_openib_module_qp_t {
ompi_free_list_t send_free; /**< free lists of send buffer descriptors */
ompi_free_list_t recv_free; /**< free lists of receive buffer descriptors */
mca_btl_openib_qp_type_t type;
union {
mca_btl_openib_module_pp_qp_t pp_qp;
mca_btl_openib_module_srq_qp_t srq_qp;
} u;
}; typedef struct mca_btl_openib_module_qp_t mca_btl_openib_module_qp_t;
/**
* IB BTL Interface
*/
@ -218,42 +272,37 @@ struct mca_btl_openib_module_t {
uint8_t port_num; /**< ID of the PORT */
uint16_t pkey_index;
struct ibv_cq *ib_cq[2];
uint32_t cq_users[2];
struct ibv_port_attr ib_port_attr;
uint16_t lid; /**< lid that is actually used (for LMC) */
uint8_t src_path_bits; /**< offset from base lid (for LMC) */
ompi_free_list_t send_free[2]; /**< free lists of send buffer descriptors */
ompi_free_list_t send_free_frag; /**< free list of frags only... used for pining memory */
int32_t num_peers;
ompi_free_list_t recv_free[2]; /**< free lists of receive buffer descriptors */
ompi_free_list_t recv_free_frag; /**< free list of frags only... used for pining memory */
ompi_free_list_t send_user_free; /**< free list of frags only...
* used for pining user memory */
ompi_free_list_t recv_user_free; /**< free list of frags only...
* used for pining user memory */
ompi_free_list_t send_free_control; /**< frags for control massages */
opal_mutex_t ib_lock; /**< module level lock */
/**< an array to allow posting of rr in one swoop */
size_t ib_inline_max; /**< max size of inline send*/
bool poll_cq;
struct ibv_srq *srq[2];
int32_t srd_posted[2];
int32_t num_peers;
int32_t rd_num;
int32_t rd_low;
int32_t sd_credits[2]; /* the max number of outstanding sends on a QP when using SRQ */
/**< number of frags that can be outstanding (down counter) */
opal_list_t pending_frags[2]; /**< list of pending frags */
size_t eager_rdma_frag_size; /**< length of eager frag */
orte_pointer_array_t *eager_rdma_buffers; /**< RDMA buffers to poll */
volatile int32_t eager_rdma_buffers_count; /**< number of RDMA buffers */
mca_btl_base_module_error_cb_fn_t error_cb; /**< error handler */
mca_btl_openib_module_qp_t * qps;
orte_pointer_array_t *endpoints;
};
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
@ -468,8 +517,9 @@ extern void mca_btl_openib_send_frag_return(mca_btl_base_module_t* btl,
extern int mca_btl_openib_ft_event(int state);
#define BTL_OPENIB_HP_QP 0
#define BTL_OPENIB_LP_QP 1
#define BTL_OPENIB_HP_CQ 0
#define BTL_OPENIB_LP_CQ 1
/**
* Post to Shared Receive Queue with certain priority
@ -482,38 +532,66 @@ extern int mca_btl_openib_ft_event(int state);
static inline int mca_btl_openib_post_srr(mca_btl_openib_module_t* openib_btl,
const int additional,
const int prio)
const int qp)
{
assert(MCA_BTL_OPENIB_SRQ_QP == openib_btl->qps[qp].type);
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
if(openib_btl->srd_posted[prio] <= openib_btl->rd_low + additional &&
openib_btl->srd_posted[prio] < openib_btl->rd_num) {
if(openib_btl->qps[qp].u.srq_qp.rd_posted <=
mca_btl_openib_component.qp_infos[qp].rd_low + additional &&
openib_btl->qps[qp].u.srq_qp.rd_posted <
mca_btl_openib_component.qp_infos[qp].rd_num) {
int rc;
int32_t i, num_post = openib_btl->rd_num - openib_btl->srd_posted[prio];
int32_t i, num_post = mca_btl_openib_component.qp_infos[qp].rd_num -
openib_btl->qps[qp].u.srq_qp.rd_posted;
struct ibv_recv_wr *bad_wr;
ompi_free_list_t *free_list;
free_list = &openib_btl->recv_free[prio];
free_list = &openib_btl->qps[qp].recv_free;
for(i = 0; i < num_post; i++) {
ompi_free_list_item_t* item;
mca_btl_openib_frag_t* frag;
OMPI_FREE_LIST_WAIT(free_list, item, rc);
frag = (mca_btl_openib_frag_t*)item;
if(ibv_post_srq_recv(openib_btl->srq[prio], &frag->wr_desc.rd_desc,
&bad_wr)) {
frag->base.order = qp;
if(ibv_post_srq_recv(openib_btl->qps[qp].u.srq_qp.srq,
&frag->wr_desc.rd_desc,
&bad_wr)) {
BTL_ERROR(("error posting receive descriptors to shared "
"receive queue: %s", strerror(errno)));
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_ERROR;
}
}
OPAL_THREAD_ADD32(&openib_btl->srd_posted[prio], num_post);
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.rd_posted, num_post);
}
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_SUCCESS;
}
static inline int mca_btl_openib_post_srr_all(mca_btl_openib_module_t *openib_btl,
const int additional)
{
int qp;
for(qp = 0; qp < mca_btl_openib_component.num_srq_qps; qp++){
if(MCA_BTL_OPENIB_SRQ_QP == openib_btl->qps[qp].type) {
mca_btl_openib_post_srr(openib_btl, additional, qp);
}
}
return OMPI_SUCCESS;
}
#define BTL_OPENIB_EAGER_RDMA_QP(QP) \
((QP) == mca_btl_openib_component.eager_rdma_qp)
#define BTL_OPENIB_RDMA_QP(QP) \
((QP) == mca_btl_openib_component.rdma_qp)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
END_C_DECLS
#endif /* MCA_BTL_IB_H */

ompi/mca/btl/openib/btl_openib_async.c (new file, 313 lines)

@ -0,0 +1,313 @@
/*
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#if OMPI_HAVE_THREADS
#include <infiniband/verbs.h>
#include <fcntl.h>
#include <sys/poll.h>
#include <unistd.h>
#include <errno.h>
#include "opal/util/output.h"
#include "opal/util/show_help.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/base.h"
#include "btl_openib.h"
#include "btl_openib_mca.h"
#include "btl_openib_async.h"
struct mca_btl_openib_async_poll {
int active_poll_size;
int poll_size;
struct pollfd *async_pollfd;
};
typedef struct mca_btl_openib_async_poll mca_btl_openib_async_poll;
static int btl_openib_async_poll_init(struct mca_btl_openib_async_poll *hcas_poll);
static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *hcas_poll);
static int btl_openib_async_hcah(struct mca_btl_openib_async_poll *hcas_poll, int index);
static const char *openib_event_to_str (enum ibv_event_type event);
/* Function converts event to string (name)
* Open Fabris don't have function that do this job :(
*/
static const char *openib_event_to_str (enum ibv_event_type event)
{
switch (event) {
case IBV_EVENT_CQ_ERR:
return "IBV_EVENT_CQ_ERR";
case IBV_EVENT_QP_FATAL:
return "IBV_EVENT_QP_FATAL";
case IBV_EVENT_QP_REQ_ERR:
return "IBV_EVENT_QP_REQ_ERR";
case IBV_EVENT_QP_ACCESS_ERR:
return "IBV_EVENT_QP_ACCESS_ERR";
case IBV_EVENT_PATH_MIG:
return "IBV_EVENT_PATH_MIG";
case IBV_EVENT_PATH_MIG_ERR:
return "IBV_EVENT_PATH_MIG_ERR";
case IBV_EVENT_DEVICE_FATAL:
return "IBV_EVENT_DEVICE_FATAL";
case IBV_EVENT_SRQ_ERR:
return "IBV_EVENT_SRQ_ERR";
case IBV_EVENT_PORT_ERR:
return "IBV_EVENT_PORT_ERR";
case IBV_EVENT_COMM_EST:
return "IBV_EVENT_COMM_EST";
case IBV_EVENT_PORT_ACTIVE:
return "IBV_EVENT_PORT_ACTIVE";
case IBV_EVENT_SQ_DRAINED:
return "IBV_EVENT_SQ_DRAINED";
case IBV_EVENT_LID_CHANGE:
return "IBV_EVENT_LID_CHANGE";
case IBV_EVENT_PKEY_CHANGE:
return "IBV_EVENT_PKEY_CHANGE";
case IBV_EVENT_SM_CHANGE:
return "IBV_EVENT_SM_CHANGE";
case IBV_EVENT_QP_LAST_WQE_REACHED:
return "IBV_EVENT_QP_LAST_WQE_REACHED";
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
case IBV_EVENT_CLIENT_REREGISTER:
return "IBV_EVENT_CLIENT_REREGISTER";
#endif
case IBV_EVENT_SRQ_LIMIT_REACHED:
return "IBV_EVENT_SRQ_LIMIT_REACHED";
default:
return "UNKNOWN";
}
}
/* Function inits mca_btl_openib_async_poll */
static int btl_openib_async_poll_init(struct mca_btl_openib_async_poll *hcas_poll)
{
hcas_poll->active_poll_size = 1;
hcas_poll->poll_size = 4;
hcas_poll->async_pollfd = malloc(sizeof(struct pollfd) * hcas_poll->poll_size);
if (NULL == hcas_poll->async_pollfd) {
BTL_ERROR(("Failed malloc: %s:%d"
, __FILE__, __LINE__));
return OMPI_ERROR;
}
/* Creating comunication channel with the main thread */
hcas_poll->async_pollfd[0].fd = mca_btl_openib_component.async_pipe[0];
hcas_poll->async_pollfd[0].events = POLLIN;
hcas_poll->async_pollfd[0].revents = 0;
return OMPI_SUCCESS;
}
/* Function handle async thread commands */
static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *hcas_poll)
{
struct pollfd *async_pollfd_tmp;
int fd,flags,j;
/* Got command from main thread */
if (read(hcas_poll->async_pollfd[0].fd, &fd, sizeof(int)) < 0) {
BTL_ERROR(("Read failed [%d]",errno));
return OMPI_ERROR;
}
BTL_VERBOSE(("GOT event from -> %d",fd));
if (fd > 0) {
BTL_VERBOSE(("Adding HCA [%d] to async event poll[%d]"
,fd,hcas_poll->active_poll_size));
flags = fcntl(fd, F_GETFL);
if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
BTL_ERROR(("Failed to change file descriptor of async event"));
return OMPI_ERROR;
}
if ((hcas_poll->active_poll_size + 1) > hcas_poll->poll_size) {
hcas_poll->poll_size+=hcas_poll->poll_size;
async_pollfd_tmp = malloc(sizeof(struct pollfd) * hcas_poll->poll_size);
if (NULL == async_pollfd_tmp) {
BTL_ERROR(("Failed malloc: %s:%d"
"Fatal error, stoping asyn event thread"
, __FILE__, __LINE__));
return OMPI_ERROR;
}
memcpy (async_pollfd_tmp,hcas_poll->async_pollfd,
sizeof(struct pollfd) * (hcas_poll->active_poll_size));
free(hcas_poll->async_pollfd);
hcas_poll->async_pollfd = async_pollfd_tmp;
}
hcas_poll->async_pollfd[hcas_poll->active_poll_size].fd = fd;
hcas_poll->async_pollfd[hcas_poll->active_poll_size].events = POLLIN;
hcas_poll->async_pollfd[hcas_poll->active_poll_size].revents = 0;
hcas_poll->active_poll_size++;
} else if (fd < 0) {
bool fd_found = false;
/* Removing HCA from poll */
fd = -(fd);
BTL_VERBOSE(("Removing HCA [%d] from async event poll [%d]"
,fd,hcas_poll->active_poll_size));
if (hcas_poll->active_poll_size > 1) {
for (j=0; (j < hcas_poll->active_poll_size || !fd_found); j++) {
if (hcas_poll->async_pollfd[j].fd == fd) {
hcas_poll->async_pollfd[j].fd =
hcas_poll->async_pollfd[hcas_poll->active_poll_size-1].fd;
hcas_poll->async_pollfd[j].events =
hcas_poll->async_pollfd[hcas_poll->active_poll_size-1].events;
hcas_poll->async_pollfd[j].revents =
hcas_poll->async_pollfd[hcas_poll->active_poll_size-1].revents;
fd_found = true;
}
}
if (!fd_found) {
BTL_ERROR(("Requested FD[%d] was not found in poll array\n",fd));
return OMPI_ERROR;
}
}
hcas_poll->active_poll_size--;
} else {
/* Got 0 - command to close the thread */
BTL_VERBOSE(("Async event thread exit"));
free(hcas_poll->async_pollfd);
pthread_exit(NULL);
}
return OMPI_SUCCESS;
}
/* Function handle async hca events */
static int btl_openib_async_hcah(struct mca_btl_openib_async_poll *hcas_poll, int index)
{
int j;
mca_btl_openib_hca_t *hca = NULL;
struct ibv_async_event event;
/* We need to find correct hca and process this event */
for (j=0; j < mca_btl_openib_component.ib_num_btls; j++) {
if (mca_btl_openib_component.openib_btls[j]->hca->ib_dev_context->async_fd ==
hcas_poll->async_pollfd[index].fd ) {
hca = mca_btl_openib_component.openib_btls[j]->hca;
}
}
if (NULL != hca) {
if (ibv_get_async_event((struct ibv_context *)hca->ib_dev_context,&event) < 0) {
if (EWOULDBLOCK == errno) {
/* No event found ?
* It was handled by somebody other */
return OMPI_SUCCESS;
} else {
BTL_ERROR(("Failed to get async event"));
return OMPI_ERROR;
}
}
switch(event.event_type) {
case IBV_EVENT_DEVICE_FATAL:
/* Set the flag to fatal */
hca->got_fatal_event = true;
/* It is not critical to protect the counter */
OPAL_THREAD_ADD32(&mca_btl_openib_component.fatal_counter, 1);
case IBV_EVENT_CQ_ERR:
case IBV_EVENT_QP_FATAL:
case IBV_EVENT_QP_REQ_ERR:
case IBV_EVENT_QP_ACCESS_ERR:
case IBV_EVENT_PATH_MIG:
case IBV_EVENT_PATH_MIG_ERR:
case IBV_EVENT_SRQ_ERR:
case IBV_EVENT_PORT_ERR:
opal_show_help("help-mpi-btl-openib.txt", "of error event",
true,orte_system_info.nodename, orte_process_info.pid,
event.event_type, openib_event_to_str(event.event_type));
break;
case IBV_EVENT_COMM_EST:
case IBV_EVENT_PORT_ACTIVE:
case IBV_EVENT_SQ_DRAINED:
case IBV_EVENT_LID_CHANGE:
case IBV_EVENT_PKEY_CHANGE:
case IBV_EVENT_SM_CHANGE:
case IBV_EVENT_QP_LAST_WQE_REACHED:
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
case IBV_EVENT_CLIENT_REREGISTER:
#endif
case IBV_EVENT_SRQ_LIMIT_REACHED:
break;
default:
opal_show_help("help-mpi-btl-openib.txt", "of unknown event",
true,orte_system_info.nodename, orte_process_info.pid,
event.event_type);
}
ibv_ack_async_event(&event);
} else {
/* the hca == NULL , we failed to locate the HCA
* this failure should not never happed */
BTL_ERROR(("Failed to find HCA with FD %d."
"Fatal error, stoping asyn event thread"
,hcas_poll->async_pollfd[index].fd));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
/* This Async event thread is handling all async event of
* all btls/hcas in openib component
*/
void* btl_openib_async_thread(void * async)
{
int rc;
int i;
struct mca_btl_openib_async_poll hcas_poll;
if (OMPI_SUCCESS != btl_openib_async_poll_init(&hcas_poll)) {
BTL_ERROR(("Fatal error, stoping asyn event thread"));
pthread_exit(NULL);
}
while(1) {
rc = poll(hcas_poll.async_pollfd, hcas_poll.active_poll_size, -1);
if (rc < 0) {
if (errno != EINTR) {
BTL_ERROR(("Poll failed.Fatal error, stoping asyn event thread"));
pthread_exit(NULL);
} else {
/* EINTR - we got interupt */
continue;
}
}
for(i = 0; i < hcas_poll.active_poll_size; i++) {
switch (hcas_poll.async_pollfd[i].revents) {
case 0:
/* no events */
break;
case POLLIN:
/* Processing our event */
if (0 == i) {
/* 0 poll we use for comunication with main thread */
if (OMPI_SUCCESS != btl_openib_async_commandh(&hcas_poll)) {
free(hcas_poll.async_pollfd);
BTL_ERROR(("Failed to process async thread process."
"Fatal error, stoping asyn event thread"));
pthread_exit(NULL);
}
} else {
/* We get hca event */
if (btl_openib_async_hcah(&hcas_poll, i)) {
free(hcas_poll.async_pollfd);
BTL_ERROR(("Failed to process async thread process."
"Fatal error, stoping asyn event thread"));
pthread_exit(NULL);
}
}
break;
default:
/* Get event other than POLLIN
* this case should not never happend */
BTL_ERROR(("Got unexpected event %d."
"Fatal error, stoping asyn event thread"
,hcas_poll.async_pollfd[i].revents));
free(hcas_poll.async_pollfd);
pthread_exit(NULL);
}
}
}
return PTHREAD_CANCELED;
}
#endif

ompi/mca/btl/openib/btl_openib_async.h (new file, 17 lines)

@ -0,0 +1,17 @@
/*
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* @file
*/
#ifndef MCA_BTL_OPENIB_ASYNC_H
#define MCA_BTL_OPENIB_ASYNC_H
void* btl_openib_async_thread(void *one_hca);
#endif


@ -50,6 +50,9 @@
#include "ompi/datatype/convertor.h"
#include "ompi/mca/mpool/mpool.h"
#if OMPI_HAVE_THREADS
#include "btl_openib_async.h"
#endif
#include <infiniband/verbs.h>
#include <errno.h>
#include <string.h> /* for strerror()*/
@ -82,16 +85,21 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
static char* btl_openib_component_status_to_string(enum ibv_wc_status status);
static int btl_openib_component_progress(void);
static int btl_openib_module_progress(mca_btl_openib_module_t *openib_btl);
static void btl_openib_frag_progress_pending(
mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint,
const int prio);
static void btl_openib_frag_progress_pending_pp(
mca_btl_base_endpoint_t *endpoint,
const int qp);
static void btl_openib_frag_progress_pending_srq(
mca_btl_openib_module_t* openib_btl,
mca_btl_base_endpoint_t *endpoint,
const int qp);
static void btl_openib_frag_progress_pending_put_get(
mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint,
const int qp);
static int openib_reg_mr(void *reg_data, void *base, size_t size,
mca_mpool_base_registration_t *reg);
static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg);
static int get_port_list(mca_btl_openib_hca_t *hca, int *allowed_ports);
#if OMPI_HAVE_POSIX_THREADS
void* btl_openib_async_thread(void *one_hca);
#endif
mca_btl_openib_component_t mca_btl_openib_component = {
@ -145,11 +153,12 @@ int btl_openib_component_open(void)
/* register IB component parameters */
ret = btl_openib_register_mca_params();
mca_btl_openib_component.max_send_size =
mca_btl_openib_module.super.btl_max_send_size;
mca_btl_openib_component.eager_limit =
mca_btl_openib_module.super.btl_eager_limit;
mca_btl_openib_component.max_send_size =
mca_btl_openib_module.super.btl_max_send_size;
mca_btl_openib_component.eager_limit =
mca_btl_openib_module.super.btl_eager_limit;
srand48(getpid() * time(NULL));
return ret;
}
@ -199,7 +208,7 @@ static int btl_openib_modex_send(void)
}
/*
* Callback function on control message.
* Active Message Callback function on control message.
*/
static void btl_openib_control(struct mca_btl_base_module_t* btl,
@ -213,20 +222,26 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl,
mca_btl_openib_control_header_t *ctl_hdr = frag->segment.seg_addr.pval;
mca_btl_openib_eager_rdma_header_t *rdma_hdr;
mca_btl_openib_rdma_credits_header_t *credits_hdr;
if(frag->size == mca_btl_openib_component.eager_limit) {
int qp = frag->qp_idx;
opal_output(mca_btl_base_output, "got a control message\n");
if(BTL_OPENIB_EAGER_RDMA_QP(qp)) {
/* if not sent via rdma */
if(!MCA_BTL_OPENIB_RDMA_FRAG(frag) &&
ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) {
OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_HP_QP], -1);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, -1);
/* assert(endpoint->qps[qp].u.pp_qp.rd_credits >= -(mca_btl_openib_component.qp_infos[qp].rd_num - mca_btl_openib_component.qp_infos[qp].rd_low)); */
}
} else {
OPAL_THREAD_ADD32(&endpoint->rd_credits[BTL_OPENIB_LP_QP], -1);
} else if (ctl_hdr->type == MCA_BTL_OPENIB_CONTROL_CREDITS) {
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, -1);
/* assert(endpoint->qps[qp].u.pp_qp.rd_credits >= -(mca_btl_openib_component.qp_infos[qp].rd_num - mca_btl_openib_component.qp_infos[qp].rd_low)); */
}
switch (ctl_hdr->type) {
case MCA_BTL_OPENIB_CONTROL_CREDITS:
opal_output(mca_btl_base_output, "got me some credits \n");
credits_hdr = (mca_btl_openib_rdma_credits_header_t*)ctl_hdr;
if(endpoint->nbo) {
BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH((*credits_hdr));
@ -293,7 +308,7 @@ static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
if(openib_reg->mr != NULL) {
if(ibv_dereg_mr(openib_reg->mr)) {
opal_output(0, "%s: error unpinning openib memory errno says %s\n",
opal_output(mca_btl_base_output, "%s: error unpinning openib memory errno says %s\n",
__func__, strerror(errno));
return OMPI_ERROR;
}
@ -629,39 +644,29 @@ static int init_one_hca(opal_list_t *btl_list, struct ibv_device* ib_dev)
}
}
if (hca->btls != 0){
if (hca->btls != 0) {
#if OMPI_HAVE_THREADS
if (mca_btl_openib_component.use_async_event_thread) {
hca->got_fatal_event = false;
if (write(mca_btl_openib_component.async_pipe[1],
&hca->ib_dev_context->async_fd,
sizeof(int))<0){
BTL_ERROR(("Failed to write to pipe [%d]",errno));
goto comp_channel;
}
}
#if OMPI_ENABLE_PROGRESS_THREADS == 1
/* Prepare data for thread, but not starting it */
OBJ_CONSTRUCT(&hca->thread, opal_thread_t);
hca->thread.t_run = mca_btl_openib_progress_thread;
hca->thread.t_arg = hca;
hca->progress = false;
#endif
hca->got_fatal_event=false;
#if OMPI_HAVE_POSIX_THREADS
/* Starting async event thread */
ret = pthread_create(&hca->async_thread,NULL,
(void*(*)(void*))btl_openib_async_thread,hca);
if (ret != 0) {
#if OMPI_ENABLE_PROGRESS_THREADS == 1
/* Failed to create async thread. Cancel the progress
* and exit
*/
if (pthread_cancel(hca->thread.t_handle)) {
BTL_ERROR(("Failed to cancel OpenIB progress thread"));
}
opal_thread_join(&hca->thread, NULL);
#endif
BTL_ERROR(("Failed to create async thread for openib"));
ret = OMPI_ERROR;
} else {
return OMPI_SUCCESS;
}
#else
return OMPI_SUCCESS;
#endif
}
#endif
return OMPI_SUCCESS;
comp_channel:
#if OMPI_ENABLE_PROGRESS_THREADS == 1
ibv_destroy_comp_channel(hca->ib_channel);
mpool_destroy:
@ -718,9 +723,16 @@ btl_openib_component_init(int *num_btl_modules,
if (OMPI_SUCCESS != (ret = ompi_btl_openib_ini_init())) {
goto no_btls;
}
#if OMPI_HAVE_POSIX_THREADS
#if OMPI_HAVE_THREADS
/* Set the fatal counter to zero */
mca_btl_openib_component.fatal_counter = 0;
/* Create pipe for communication with async event thread */
if (mca_btl_openib_component.use_async_event_thread) {
if (pipe (mca_btl_openib_component.async_pipe)) {
BTL_ERROR(("Failed to create pipe for communication with async event thread"));
return NULL;
}
}
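/* Pipe protocol (consumed by btl_openib_async_thread): writing an HCA's
* async_fd registers that HCA for polling; writing 0 asks the thread to
* shut down. */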
#endif
/* If we want fork support, try to enable it */
@ -810,9 +822,20 @@ btl_openib_component_init(int *num_btl_modules,
OBJ_CONSTRUCT(&btl_list, opal_list_t);
OBJ_CONSTRUCT(&mca_btl_openib_component.ib_lock, opal_mutex_t);
for (i = 0; i < num_devs &&
(-1 == mca_btl_openib_component.ib_max_btls ||
mca_btl_openib_component.ib_num_btls <
mca_btl_openib_component.ib_max_btls); i++){
(-1 == mca_btl_openib_component.ib_max_btls ||
mca_btl_openib_component.ib_num_btls <
mca_btl_openib_component.ib_max_btls); i++){
#if OMPI_HAVE_THREADS
if (mca_btl_openib_component.use_async_event_thread &&
0 == i) {
/* Starting async event thread for the component */
if (pthread_create(&mca_btl_openib_component.async_thread,NULL,
(void*(*)(void*))btl_openib_async_thread,NULL)) {
BTL_ERROR(("Failed to create async event thread for openib"));
return NULL;
}
}
#endif
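/* The async event thread is created once, on the first loop iteration and
* before any HCA is probed; init_one_hca() later registers each HCA's
* async_fd with it over the pipe. */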
if (OMPI_SUCCESS != (ret = init_one_hca(&btl_list, ib_devs[i]))) {
break;
}
@ -839,6 +862,21 @@ btl_openib_component_init(int *num_btl_modules,
}
if(0 == mca_btl_openib_component.ib_num_btls) {
#if OMPI_HAVE_THREADS
if (mca_btl_openib_component.use_async_event_thread) {
int async_command = 0;
/* no BTLs were created: signal the async thread to shut down */
if (write(mca_btl_openib_component.async_pipe[1],
&async_command,sizeof(int))<0){
BTL_ERROR(("Failed to write to pipe"));
return NULL;
}
if (pthread_join(mca_btl_openib_component.async_thread, NULL)) {
BTL_ERROR(("Failed to stop OpenIB async event thread"));
return NULL;
}
}
#endif
opal_show_help("help-mpi-btl-openib.txt",
"no active ports found", true, orte_system_info.nodename);
return NULL;
@ -862,132 +900,79 @@ btl_openib_component_init(int *num_btl_modules,
/* Copy the btl module structs into a contiguous array and fully
initialize them */
for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++){
mca_btl_openib_frag_init_data_t* init_data;
int qp;
item = opal_list_remove_first(&btl_list);
ib_selected = (mca_btl_base_selected_module_t*)item;
mca_btl_openib_component.openib_btls[i] = (mca_btl_openib_module_t*)ib_selected->btl_module;
OBJ_RELEASE(ib_selected);
openib_btl = mca_btl_openib_component.openib_btls[i];
openib_btl->rd_num = mca_btl_openib_component.rd_num +
mca_btl_openib_component.rd_rsv;
openib_btl->rd_low = mca_btl_openib_component.rd_low;
openib_btl->num_peers = 0;
if(mca_btl_openib_component.use_srq) {
openib_btl->sd_credits[BTL_OPENIB_HP_QP] =
openib_btl->sd_credits[BTL_OPENIB_LP_QP] = mca_btl_openib_component.srq_sd_max;
}
/* Initialize module state */
OBJ_CONSTRUCT(&openib_btl->pending_frags[BTL_OPENIB_HP_QP], opal_list_t);
OBJ_CONSTRUCT(&openib_btl->pending_frags[BTL_OPENIB_LP_QP], opal_list_t);
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
OBJ_CONSTRUCT(&openib_btl->send_free[BTL_OPENIB_HP_QP], ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->send_free[BTL_OPENIB_LP_QP], ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->send_free_frag, ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->send_free_control, ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->recv_free[BTL_OPENIB_HP_QP], ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->recv_free[BTL_OPENIB_LP_QP], ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->recv_free_frag, ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->send_user_free, ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->recv_user_free, ompi_free_list_t);
/* setup the qp structure */
openib_btl->qps =
(mca_btl_openib_module_qp_t*)
malloc(sizeof(mca_btl_openib_module_qp_t)*
mca_btl_openib_component.num_qps);
/* initialize the memory pool using the hca */
openib_btl->super.btl_mpool = openib_btl->hca->mpool;
/* Initialize pool of send fragments */
length = sizeof(mca_btl_openib_send_frag_eager_t) +
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
openib_btl->super.btl_eager_limit;
openib_btl->eager_rdma_frag_size = OPAL_ALIGN(
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
openib_btl->super.btl_eager_limit,
mca_btl_openib_component.buffer_alignment, size_t);
ompi_free_list_init_ex(&openib_btl->send_free[BTL_OPENIB_HP_QP],
length,
mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_send_frag_eager_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool);
length = sizeof(mca_btl_openib_recv_frag_eager_t) +
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
openib_btl->super.btl_eager_limit;
ompi_free_list_init_ex(&openib_btl->recv_free[BTL_OPENIB_HP_QP],
length,
mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_recv_frag_eager_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool);
length = sizeof(mca_btl_openib_send_frag_max_t) +
sizeof(mca_btl_openib_header_t) +
openib_btl->super.btl_max_send_size;
ompi_free_list_init_ex(&openib_btl->send_free[BTL_OPENIB_LP_QP],
length,
mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_send_frag_max_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool);
length = sizeof(mca_btl_openib_recv_frag_max_t) +
sizeof(mca_btl_openib_header_t) +
openib_btl->super.btl_max_send_size;
/* Initialize pool of receive fragments */
ompi_free_list_init_ex(&openib_btl->recv_free[BTL_OPENIB_LP_QP],
length,
mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_recv_frag_max_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool);
length = sizeof(mca_btl_openib_send_frag_control_t) +
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
sizeof(mca_btl_openib_eager_rdma_header_t);
ompi_free_list_init_ex(&openib_btl->send_free_control,
length,
mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_send_frag_control_t),
mca_btl_openib_component.ib_free_list_num,
-1,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool);
length = sizeof(mca_btl_openib_frag_t);
ompi_free_list_init(&openib_btl->send_free_frag,
length,
OBJ_CLASS(mca_btl_openib_send_frag_frag_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
NULL);
ompi_free_list_init(&openib_btl->recv_free_frag,
length,
OBJ_CLASS(mca_btl_openib_recv_frag_frag_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
NULL);
init_data = (mca_btl_openib_frag_init_data_t*)
malloc(sizeof(mca_btl_openib_frag_init_data_t));
init_data->length = length;
init_data->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
init_data->order = mca_btl_openib_component.rdma_qp;
init_data->list = &openib_btl->send_user_free;
if(OMPI_SUCCESS != ompi_free_list_init_ex( &openib_btl->send_user_free,
length,
2,
OBJ_CLASS(mca_btl_openib_send_user_frag_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
NULL,
mca_btl_openib_frag_init,
(void*)init_data)) {
return NULL;
}
init_data = (mca_btl_openib_frag_init_data_t*)
malloc(sizeof(mca_btl_openib_frag_init_data_t));
init_data->length = length;
init_data->type = MCA_BTL_OPENIB_FRAG_RECV_USER;
init_data->order = mca_btl_openib_component.rdma_qp;
init_data->list = &openib_btl->recv_user_free;
if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->recv_user_free,
length,
2,
OBJ_CLASS(mca_btl_openib_recv_user_frag_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
NULL,
mca_btl_openib_frag_init,
(void*)init_data)) {
return NULL;
}
orte_pointer_array_init(&openib_btl->eager_rdma_buffers,
mca_btl_openib_component.max_eager_rdma,
@ -995,8 +980,114 @@ btl_openib_component_init(int *num_btl_modules,
0);
openib_btl->eager_rdma_buffers_count = 0;
orte_pointer_array_init(&openib_btl->endpoints, 10, INT_MAX, 100);
orte_pointer_array_init(&openib_btl->endpoints, 10, INT_MAX, 10);
btls[i] = &openib_btl->super;
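/* One eager RDMA slot holds a BTL header, up to btl_eager_limit bytes of
* payload, and a footer, rounded up to the configured buffer alignment: */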
openib_btl->eager_rdma_frag_size = OPAL_ALIGN(
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
openib_btl->super.btl_eager_limit,
mca_btl_openib_component.buffer_alignment, size_t);
length = sizeof(mca_btl_openib_send_frag_control_t) +
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
sizeof(mca_btl_openib_eager_rdma_header_t);
init_data = (mca_btl_openib_frag_init_data_t*)
malloc(sizeof(mca_btl_openib_frag_init_data_t));
init_data->length = sizeof(mca_btl_openib_eager_rdma_header_t);
init_data->type = MCA_BTL_OPENIB_FRAG_CONTROL;
init_data->order = mca_btl_openib_component.eager_rdma_qp;
init_data->list = &openib_btl->send_free_control;
if(OMPI_SUCCESS != ompi_free_list_init_ex(&openib_btl->send_free_control,
length,
mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_send_frag_control_t),
mca_btl_openib_component.ib_free_list_num,
-1,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool,
mca_btl_openib_frag_init,
init_data)) {
return NULL;
}
/* setup all the qps */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
OBJ_CONSTRUCT(&openib_btl->qps[qp].send_free, ompi_free_list_t);
OBJ_CONSTRUCT(&openib_btl->qps[qp].recv_free, ompi_free_list_t);
openib_btl->qps[qp].type = mca_btl_openib_component.qp_infos[qp].type;
if(MCA_BTL_OPENIB_SRQ_QP == openib_btl->qps[qp].type) {
OBJ_CONSTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags, opal_list_t);
openib_btl->qps[qp].u.srq_qp.sd_credits =
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
}
/* Initialize pool of send fragments */
length = sizeof(mca_btl_openib_send_frag_t) +
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
mca_btl_openib_component.qp_infos[qp].size;
init_data = (mca_btl_openib_frag_init_data_t*)
malloc(sizeof(mca_btl_openib_frag_init_data_t));
init_data->length = mca_btl_openib_component.qp_infos[qp].size;
init_data->type = MCA_BTL_OPENIB_FRAG_SEND;
init_data->order = qp;
init_data->list = &openib_btl->qps[qp].send_free;
if(OMPI_SUCCESS != ompi_free_list_init_ex(init_data->list,
length,
mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_send_frag_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool,
mca_btl_openib_frag_init,
(void*)init_data)) {
return NULL;
}
length = sizeof(mca_btl_openib_recv_frag_t) +
sizeof(mca_btl_openib_header_t) +
sizeof(mca_btl_openib_footer_t) +
mca_btl_openib_component.qp_infos[qp].size;
init_data = (mca_btl_openib_frag_init_data_t*)
malloc(sizeof(mca_btl_openib_frag_init_data_t));
init_data->length = mca_btl_openib_component.qp_infos[qp].size;
init_data->type = MCA_BTL_OPENIB_FRAG_RECV;
init_data->order = qp;
init_data->list = &openib_btl->qps[qp].recv_free;
if(OMPI_SUCCESS != ompi_free_list_init_ex(init_data->list,
length,
mca_btl_openib_component.buffer_alignment,
OBJ_CLASS(mca_btl_openib_recv_frag_t),
mca_btl_openib_component.ib_free_list_num,
mca_btl_openib_component.ib_free_list_max,
mca_btl_openib_component.ib_free_list_inc,
openib_btl->super.btl_mpool,
mca_btl_openib_frag_init,
init_data)) {
return NULL;
}
}
}
/* Post OOB receive to support dynamic connection setup */
@ -1047,14 +1138,14 @@ static void merge_values(ompi_btl_openib_ini_values_t *target,
static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
mca_btl_openib_endpoint_t *endpoint,
mca_btl_openib_frag_t *frag,
size_t byte_len, const int prio)
size_t byte_len, const int qp)
{
ompi_free_list_t *free_list;
if(endpoint->nbo) {
BTL_OPENIB_HEADER_NTOH((*(frag->hdr)));
}
free_list = &openib_btl->recv_free[prio];
free_list = frag->list;
/* advance the segment address past the header and subtract from the
* length..*/
@ -1066,20 +1157,35 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
openib_btl->ib_reg[frag->hdr->tag].cbdata);
if(BTL_OPENIB_IS_RDMA_CREDITS(frag->hdr->credits) &&
BTL_OPENIB_CREDITS(frag->hdr->credits) > 0)
BTL_OPENIB_CREDITS(frag->hdr->credits) > 0)
OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens,
BTL_OPENIB_CREDITS(frag->hdr->credits));
BTL_OPENIB_CREDITS(frag->hdr->credits));
else
if(!mca_btl_openib_component.use_srq && frag->hdr->credits > 0)
OPAL_THREAD_ADD32(&endpoint->sd_credits[prio], frag->hdr->credits);
if(MCA_BTL_OPENIB_PP_QP == endpoint->qps[qp].qp_type && frag->hdr->credits > 0) {
opal_output(mca_btl_base_output, "got %d sd_credits on qp:%d endpoint %p\n",
frag->hdr->credits, qp, (void*) endpoint);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits,
frag->hdr->credits);
assert(endpoint->qps[qp].u.pp_qp.sd_credits <=
mca_btl_openib_component.qp_infos[qp].rd_num);
}
if (!MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
/* repost receive descriptors if receive not by RDMA */
if(MCA_BTL_OPENIB_SRQ_QP == endpoint->qps[qp].qp_type) {
OPAL_THREAD_ADD32((int32_t*)&openib_btl->qps[qp].u.srq_qp.rd_posted, -1);
mca_btl_openib_post_srr(openib_btl, 0, qp);
} else {
OPAL_THREAD_ADD32((int32_t*)&endpoint->qps[qp].u.pp_qp.rd_posted, -1);
mca_btl_openib_endpoint_post_rr(endpoint, 0, qp);
}
OMPI_FREE_LIST_RETURN(free_list, (ompi_free_list_item_t*) frag);
} else {
mca_btl_openib_frag_t *tf;
OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
while (endpoint->eager_rdma_local.tail !=
endpoint->eager_rdma_local.head) {
endpoint->eager_rdma_local.head) {
tf = MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(endpoint,
endpoint->eager_rdma_local.tail);
if (MCA_BTL_OPENIB_RDMA_FRAG_LOCAL (tf))
@ -1089,38 +1195,27 @@ static int btl_openib_handle_incoming(mca_btl_openib_module_t *openib_btl,
}
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
}
/* decide if it is time to setup an eager rdma channel */
if (!endpoint->eager_rdma_local.base.pval &&
endpoint->use_eager_rdma &&
BTL_OPENIB_HP_QP == prio &&
openib_btl->eager_rdma_buffers_count <
mca_btl_openib_component.max_eager_rdma &&
OPAL_THREAD_ADD32(&endpoint->eager_recv_count, 1) ==
mca_btl_openib_component.eager_rdma_threshold) {
endpoint->use_eager_rdma &&
byte_len < mca_btl_openib_component.eager_limit &&
openib_btl->eager_rdma_buffers_count <
mca_btl_openib_component.max_eager_rdma &&
OPAL_THREAD_ADD32(&endpoint->eager_recv_count, 1) ==
mca_btl_openib_component.eager_rdma_threshold) {
mca_btl_openib_endpoint_connect_eager_rdma(endpoint);
}
/* repost receive descriptors if receive not by RDMA */
if(!MCA_BTL_OPENIB_RDMA_FRAG(frag)) {
if(mca_btl_openib_component.use_srq) {
OPAL_THREAD_ADD32((int32_t*)&openib_btl->srd_posted[prio], -1);
mca_btl_openib_post_srr(openib_btl, 0, prio);
} else {
OPAL_THREAD_ADD32((int32_t*)&endpoint->rd_posted[prio], -1);
btl_openib_endpoint_post_rr(endpoint, 0, prio);
}
/* We may receive credits here so try to progress only things that
* may be pending because of credit shortage */
if(MCA_BTL_OPENIB_PP_QP == endpoint->qps[qp].qp_type ||
BTL_OPENIB_EAGER_RDMA_QP(qp)) {
btl_openib_frag_progress_pending_pp(endpoint, qp);
if(btl_openib_check_send_credits(endpoint, qp))
mca_btl_openib_endpoint_send_credits(endpoint, qp);
}
/* nothing to progress for SRQ case */
if(!mca_btl_openib_component.use_srq) {
btl_openib_frag_progress_pending(openib_btl, endpoint, prio);
}
/* check to see if we need to return credits */
if(btl_openib_check_send_credits(endpoint, prio)) {
mca_btl_openib_endpoint_send_credits(endpoint, prio);
}
return OMPI_SUCCESS;
}
@ -1199,131 +1294,91 @@ static char* btl_openib_component_status_to_string(enum ibv_wc_status status)
}
}
#define BTL_OPENIB_TOKENS(E, P) ((E)->sd_credits[(P)] + \
(((P) == BTL_OPENIB_HP_QP)?(E)->eager_rdma_remote.tokens:0))
static void btl_openib_frag_progress_pending(
mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint,
const int prio)
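/* Send tokens available to endpoint E on QP P: SRQ QPs are not
* flow-controlled per peer, so they always report one token; per-peer QPs
* report the peer-granted send credits plus, for the eager RDMA QP, any
* eager RDMA tokens. */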
#define BTL_OPENIB_TOKENS(E, P) \
(((E)->qps[(P)].qp_type == MCA_BTL_OPENIB_SRQ_QP) ? 1 : \
((E)->qps[(P)].u.pp_qp.sd_credits + \
((BTL_OPENIB_EAGER_RDMA_QP(P))?(E)->eager_rdma_remote.tokens:0)))
static void btl_openib_frag_progress_pending_pp(
mca_btl_base_endpoint_t *endpoint, const int qp)
{
opal_list_item_t *frag_item;
mca_btl_openib_frag_t* frag;
size_t i, len = opal_list_get_size(&endpoint->pending_frags[prio]);
size_t i, len = opal_list_get_size(&endpoint->qps[qp].pending_frags);
/* check to see if we need to progress any pending descriptors */
for(i = 0; i < len && endpoint->sd_wqe[prio] > 0 &&
BTL_OPENIB_TOKENS(endpoint, prio) > 0; i++) {
for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0 &&
BTL_OPENIB_TOKENS(endpoint, qp) > 0; i++) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_frags[prio]));
frag_item =
opal_list_remove_first(&(endpoint->qps[qp].pending_frags));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
break;
if(mca_btl_openib_endpoint_send(frag->endpoint, frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
OMPI_ERR_OUT_OF_RESOURCE)
break;
}
}
if(BTL_OPENIB_LP_QP == prio) {
len = opal_list_get_size(&endpoint->pending_get_frags);
for(i = 0; i < len && endpoint->sd_wqe[BTL_OPENIB_LP_QP] > 0 &&
endpoint->get_tokens > 0; i++) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_get_frags));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
break;
if(mca_btl_openib_get((mca_btl_base_module_t *)openib_btl,
frag->endpoint, (mca_btl_base_descriptor_t*)frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
break;
}
len = opal_list_get_size(&endpoint->pending_put_frags);
for(i = 0; i < len && endpoint->sd_wqe[BTL_OPENIB_LP_QP] > 0; i++) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_put_frags));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
break;
if(mca_btl_openib_put((mca_btl_base_module_t*)openib_btl,
frag->endpoint, (mca_btl_base_descriptor_t*)frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
break;
}
static void btl_openib_frag_progress_pending_put_get(
mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint,
const int qp) {
opal_list_item_t *frag_item;
mca_btl_openib_frag_t* frag;
size_t i, len = opal_list_get_size(&endpoint->pending_get_frags);
for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0 &&
endpoint->get_tokens > 0; i++) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_get_frags));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
break;
if(mca_btl_openib_get((mca_btl_base_module_t *)openib_btl,
frag->endpoint, (mca_btl_base_descriptor_t*)frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
break;
}
len = opal_list_get_size(&endpoint->pending_put_frags);
for(i = 0; i < len && endpoint->qps[qp].sd_wqe > 0; i++) {
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
frag_item = opal_list_remove_first(&(endpoint->pending_put_frags));
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
break;
if(mca_btl_openib_put((mca_btl_base_module_t*)openib_btl,
frag->endpoint, (mca_btl_base_descriptor_t*)frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
break;
}
}
if(!mca_btl_openib_component.use_srq)
return;
len = opal_list_get_size(&openib_btl->pending_frags[prio]);
for(i = 0; i < len && openib_btl->sd_credits[prio] > 0; i++) {
static void btl_openib_frag_progress_pending_srq(
mca_btl_openib_module_t* openib_btl, mca_btl_base_endpoint_t *endpoint,
const int qp)
{
opal_list_item_t *frag_item;
mca_btl_openib_frag_t* frag;
size_t i, len;
assert(MCA_BTL_OPENIB_SRQ_QP == endpoint->qps[qp].qp_type);
len = opal_list_get_size(&openib_btl->qps[qp].u.srq_qp.pending_frags);
for(i = 0; i < len && openib_btl->qps[qp].u.srq_qp.sd_credits > 0; i++) {
/* dequeue resources due to global flow control */
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
frag_item = opal_list_remove_first(&openib_btl->pending_frags[prio]);
frag_item =
opal_list_remove_first(&openib_btl->qps[qp].u.srq_qp.pending_frags);
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
if(NULL == (frag = (mca_btl_openib_frag_t *) frag_item))
break;
if(mca_btl_openib_endpoint_send(frag->endpoint, frag) ==
OMPI_ERR_OUT_OF_RESOURCE)
OMPI_ERR_OUT_OF_RESOURCE)
break;
}
}
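/* The three helpers above drain, respectively: fragments queued on an
* endpoint QP for lack of send WQEs or per-peer credits, put/get
* descriptors waiting for WQEs or get tokens, and SRQ fragments waiting
* on the BTL-wide SRQ send-credit limit. */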
#if OMPI_HAVE_POSIX_THREADS
void* btl_openib_async_thread(void *one_hca)
{
struct ibv_async_event event;
struct mca_btl_openib_hca_t *hca = (struct mca_btl_openib_hca_t *)one_hca;
/* This thread runs in a cancel-enabled state */
pthread_setcancelstate(PTHREAD_CANCEL_ENABLE,NULL);
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS,NULL);
while (1) {
if (ibv_get_async_event((struct ibv_context *)hca->ib_dev_context,
&event)) {
BTL_ERROR(("Failed to get async event"));
}
switch(event.event_type) {
/* Fatal */
case IBV_EVENT_CQ_ERR:
case IBV_EVENT_QP_FATAL:
case IBV_EVENT_QP_REQ_ERR:
case IBV_EVENT_QP_ACCESS_ERR:
case IBV_EVENT_PATH_MIG:
case IBV_EVENT_PATH_MIG_ERR:
case IBV_EVENT_DEVICE_FATAL:
case IBV_EVENT_SRQ_ERR:
BTL_ERROR(( "Openib got FATAL event %d",event.event_type));
/* Set the flag to fatal */
hca->got_fatal_event=true;
/* It is not critical to protect the counter */
OPAL_THREAD_ADD32(&mca_btl_openib_component.fatal_counter, 1);
break;
case IBV_EVENT_PORT_ERR:
BTL_ERROR(( "Openib got port ERROR event %d",event.event_type));
break;
case IBV_EVENT_COMM_EST:
case IBV_EVENT_PORT_ACTIVE:
case IBV_EVENT_SQ_DRAINED:
case IBV_EVENT_LID_CHANGE:
case IBV_EVENT_PKEY_CHANGE:
case IBV_EVENT_SM_CHANGE:
case IBV_EVENT_QP_LAST_WQE_REACHED:
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
case IBV_EVENT_CLIENT_REREGISTER:
#endif
break;
case IBV_EVENT_SRQ_LIMIT_REACHED:
BTL_ERROR(("Got SRQ limit event %d",event.event_type));
break;
default:
BTL_ERROR(("Got unknown event %d. Continuing...",event.event_type));
}
ibv_ack_async_event(&event);
}
return PTHREAD_CANCELED;
}
#endif
#if OMPI_ENABLE_PROGRESS_THREADS == 1
void* mca_btl_openib_progress_thread(opal_object_t* arg)
@ -1376,8 +1431,9 @@ static int btl_openib_component_progress(void)
mca_btl_openib_frag_t* frag;
mca_btl_openib_endpoint_t* endpoint;
#if OMPI_HAVE_POSIX_THREADS
if(mca_btl_openib_component.fatal_counter) {
#if OMPI_HAVE_THREADS
if(mca_btl_openib_component.use_async_event_thread &&
mca_btl_openib_component.fatal_counter) {
goto error;
}
#endif
@ -1425,7 +1481,7 @@ static int btl_openib_component_progress(void)
ret = btl_openib_handle_incoming(openib_btl,
frag->endpoint, frag,
size - sizeof(mca_btl_openib_footer_t),
BTL_OPENIB_HP_QP);
frag->qp_idx);
if (ret != MPI_SUCCESS) {
openib_btl->error_cb(&openib_btl->super,
MCA_BTL_ERROR_FLAGS_FATAL);
@ -1444,7 +1500,7 @@ static int btl_openib_component_progress(void)
return count;
#if OMPI_HAVE_POSIX_THREADS
#if OMPI_HAVE_THREADS
error:
/* Set the fatal counter to zero */
mca_btl_openib_component.fatal_counter = 0;
@ -1461,96 +1517,82 @@ error:
static int btl_openib_module_progress(mca_btl_openib_module_t* openib_btl)
{
static char *qp_name[] = {"HP", "LP"};
int qp;
static char *cq_name[] = {"HP CQ", "LP CQ"};
int cq, qp;
int count = 0,ne = 0, ret;
mca_btl_openib_frag_t* frag;
mca_btl_openib_endpoint_t* endpoint;
struct ibv_wc wc;
for(qp = 0; qp < 2; qp++) {
ne = ibv_poll_cq(openib_btl->ib_cq[qp], 1, &wc);
for(cq = 0; cq < 2; cq++) {
if(0 == openib_btl->cq_users[cq])
continue;
ne = ibv_poll_cq(openib_btl->ib_cq[cq], 1, &wc);
if(0 == ne)
continue;
if(ne < 0 || wc.status != IBV_WC_SUCCESS)
goto error;
frag = (mca_btl_openib_frag_t*) (unsigned long) wc.wr_id;
frag = (mca_btl_openib_frag_t*) (unsigned long) wc.wr_id;
qp = frag->base.order;
endpoint = frag->endpoint;
/* Handle work completions */
switch(wc.opcode) {
case IBV_WC_RDMA_READ:
assert(BTL_OPENIB_LP_QP == qp);
OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
/* fall through */
case IBV_WC_RDMA_WRITE:
if(BTL_OPENIB_LP_QP == qp) {
/* process a completed write */
frag->base.des_cbfunc(&openib_btl->super, endpoint,
&frag->base, OMPI_SUCCESS);
/* return send wqe */
OPAL_THREAD_ADD32(&endpoint->sd_wqe[qp], 1);
/* check for pending frags */
btl_openib_frag_progress_pending(openib_btl, endpoint, qp);
count++;
break;
}
/* fall through for high prio QP */
case IBV_WC_SEND:
/* Process a completed send */
/* Process a completed send/put/get */
frag->base.des_cbfunc(&openib_btl->super, endpoint, &frag->base,
OMPI_SUCCESS);
/* return send wqe */
OPAL_THREAD_ADD32(&endpoint->sd_wqe[qp], 1);
if(mca_btl_openib_component.use_srq)
OPAL_THREAD_ADD32(&openib_btl->sd_credits[qp], 1);
/* check to see if we need to progress any pending descriptors */
btl_openib_frag_progress_pending(openib_btl, endpoint, qp);
OPAL_THREAD_ADD32(&endpoint->qps[qp].sd_wqe, 1);
/* check to see if we need to return credits */
if(btl_openib_check_send_credits(endpoint, qp)) {
mca_btl_openib_endpoint_send_credits(endpoint, qp);
if(IBV_WC_SEND == wc.opcode &&
MCA_BTL_OPENIB_SRQ_QP == endpoint->qps[qp].qp_type) {
OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
/* new SRQ credit available. Try to progress pending frags*/
btl_openib_frag_progress_pending_srq(openib_btl, endpoint, qp);
}
/* new wqe or/and get token available. Try to progress pending frags */
btl_openib_frag_progress_pending_pp(endpoint, qp);
btl_openib_frag_progress_pending_put_get(openib_btl, endpoint, qp);
count++;
break;
case IBV_WC_RECV:
case IBV_WC_RECV:
if(wc.wc_flags & IBV_WC_WITH_IMM) {
endpoint = (mca_btl_openib_endpoint_t*)
orte_pointer_array_get_item(openib_btl->endpoints,
wc.imm_data);
orte_pointer_array_get_item(openib_btl->endpoints, wc.imm_data);
frag->endpoint = endpoint;
}
/* Process a RECV */
ret = btl_openib_handle_incoming(openib_btl, endpoint, frag,
wc.byte_len, qp);
ret = btl_openib_handle_incoming(openib_btl, endpoint, frag, wc.byte_len, qp);
if (ret != OMPI_SUCCESS) {
openib_btl->error_cb(&openib_btl->super,
MCA_BTL_ERROR_FLAGS_FATAL);
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
return 0;
}
count++;
opal_output(mca_btl_base_output, "completed a recv\n");
break;
default:
BTL_ERROR(("Unhandled work completion opcode is %d",
wc.opcode));
openib_btl->error_cb(&openib_btl->super,
MCA_BTL_ERROR_FLAGS_FATAL);
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
break;
}
}
return count;
error:
if(ne < 0){
BTL_ERROR(("error polling %s CQ with %d errno says %s\n",
qp_name[qp], ne, strerror(errno)));
abort();
BTL_ERROR(("error polling %s with %d errno says %s\n",
cq_name[cq], ne, strerror(errno)));
} else {
static int flush_err_printed[] = {0, 0};
ompi_proc_t* remote_proc = NULL;
@ -1563,12 +1605,14 @@ error:
remote_proc = endpoint->endpoint_proc->proc_ompi;
}
}
if(wc.status != IBV_WC_WR_FLUSH_ERR || !flush_err_printed[qp]++)
BTL_PEER_ERROR(remote_proc, ("error polling %s CQ with status %s "
if(wc.status != IBV_WC_WR_FLUSH_ERR || !flush_err_printed[cq]++) {
BTL_PEER_ERROR(remote_proc, ("error polling %s with status %s "
"status number %d for wr_id %llu opcode %d",
qp_name[qp],
cq_name[cq],
btl_openib_component_status_to_string(wc.status),
wc.status, wc.wr_id, wc.opcode));
abort();
}
if(wc.status == IBV_WC_RETRY_EXC_ERR) {
opal_show_help("help-mpi-btl-openib.txt",
"btl_openib:retry-exceeded", true);


@ -18,16 +18,18 @@ extern "C" {
#endif
struct mca_btl_openib_eager_rdma_local_t {
ompi_ptr_t base; /**< buffer for RDMAing eager messages */
mca_btl_openib_recv_frag_eager_t *frags;
mca_btl_openib_reg_t *reg;
uint16_t head; /**< RDMA buffer to poll */
ompi_ptr_t base; /**< buffer for RDMAing eager messages */
mca_btl_openib_recv_frag_t *frags;
mca_btl_openib_reg_t *reg;
uint16_t head; /**< RDMA buffer to poll */
uint16_t tail; /**< Needed for credit management */
int32_t credits; /**< number of RDMA credits */
int32_t credits; /**< number of RDMA credits */
int32_t rd_win;
#if OMPI_ENABLE_DEBUG
uint32_t seq;
#endif
opal_mutex_t lock; /**< guard access to RDMA buffer */
opal_mutex_t lock; /**< guard access to RDMA buffer */
int32_t rd_low;
};
typedef struct mca_btl_openib_eager_rdma_local_t mca_btl_openib_eager_rdma_local_t;

Diff not shown because of its large size.


@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -47,10 +48,10 @@ typedef enum {
/* Waiting for ack from endpoint */
MCA_BTL_IB_CONNECT_ACK,
/*Waiting for final connection ACK from endpoint */
MCA_BTL_IB_WAITING_ACK,
/*Waiting for final connection ACK from endpoint */
MCA_BTL_IB_WAITING_ACK,
/* Connected ... both sender & receiver have
* buffers associated with this connection */
MCA_BTL_IB_CONNECTED,
@ -64,30 +65,69 @@ typedef enum {
MCA_BTL_IB_FAILED
} mca_btl_openib_endpoint_state_t;
struct mca_btl_openib_rem_info_t {
uint32_t rem_qp_num[2];
/* Remote QP number (Low and High priority) */
struct mca_btl_openib_rem_qp_info_t {
uint32_t rem_qp_num;
/* Remote QP number */
uint32_t rem_psn;
/* Remote processes port sequence number */
}; typedef struct mca_btl_openib_rem_qp_info_t mca_btl_openib_rem_qp_info_t;
struct mca_btl_openib_rem_info_t {
uint16_t rem_lid;
/* Local identifier of the remote process */
uint32_t rem_psn[2];
/* Remote processes port sequence number (Low and High) */
uint64_t rem_subnet_id;
/* subnet id of remote process */
/* MTU of remote process */
uint32_t rem_mtu;
/* index of remote endpoint in endpoint array */
/* MTU of remote process */
uint32_t rem_index;
};
typedef struct mca_btl_openib_rem_info_t mca_btl_openib_rem_info_t;
/* index of remote endpoint in endpoint array */
mca_btl_openib_rem_qp_info_t *rem_qps;
}; typedef struct mca_btl_openib_rem_info_t mca_btl_openib_rem_info_t;
/**
* Aggregates all per-peer QP info for an endpoint
*/
struct mca_btl_openib_endpoint_pp_qp_t {
int32_t sd_credits; /**< this rank's view of the credits
* available for sending:
* this is the credits granted by the
* remote peer which has some relation to the
* number of receive buffers posted remotely
*/
int32_t rd_posted; /**< number of descriptors posted to the nic*/
int32_t rd_credits; /**< number of credits to return to peer */
}; typedef struct mca_btl_openib_endpoint_pp_qp_t mca_btl_openib_endpoint_pp_qp_t;
/**
* Aggregates all srq qp info for an endpoint
*/
struct mca_btl_openib_endpoint_srq_qp_t {
int32_t dummy;
}; typedef struct mca_btl_openib_endpoint_srq_qp_t mca_btl_openib_endpoint_srq_qp_t;
struct mca_btl_openib_endpoint_qp_t {
struct ibv_qp* lcl_qp; /* Local QP */
struct ibv_qp_attr* lcl_qp_attr;
/* Local QP attributes */
uint32_t lcl_psn;
int32_t sd_wqe; /**< number of available send wqe entries */
int qp_type;
opal_list_t pending_frags; /**< put fragments here if there
is no wqe available or, in
case of PP QP, if there is
no credit available */
int32_t rd_pending_credit_chks; /**< number of outstanding return credit requests */
struct mca_btl_openib_frag_t *credit_frag;
union {
mca_btl_openib_endpoint_srq_qp_t srq_qp;
mca_btl_openib_endpoint_pp_qp_t pp_qp;
} u;
}; typedef struct mca_btl_openib_endpoint_qp_t mca_btl_openib_endpoint_qp_t;
/**
* An abstraction that represents a connection to a endpoint process.
@ -117,35 +157,24 @@ struct mca_btl_base_endpoint_t {
opal_mutex_t endpoint_lock;
/**< lock for concurrent access to endpoint state */
opal_list_t pending_send_frags;
/**< list of pending send frags for this endpoint */
opal_list_t pending_lazy_frags;
/**< list of pending frags due to lazy connection establishment
* for this endpoint
*/
opal_list_t pending_frags[2]; /**< list of pending frags */
mca_btl_openib_endpoint_qp_t * qps;
opal_list_t pending_get_frags; /**< list of pending rget ops */
opal_list_t pending_put_frags; /**< list of pending rput ops */
mca_btl_openib_rem_info_t rem_info;
uint32_t lcl_psn[2];
/* Local processes port sequence number (Low and High) */
struct ibv_qp* lcl_qp[2]; /* Local QP (Low and High) */
struct ibv_qp_attr* lcl_qp_attr[2];
/* Local QP attributes (Low and High) */
int32_t sd_credits[2]; /**< this rank's view of the credits
* available for sending:
* this is the credits granted by the
* remote peer which has some relation to the
* number of receive buffers posted remotely
*/
int32_t get_tokens; /**< number of available get tokens */
int32_t rd_posted[2]; /**< number of descriptors posted to the nic*/
int32_t rd_credits[2]; /**< number of credits to return to peer */
int32_t rd_pending_credit_chks[2]; /**< number of outstanding return credit requests */
int32_t sd_wqe[2]; /**< number of available send wqe entries */
uint64_t subnet_id; /**< subnet id of this endpoint*/
@ -155,10 +184,12 @@ struct mca_btl_base_endpoint_t {
mca_btl_openib_eager_rdma_local_t eager_rdma_local;
/**< info about local RDMA buffer */
uint32_t index; /**< index of the endpoint in endpoints array */
struct mca_btl_openib_frag_t *credit_frag[2];
/**< frags for sending explicit high priority credits */
bool nbo; /**< does the endpoint require network byte ordering? */
bool use_eager_rdma; /**< use eager rdma for this peer? */
mca_btl_openib_rem_info_t rem_info;
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
@ -173,22 +204,31 @@ void mca_btl_openib_post_recv(void);
void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int);
void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*);
static inline int btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpoint,
const int additional,
const int prio)
static inline int mca_btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpoint,
const int additional,
const int qp)
{
mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
int rd_num =
mca_btl_openib_component.qp_infos[qp].rd_num +
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
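/* Note: rd_num here includes rd_rsv -- presumably the receive descriptors
* reserved for explicit credit/control messages, which is why rd_rsv is
* subtracted from rd_posted in the threshold check below. */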
assert(MCA_BTL_OPENIB_PP_QP == endpoint->qps[qp].qp_type);
OPAL_THREAD_LOCK(&openib_btl->ib_lock);
if(endpoint->rd_posted[prio] <=
mca_btl_openib_component.rd_low + additional &&
endpoint->rd_posted[prio] < openib_btl->rd_num) {
if((endpoint->qps[qp].u.pp_qp.rd_posted - mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv) <=
mca_btl_openib_component.qp_infos[qp].rd_low + additional &&
endpoint->qps[qp].u.pp_qp.rd_posted <
rd_num) {
int rc;
int32_t i, num_post = openib_btl->rd_num - endpoint->rd_posted[prio];
int32_t i, num_post = rd_num - endpoint->qps[qp].u.pp_qp.rd_posted;
struct ibv_recv_wr* bad_wr;
ompi_free_list_t *free_list;
assert(num_post >= 0);
free_list = &openib_btl->recv_free[prio];
free_list = &openib_btl->qps[qp].recv_free;
for(i = 0; i < num_post; i++) {
ompi_free_list_item_t* item;
@ -196,35 +236,68 @@ static inline int btl_openib_endpoint_post_rr(mca_btl_base_endpoint_t *endpoint,
OMPI_FREE_LIST_WAIT(free_list, item, rc);
frag = (mca_btl_openib_frag_t*)item;
frag->endpoint = endpoint;
if(ibv_post_recv(endpoint->lcl_qp[prio], &frag->wr_desc.rd_desc,
&bad_wr)) {
frag->base.order = qp;
if(ibv_post_recv(endpoint->qps[qp].lcl_qp,
&frag->wr_desc.rd_desc,
&bad_wr)) {
BTL_ERROR(("error posting receive errno says %s\n",
strerror(errno)));
strerror(errno)));
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_ERROR;
}
}
OPAL_THREAD_ADD32(&endpoint->rd_posted[prio], num_post);
OPAL_THREAD_ADD32(&endpoint->rd_credits[prio], num_post);
}
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_SUCCESS;
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_posted, num_post);
OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, num_post);
assert(endpoint->qps[qp].u.pp_qp.rd_credits < rd_num);
assert(endpoint->qps[qp].u.pp_qp.rd_credits >= 0);
opal_output(mca_btl_base_output, "posting %d on qp %d \n", num_post, qp);
}
opal_output(mca_btl_base_output, "not posting on qp %d: rd_posted %d, rd_low %d, additional %d, rd_num %d rd_credits %d\n"
, qp, endpoint->qps[qp].u.pp_qp.rd_posted, mca_btl_openib_component.qp_infos[qp].rd_low , additional, rd_num,
endpoint->qps[qp].u.pp_qp.rd_credits);
OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
return OMPI_SUCCESS;
}
static inline int mca_btl_openib_endpoint_post_rr_all(mca_btl_base_endpoint_t *endpoint,
const int additional)
{
int qp;
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++){
if(MCA_BTL_OPENIB_PP_QP == mca_btl_openib_component.qp_infos[qp].type) {
mca_btl_openib_endpoint_post_rr(endpoint, additional, qp);
}
}
return OMPI_SUCCESS;
}
static inline int btl_openib_check_send_credits(
mca_btl_openib_endpoint_t *endpoint, const int prio)
mca_btl_openib_endpoint_t *endpoint, const int qp)
{
if(!mca_btl_openib_component.use_srq &&
endpoint->rd_credits[prio] >= mca_btl_openib_component.rd_win)
return OPAL_THREAD_ADD32(&endpoint->rd_pending_credit_chks[prio], 1) == 1;
opal_output(mca_btl_base_output, "check_send_credits says rd_credits is %d, rd_win is %d qp: %d endpoint %p\n",
endpoint->qps[qp].u.pp_qp.rd_credits,
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win, qp, (void*) endpoint);
if(BTL_OPENIB_LP_QP == prio) /* nothing more for low prio QP */
/* GMS, this is busted for high prio check eager RDMA credits */
if(BTL_OPENIB_EAGER_RDMA_QP(qp)) {
if(endpoint->eager_rdma_local.credits >= endpoint->eager_rdma_local.rd_win) {
opal_output(mca_btl_base_output, "check_send_credits says sending RDMA credits qp: %d\n", qp);
return OPAL_THREAD_ADD32(&endpoint->qps[qp].rd_pending_credit_chks, 1) == 1;
}
}
if(MCA_BTL_OPENIB_PP_QP != mca_btl_openib_component.qp_infos[qp].type)
return 0;
/* for high prio check eager RDMA credits */
if(endpoint->eager_rdma_local.credits >= mca_btl_openib_component.rd_win)
return OPAL_THREAD_ADD32(&endpoint->rd_pending_credit_chks[prio], 1) == 1;
if(endpoint->qps[qp].u.pp_qp.rd_credits >=
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win) {
opal_output(mca_btl_base_output, "we need to try and send credits rd_pending_credit_chks says %d qp: %d\n",
endpoint->qps[qp].rd_pending_credit_chks, qp);
return OPAL_THREAD_ADD32(&endpoint->qps[qp].rd_pending_credit_chks, 1) == 1;
}
return 0;
}


@ -9,6 +9,8 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -19,37 +21,49 @@
#include "btl_openib_frag.h"
#include "btl_openib_eager_rdma.h"
static void mca_btl_openib_frag_common_constructor( mca_btl_openib_frag_t* frag)
{
void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx) {
mca_btl_openib_frag_init_data_t* init_data =
(mca_btl_openib_frag_init_data_t*) ctx;
mca_btl_openib_frag_t* frag = (mca_btl_openib_frag_t*) item;
mca_btl_openib_reg_t* registration =
(mca_btl_openib_reg_t*)frag->base.super.registration;
frag->size = init_data->length;
assert(init_data->order != 255);
frag->base.order = MCA_BTL_NO_ORDER;
frag->type = init_data->type;
frag->list = init_data->list;
frag->qp_idx = init_data->order;
frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr;
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
/* init the segment address to start after the btl header */
if(registration) {
frag->registration = registration;
frag->sg_entry.lkey = registration->mr->lkey;
frag->segment.seg_key.key32[0] = frag->sg_entry.lkey;
}
/* init the segment address to start after the btl header */
frag->segment.seg_len = frag->size;
frag->sg_entry.addr = (unsigned long) frag->hdr;
frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t);
frag->base.des_flags = 0;
return;
}
static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* frag)
{
mca_btl_openib_frag_common_constructor(frag);
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->wr_desc.sr_desc.wr_id = (unsigned long) frag;
frag->wr_desc.sr_desc.sg_list = &frag->sg_entry;
frag->wr_desc.sr_desc.num_sge = 1;
@ -60,8 +74,6 @@ static void mca_btl_openib_send_frag_common_constructor(mca_btl_openib_frag_t* f
static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* frag)
{
mca_btl_openib_frag_common_constructor(frag);
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_src = NULL;
@ -73,66 +85,35 @@ static void mca_btl_openib_recv_frag_common_constructor(mca_btl_openib_frag_t* f
frag->wr_desc.rd_desc.next = NULL;
}
static void mca_btl_openib_send_frag_eager_constructor(mca_btl_openib_frag_t* frag)
static void mca_btl_openib_recv_user_frag_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.order = BTL_OPENIB_HP_QP;
frag->size = mca_btl_openib_component.eager_limit;
frag->type = MCA_BTL_OPENIB_FRAG_EAGER;
mca_btl_openib_send_frag_common_constructor(frag);
}
static void mca_btl_openib_send_frag_max_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.order = BTL_OPENIB_LP_QP;
frag->size = mca_btl_openib_component.max_send_size;
frag->type = MCA_BTL_OPENIB_FRAG_MAX;
mca_btl_openib_send_frag_common_constructor(frag);
}
static void mca_btl_openib_recv_frag_max_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.order = BTL_OPENIB_LP_QP;
frag->size = mca_btl_openib_component.max_send_size;
frag->type = MCA_BTL_OPENIB_FRAG_MAX;
mca_btl_openib_recv_frag_common_constructor(frag);
}
static void mca_btl_openib_recv_frag_eager_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.order = BTL_OPENIB_HP_QP;
frag->size = mca_btl_openib_component.eager_limit;
frag->type = MCA_BTL_OPENIB_FRAG_EAGER;
mca_btl_openib_recv_frag_common_constructor(frag);
frag->ftr = (mca_btl_openib_footer_t*)((char*)frag->segment.seg_addr.pval
+ frag->size);
MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
}
static void mca_btl_openib_send_frag_frag_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.order = BTL_OPENIB_LP_QP;
frag->size = 0;
frag->type = MCA_BTL_OPENIB_SEND_FRAG_FRAG;
frag->registration = NULL;
mca_btl_openib_send_frag_common_constructor(frag);
}
frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr;
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
/* init the segment address to start after the btl header */
frag->segment.seg_len = frag->size;
frag->sg_entry.addr = (unsigned long) frag->hdr;
frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t);
frag->base.des_flags = 0;
static void mca_btl_openib_recv_frag_frag_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.order = BTL_OPENIB_LP_QP;
frag->size = 0;
frag->type = MCA_BTL_OPENIB_RECV_FRAG_FRAG;
frag->registration = NULL;
mca_btl_openib_recv_frag_common_constructor(frag);
}
static void mca_btl_openib_send_frag_control_constructor(mca_btl_openib_frag_t* frag)
static void mca_btl_openib_send_user_frag_constructor(mca_btl_openib_frag_t* frag)
{
frag->base.order = BTL_OPENIB_HP_QP;
frag->size = sizeof(mca_btl_openib_eager_rdma_header_t);
frag->type = MCA_BTL_OPENIB_FRAG_CONTROL;
frag->registration = NULL;
frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr;
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
/* init the segment address to start after the btl header */
frag->segment.seg_len = frag->size;
frag->sg_entry.addr = (unsigned long) frag->hdr;
frag->sg_entry.length = frag->size + sizeof(mca_btl_openib_header_t);
frag->base.des_flags = 0;
mca_btl_openib_send_frag_common_constructor(frag);
}
@ -143,46 +124,33 @@ OBJ_CLASS_INSTANCE(
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_frag_eager_t,
mca_btl_openib_send_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_send_frag_eager_constructor,
mca_btl_openib_send_frag_common_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_frag_max_t,
mca_btl_base_descriptor_t,
mca_btl_openib_send_frag_max_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_frag_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_send_frag_frag_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_recv_frag_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_recv_frag_frag_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_recv_frag_eager_t,
mca_btl_base_descriptor_t,
mca_btl_openib_recv_frag_eager_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_recv_frag_max_t,
mca_btl_base_descriptor_t,
mca_btl_openib_recv_frag_max_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_frag_control_t,
mca_btl_base_descriptor_t,
mca_btl_openib_send_frag_control_constructor,
mca_btl_openib_send_frag_common_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_send_user_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_send_user_frag_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_recv_user_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_recv_user_frag_constructor,
NULL);
OBJ_CLASS_INSTANCE(
mca_btl_openib_recv_frag_t,
mca_btl_base_descriptor_t,
mca_btl_openib_recv_frag_common_constructor,
NULL);


@ -147,10 +147,10 @@ do { \
} while (0)
enum mca_btl_openib_frag_type_t {
MCA_BTL_OPENIB_FRAG_EAGER,
MCA_BTL_OPENIB_FRAG_MAX,
MCA_BTL_OPENIB_SEND_FRAG_FRAG,
MCA_BTL_OPENIB_RECV_FRAG_FRAG,
MCA_BTL_OPENIB_FRAG_RECV,
MCA_BTL_OPENIB_FRAG_RECV_USER,
MCA_BTL_OPENIB_FRAG_SEND,
MCA_BTL_OPENIB_FRAG_SEND_USER,
MCA_BTL_OPENIB_FRAG_EAGER_RDMA,
MCA_BTL_OPENIB_FRAG_CONTROL
};
@ -174,33 +174,26 @@ struct mca_btl_openib_frag_t {
} wr_desc;
struct ibv_sge sg_entry;
struct mca_btl_openib_reg_t *registration;
ompi_free_list_t* list;
uint8_t qp_idx;
};
typedef struct mca_btl_openib_frag_t mca_btl_openib_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_eager_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_eager_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_max_t;
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_user_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_max_t);
OBJ_CLASS_DECLARATION(mca_btl_openib_send_user_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_frag_t;
typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_user_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_frag_t);
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_user_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_frag_frag_t;
typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_frag_eager_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_eager_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_recv_frag_max_t;
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_max_t);
OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t);
typedef struct mca_btl_openib_frag_t mca_btl_openib_send_frag_control_t;
@ -227,52 +220,44 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_control_t);
item, rc); \
frag = (mca_btl_openib_frag_t*)item; \
} while (0)
#define MCA_BTL_IB_FRAG_ALLOC_EAGER(btl, frag, rc) \
MCA_BTL_IB_FRAG_ALLOC(btl, frag, rc, BTL_OPENIB_HP_QP)
#define MCA_BTL_IB_FRAG_ALLOC_MAX(btl, frag, rc) \
MCA_BTL_IB_FRAG_ALLOC(btl, frag, rc, BTL_OPENIB_LP_QP)
#define MCA_BTL_IB_FRAG_ALLOC_SEND_FRAG(btl, frag, rc) \
{ \
#define MCA_BTL_IB_FRAG_ALLOC_BY_SIZE(btl, frag, _size, rc) \
do { \
int qp; \
ompi_free_list_item_t* item = NULL; \
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { \
if(mca_btl_openib_component.qp_infos[qp].size >= _size) { \
OMPI_FREE_LIST_GET( \
&((mca_btl_openib_module_t*) btl)->qps[qp].send_free, \
item, rc); \
if(item) \
break; \
} \
} \
frag = (mca_btl_openib_frag_t*) item; \
} while(0);
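/* The BY_SIZE allocator above relies on qp_infos[] being ordered by
* ascending buffer size (as required of btl_openib_receive_queues), so the
* fragment comes from the smallest QP with an available buffer that can
* hold _size bytes. */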
#define MCA_BTL_IB_FRAG_ALLOC_SEND_USER(btl, frag, rc) \
{ \
\
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_GET(&((mca_btl_openib_module_t*)btl)->send_free_frag, item, rc); \
frag = (mca_btl_openib_frag_t*) item; \
}
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_GET(&((mca_btl_openib_module_t*)btl)->send_user_free, item, rc); \
frag = (mca_btl_openib_frag_t*) item; \
}
#define MCA_BTL_IB_FRAG_ALLOC_RECV_FRAG(btl, frag, rc) \
{ \
\
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_GET(&((mca_btl_openib_module_t*)btl)->recv_free_frag, item, rc); \
frag = (mca_btl_openib_frag_t*) item; \
}
#define MCA_BTL_IB_FRAG_ALLOC_RECV_USER(btl, frag, rc) \
{ \
\
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_GET(&((mca_btl_openib_module_t*)btl)->recv_user_free, item, rc); \
frag = (mca_btl_openib_frag_t*) item; \
}
#define MCA_BTL_IB_FRAG_RETURN(btl, frag) \
{ do { \
ompi_free_list_t* my_list = NULL; \
switch(frag->type) { \
case MCA_BTL_OPENIB_FRAG_EAGER_RDMA: \
case MCA_BTL_OPENIB_FRAG_EAGER: \
my_list = &btl->send_free[BTL_OPENIB_HP_QP]; \
break; \
case MCA_BTL_OPENIB_FRAG_MAX: \
my_list = &btl->send_free[BTL_OPENIB_LP_QP]; \
break; \
case MCA_BTL_OPENIB_FRAG_CONTROL: \
my_list = &btl->send_free_control; \
break; \
case MCA_BTL_OPENIB_RECV_FRAG_FRAG: \
my_list = &btl->recv_free_frag; \
break; \
case MCA_BTL_OPENIB_SEND_FRAG_FRAG: \
my_list = &btl->send_free_frag; \
break; \
} \
OMPI_FREE_LIST_RETURN(my_list, (ompi_free_list_item_t*)(frag)); \
} while(0); \
{ do { \
OMPI_FREE_LIST_RETURN(frag->list, \
(ompi_free_list_item_t*)(frag)); \
} while(0); \
}
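/* A single return path suffices because every fragment records its owning
* free list in frag->list when it is initialized (see
* mca_btl_openib_frag_init()). */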
#define MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(btl,list) \
@ -284,6 +269,17 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_send_frag_control_t);
struct mca_btl_openib_module_t;
struct mca_btl_openib_frag_init_data_t {
uint8_t order;
size_t length;
mca_btl_openib_frag_type_t type;
ompi_free_list_t* list;
};
typedef struct mca_btl_openib_frag_init_data_t mca_btl_openib_frag_init_data_t;
void mca_btl_openib_frag_init(ompi_free_list_item_t* item, void* ctx);
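/* mca_btl_openib_frag_init() is the per-item initializer passed to
* ompi_free_list_init_ex(); ctx is a mca_btl_openib_frag_init_data_t giving
* the QP index (order), payload length, fragment type, and owning free list
* for each newly allocated fragment. */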
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif


@ -11,7 +11,6 @@
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -25,6 +24,7 @@
#include "opal/mca/installdirs/installdirs.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"
#include "opal/mca/base/mca_base_param.h"
#include "btl_openib.h"
@ -49,6 +49,9 @@ enum {
REGSTR_MAX = 0x88
};
static int mca_btl_openib_mca_setup_qps(void);
/*
* utility routine for string parameter registration
*/
@ -207,7 +210,8 @@ int btl_openib_register_mca_params(void)
"queue (will automatically be set to a minimum of "
"(2 * number_of_peers * btl_openib_rd_num))",
1000, &ival, REGINT_GE_ONE));
mca_btl_openib_component.ib_cq_size = (uint32_t) ival;
mca_btl_openib_component.ib_lp_cq_size =
mca_btl_openib_component.ib_hp_cq_size = (uint32_t) ival;
CHECK(reg_int("ib_sg_list_size", "Size of IB segment list "
"(must be >= 1)",
@ -223,9 +227,12 @@ int btl_openib_register_mca_params(void)
"(must be > 0 and < 0xffff)",
0, &ival, REGINT_GE_ZERO));
if (ival > 0xffff) {
ret = OMPI_ERR_BAD_PARAM;
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "invalid value for btl_openib_ib_pkey_val",
"btl_openib_ib_pkey_val ignored");
} else {
mca_btl_openib_component.ib_pkey_val = (uint32_t) ival;
}
CHECK(reg_int("ib_psn", "InfiniBand packet sequence starting number "
"(must be >= 0)",
@ -250,7 +257,9 @@ int btl_openib_register_mca_params(void)
CHECK(reg_int("ib_mtu", msg, IBV_MTU_1024, &ival, 0));
free(msg);
if (ival < IBV_MTU_1024 || ival > IBV_MTU_4096) {
ret = OMPI_ERR_BAD_PARAM;
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "invalid value for btl_openib_ib_mtu",
"btl_openib_ib_mtu reset to 1024");
mca_btl_openib_component.ib_mtu = IBV_MTU_1024;
} else {
mca_btl_openib_component.ib_mtu = (uint32_t) ival;
@ -259,25 +268,68 @@ int btl_openib_register_mca_params(void)
/* JMS Is this really in seconds? Is there a max? */
CHECK(reg_int("ib_min_rnr_timer", "InfiniBand minimum "
"\"receiver not ready\" timer, in seconds "
"(must be >= 0 and <= 32)",
5, &ival, REGINT_GE_ZERO));
"(must be >= 0 and <= 31)",
5, &ival, 0));
if (ival > 31) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "btl_openib_ib_min_rnr_timer > 31",
"btl_openib_ib_min_rnr_timer reset to 31");
ival = 31;
} else if (ival < 0){
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "btl_openib_ib_min_rnr_timer < 0",
"btl_openib_ib_min_rnr_timer reset to 0");
ival = 0;
}
mca_btl_openib_component.ib_min_rnr_timer = (uint32_t) ival;
CHECK(reg_int("ib_timeout", "InfiniBand transmit timeout, plugged into formula: 4.096 microseconds * (2^btl_openib_ib_timeout)"
"(must be >= 0 and <= 32)",
10, &ival, REGINT_GE_ZERO));
"(must be >= 0 and <= 31)",
10, &ival, 0));
if (ival > 31) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "btl_openib_ib_timeout > 31",
"btl_openib_ib_timeout reset to 31");
ival = 31;
} else if (ival < 0) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "btl_openib_ib_timeout < 0",
"btl_openib_ib_timeout reset to 0");
ival = 0;
}
mca_btl_openib_component.ib_timeout = (uint32_t) ival;
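/* Illustration (not in the original source): with the default
 * btl_openib_ib_timeout of 10 the transport timeout is roughly
 * 4.096 usec * 2^10 ~= 4.2 msec; the maximum value of 31 gives
 * about 4.096 usec * 2^31 ~= 2.4 hours. */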
/* JMS What is the difference between these two counts? */
CHECK(reg_int("ib_retry_count", "InfiniBand transmit retry count "
"(must be >= 0 and <= 7)",
7, &ival, REGINT_GE_ZERO));
7, &ival, 0));
if (ival > 7) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "btl_openib_ib_retry_count > 7",
"btl_openib_ib_retry_count reset to 7");
ival = 7;
} else if (ival < 0) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "btl_openib_ib_retry_count < 0",
"btl_openib_ib_retry_count reset to 0");
ival = 0;
}
mca_btl_openib_component.ib_retry_count = (uint32_t) ival;
CHECK(reg_int("ib_rnr_retry", "InfiniBand \"receiver not ready\" "
"retry count "
"(must be >= 0 and <= 7)",
7, &ival, REGINT_GE_ZERO));
7, &ival, 0));
if (ival > 7) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "btl_openib_ib_rnr_retry > 7",
"btl_openib_ib_rnr_retry reset to 7");
ival = 7;
} else if (ival < 0) {
opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
true, "btl_openib_ib_rnr_retry < 0",
"btl_openib_ib_rnr_retry reset to 0");
ival = 0;
}
mca_btl_openib_component.ib_rnr_retry = (uint32_t) ival;
CHECK(reg_int("ib_max_rdma_dst_ops", "InfiniBand maximum pending RDMA "
@ -298,53 +350,6 @@ int btl_openib_register_mca_params(void)
0, &ival, REGINT_GE_ZERO));
mca_btl_openib_component.ib_static_rate = (uint32_t) ival;
CHECK(reg_int("rd_num", "Number of receive descriptors to post to a "
"per-peer queue pair (must be >= 1)",
8, &ival, REGINT_GE_ONE));
mca_btl_openib_component.rd_num = (uint32_t) ival;
CHECK(reg_int("rd_low", "Low water mark before posting additional receive descriptors "
"(must be >= 1)",
6, &ival, REGINT_GE_ONE));
mca_btl_openib_component.rd_low = (uint32_t) ival;
/* JMS meaning what? */
CHECK(reg_int("rd_win",
"Window size at which generate explicit credit message "
"(must be >= 1)",
4, &ival, REGINT_GE_ONE));
mca_btl_openib_component.rd_win = (uint32_t) ival;
/* we only allow one outstanding ctrl message at a time */
mca_btl_openib_component.rd_rsv = 1;
CHECK(reg_int("use_srq",
"If nonzero, use the InfiniBand shared receive "
"queue (\"SRQ\")",
0, &ival, 0));
mca_btl_openib_component.use_srq = (0 != ival);
CHECK(reg_int("srq_rd_max", "Total number of receive descriptors "
"posted per SRQ. This value is only used if it is larger "
"than (rd_num + log2(num_MPI_processes) * srq_rd_per_peer), "
"and is only relevant if btl_openib_use_srq is "
"true (must be >= 1)",
1000, &ival, REGINT_GE_ONE));
mca_btl_openib_component.srq_rd_max = (uint32_t) ival;
CHECK(reg_int("srq_rd_per_peer",
"Number of receive descriptors posted per peer in the SRQ "
"(only relevant if btl_openib_use_srq is "
"true; must be >= 1)",
16, &ival, REGINT_GE_ONE));
mca_btl_openib_component.srq_rd_per_peer = ival;
CHECK(reg_int("srq_sd_max",
"Maximum number of send descriptors posted "
"(only relevant if btl_openib_use_srq is "
"true; must be >= 1)",
8, &ival, REGINT_GE_ONE));
mca_btl_openib_component.srq_sd_max = (uint32_t) ival;
CHECK(reg_int("use_eager_rdma", "Use RDMA for eager messages",
1, &ival, 0));
mca_btl_openib_component.use_eager_rdma = (uint32_t) (ival != 0);
@ -385,6 +390,13 @@ int btl_openib_register_mca_params(void)
0, &ival, REGINT_GE_ZERO));
mca_btl_openib_component.max_lmc = (uint32_t) ival;
#if OMPI_HAVE_THREADS
CHECK(reg_int("use_async_event_thread",
"If nonzero, use the thread that will handle InfiniBand asyncihronous events ",
1, &ival, 0));
mca_btl_openib_component.use_async_event_thread = (0 != ival);
#endif
CHECK(reg_int("buffer_alignment",
"Prefered communication buffer alignment, in bytes "
"(must be > 0 and power of two)",
@ -396,7 +408,7 @@ int btl_openib_register_mca_params(void)
} else {
mca_btl_openib_component.buffer_alignment = (uint32_t) ival;
}
/* Info only */
mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version,
@ -411,6 +423,7 @@ int btl_openib_register_mca_params(void)
NULL);
mca_btl_openib_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_DEFAULT;
mca_btl_openib_module.super.btl_eager_limit = 12 * 1024;
mca_btl_openib_module.super.btl_min_send_size = 32 * 1024;
mca_btl_openib_module.super.btl_max_send_size = 64 * 1024;
@ -421,10 +434,18 @@ int btl_openib_register_mca_params(void)
MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_NEED_CSUM;
mca_btl_openib_module.super.btl_bandwidth = 800;
mca_btl_openib_module.super.btl_latency = 10;
mca_btl_base_param_register(&mca_btl_openib_component.super.btl_version,
ret = mca_btl_base_param_register(
&mca_btl_openib_component.super.btl_version,
&mca_btl_openib_module.super);
CHECK(reg_string("if_include",
if(ret != OMPI_SUCCESS)
return ret;
/* setup all the qp stuff */
if((ret = mca_btl_openib_mca_setup_qps()) != OMPI_SUCCESS)
return ret;
CHECK(reg_string("if_include",
"Comma-delimited list of HCAs/ports to be used (e.g. \"mthca0,mthca1:2\"; empty value means to use all ports found). Mutually exclusive with btl_openib_if_exclude.",
NULL, &mca_btl_openib_component.if_include,
0));
@ -436,3 +457,140 @@ int btl_openib_register_mca_params(void)
return ret;
}
static int mca_btl_openib_mca_setup_qps(void) {
/* All the multi-qp stuff.. */
char *str;
char **queues, **params = NULL;
int num_pp_qps = 0, num_srq_qps = 0, qp = 0, ret = OMPI_ERROR;
/* char *default_qps = "P,128,32,16,20;P,256,16,8,14;P,4096,8,6,4;P,65536,8,6,4"; */
/* char *default_qps = "P,128,8,4;P,1024,8,4;P,4096,8,4;P,65536,8,2"; */
/* char *default_qps = "P,4096,16,4;P,65536,16,2"; */
/* char *default_qps = "P,128,16,4;S,1024,256,128,32;S,4096,256,128,32;S,65536,256,128,32"; */
char *default_qps = "P,128,16,4;S,1024,256,128,32;S,4096,256,128,32;S,65536,256,128,32";
uint32_t max_qp_size, max_size_needed;
reg_string("receive_queues",
"Colon-delimited, coma delimited list of receive queues: P,4096,8,6,4;P,32768,8,6,4",
default_qps, &str, 0);
queues = opal_argv_split(str, ';');
if(opal_argv_count(queues) == 0) {
opal_output(0, "At least one QP has to be specified in"
" btl_openib_receive_queues\n");
return OMPI_ERROR;
}
while(queues[qp] != NULL) {
if(strncmp("P,", queues[qp], 2) == 0) {
num_pp_qps++;
} else if(strncmp("S,", queues[qp], 2) == 0) {
num_srq_qps++;
} else {
opal_output(0, "Unknown QP type \"%s\" is specified in "
"btl_openib_receive_queues. Only 'S' - shared or "
"'P' - point-to-point are supported\n", queues[qp]);
goto error;
}
qp++;
}
mca_btl_openib_component.num_pp_qps = num_pp_qps;
mca_btl_openib_component.num_srq_qps = num_srq_qps;
mca_btl_openib_component.num_qps = num_pp_qps + num_srq_qps;
mca_btl_openib_component.qp_infos = (mca_btl_openib_qp_info_t*)
malloc(sizeof(mca_btl_openib_qp_info_t) *
mca_btl_openib_component.num_qps);
qp = 0;
while(queues[qp] != NULL) {
int rd_win, i = 0;
params = opal_argv_split(queues[qp], ',');
if(params[0][0] == 'P') {
if(opal_argv_count(params) != 4) {
opal_output(0, "Wrong QP specification (QP %d \"%s\"). "
"Point-to-point QP get 3 parameters\n", qp, queues[qp]);
goto error;
}
mca_btl_openib_component.qp_infos[qp].size = atoi(params[1]);
mca_btl_openib_component.qp_infos[qp].rd_num = atoi(params[2]);
mca_btl_openib_component.qp_infos[qp].rd_low = atoi(params[3]);
/* mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = atoi(params[4]); */
rd_win = (mca_btl_openib_component.qp_infos[qp].rd_low >> 1);
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win = rd_win > 0 ? rd_win : 1;
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv =
((mca_btl_openib_component.qp_infos[qp].rd_num << 1) - 1)/
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win;
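/* Illustration (not in the original source): for the default first QP
 * "P,128,16,4", rd_low = 4 gives rd_win = 4 >> 1 = 2, and
 * rd_rsv = (16 * 2 - 1) / 2 = 15 with integer division. */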
opal_output(mca_btl_base_output, "pp: rd_num is %d \t rd_low is %d \t rd_win %d \t rd_rsv %d \n",
mca_btl_openib_component.qp_infos[qp].rd_num,
mca_btl_openib_component.qp_infos[qp].rd_low,
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_win,
mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv
);
mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_PP_QP;
} else if(params[0][0] =='S') {
if(opal_argv_count(params) != 5) {
opal_output(0, "Wrong QP specification (QP %d \"%s\"). "
"Shared QP get 4 parameters\n", qp, queues[qp]);
goto error;
}
mca_btl_openib_component.qp_infos[qp].size = atoi(params[1]);
mca_btl_openib_component.qp_infos[qp].rd_num = atoi(params[2]);
mca_btl_openib_component.qp_infos[qp].rd_low = atoi(params[3]);
mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max = atoi(params[4]);
opal_output(mca_btl_base_output, "srq: rd_num is %d \t rd_low is %d\n",
mca_btl_openib_component.qp_infos[qp].rd_num,
mca_btl_openib_component.qp_infos[qp].rd_low);
mca_btl_openib_component.qp_infos[qp].type = MCA_BTL_OPENIB_SRQ_QP;
}
while(params[i] != NULL)
free(params[i++]);
free(params);
qp++;
}
params = NULL;
max_qp_size = mca_btl_openib_component.qp_infos[mca_btl_openib_component.num_qps - 1].size;
max_size_needed = (mca_btl_openib_module.super.btl_eager_limit >
mca_btl_openib_module.super.btl_max_send_size) ?
mca_btl_openib_module.super.btl_eager_limit :
mca_btl_openib_module.super.btl_max_send_size;
if(max_qp_size < max_size_needed) {
opal_output(0, "The biggest QP is not big enough. "
"%d bytes configured, but maximum send size may be %d\n",
max_qp_size, max_size_needed);
ret = OMPI_ERROR;
goto error;
} else if(max_qp_size > max_size_needed) {
opal_output(0, "The biggest QP size is bigger than maximum send size. "
"This is not optimal configuration as memory will be waisted.\n");
}
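/* Illustration (not in the original source): with the default
 * btl_openib_receive_queues the largest QP buffer is 65536 bytes, which
 * exactly matches btl_max_send_size (64 * 1024), so neither warning
 * above is triggered. */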
mca_btl_openib_component.rdma_qp = mca_btl_openib_component.num_qps - 1;
mca_btl_openib_component.eager_rdma_qp = 0;
ret = OMPI_SUCCESS;
error:
if(params) {
qp = 0;
while(params[qp] != NULL)
free(params[qp++]);
free(params);
}
if(queues) {
qp = 0;
while(queues[qp] != NULL)
free(queues[qp++]);
free(queues);
}
return ret;
}

View file

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -136,14 +137,14 @@ mca_btl_openib_proc_t* mca_btl_openib_proc_create(ompi_proc_t* ompi_proc)
if(OMPI_SUCCESS != rc) {
opal_output(0, "[%s:%d] ompi_modex_recv failed for peer [%ld,%ld,%ld]",
opal_output(mca_btl_base_output, "[%s:%d] mca_pml_base_modex_recv failed for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
return NULL;
}
if((size % sizeof(mca_btl_openib_port_info_t)) != 0) {
opal_output(0, "[%s:%d] invalid module address for peer [%ld,%ld,%ld]",
opal_output(mca_btl_base_output, "[%s:%d] invalid module address for peer [%ld,%ld,%ld]",
__FILE__,__LINE__,ORTE_NAME_ARGS(&ompi_proc->proc_name));
OBJ_RELEASE(module_proc);
return NULL;

View file

@ -11,6 +11,7 @@
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2007 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -183,10 +184,6 @@ Please see this FAQ entry for more details:
NOTE: You can turn off this warning by setting the MCA parameter
btl_openib_warn_default_gid_prefix to 0.
#
[wrong buffer alignment]
Wrong buffer alignment %d configured on host '%s'. Should be bigger
than zero and power of two. Use default %d instead.
#
[ibv_fork requested but not supported]
WARNING: fork() support was requested for the openib BTL, but it is
not supported on the host %s. Deactivating the openib BTL.
@ -200,6 +197,31 @@ Deactivating the openib BTL.
Wrong buffer alignment %d configured on host '%s'. Should be bigger
than zero and power of two. Use default %d instead.
#
[of error event]
The OpenFabrics stack has reported a network error event.
Open MPI will try to continue, but your job may end up failing.
Host: %s
MPI process PID: %d
Error number: %d
Error description: %s
This error may indicate connectivity problems within the fabric;
please contact your system administrator.
#
[of unknown event]
The OpenFabrics stack has reported an unknown network error event.
Open MPI will try to continue, but the job may end up failing.
Host: %s
MPI process PID: %d
Error number: %d
This error may indicate that you are using an OpenFabrics library
version that is not currently supported by Open MPI. You might try
recompiling Open MPI against your OpenFabrics library installation to
get more information.
#
[specified include and exclude]
ERROR: You have specified both the btl_openib_if_include and
btl_openib_if_exclude MCA parameters. These two parameters are
@ -209,6 +231,7 @@ For reference, the values that you specified are:
btl_openib_if_include: %s
btl_openib_if_exclude: %s
#
[nonexistent port]
WARNING: One or more nonexistent HCAs/ports were specified:
@ -218,3 +241,9 @@ WARNING: One or more nonexistent HCAs/ports were specified:
These entities will be ignored. You can disable this warning by
setting the btl_openib_warn_nonexistent_if MCA parameter to 0.
#
[invalid mca param value]
WARNING: the openib BTL detected an illegal MCA parameter value:
%s
%s

View file

@ -309,7 +309,9 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
mca_btl_udapl_component.udapl_free_list_num,
mca_btl_udapl_component.udapl_free_list_max,
mca_btl_udapl_component.udapl_free_list_inc,
btl->super.btl_mpool);
btl->super.btl_mpool,
NULL,
NULL);
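/* Note (not in the original source): the two trailing NULL arguments
 * appear to be the per-item initializer callback and its context that
 * this commit adds to ompi_free_list_init_ex(); the udapl BTL does not
 * use per-item initialization, so it passes NULL for both. */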
ompi_free_list_init_ex(&btl->udapl_frag_max,
sizeof(mca_btl_udapl_frag_max_t) +
@ -319,7 +321,9 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
mca_btl_udapl_component.udapl_free_list_num,
mca_btl_udapl_component.udapl_free_list_max,
mca_btl_udapl_component.udapl_free_list_inc,
btl->super.btl_mpool);
btl->super.btl_mpool,
NULL,
NULL);
ompi_free_list_init_ex(&btl->udapl_frag_user,
sizeof(mca_btl_udapl_frag_user_t),
@ -328,7 +332,9 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
mca_btl_udapl_component.udapl_free_list_num,
mca_btl_udapl_component.udapl_free_list_max,
mca_btl_udapl_component.udapl_free_list_inc,
NULL);
NULL,
NULL,
NULL);
ompi_free_list_init_ex(&btl->udapl_frag_control,
sizeof(mca_btl_udapl_frag_eager_t) +
@ -338,7 +344,9 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
mca_btl_udapl_component.udapl_free_list_num,
-1,
mca_btl_udapl_component.udapl_free_list_inc,
btl->super.btl_mpool);
btl->super.btl_mpool,
NULL,
NULL);
/* initialize eager rdma buffer info */
orte_pointer_array_init(&btl->udapl_eager_rdma_endpoints,

View file

@ -78,6 +78,22 @@ ompi_init_preconnect_mpi(void)
ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE);
if (OMPI_SUCCESS != ret) return ret;
ret = MCA_PML_CALL(isend(outbuf, 1, MPI_CHAR,
next, 1,
MCA_PML_BASE_SEND_COMPLETE,
MPI_COMM_WORLD,
&requests[1]));
if (OMPI_SUCCESS != ret) return ret;
ret = MCA_PML_CALL(irecv(inbuf, 1, MPI_CHAR,
prev, 1,
MPI_COMM_WORLD,
&requests[0]));
if(OMPI_SUCCESS != ret) return ret;
ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE);
if (OMPI_SUCCESS != ret) return ret;
}
return ret;

View file

@ -54,6 +54,8 @@
*
**********************************************************************/
/* Do we have posix or solaris thread lib */
#define OMPI_HAVE_THREADS (OMPI_HAVE_POSIX_THREADS || OMPI_HAVE_SOLARIS_THREADS)
/* Do we have thread support? */
#define OMPI_HAVE_THREAD_SUPPORT (OMPI_ENABLE_MPI_THREADS || OMPI_ENABLE_PROGRESS_THREADS)