1
1

add udapl rdma capabilities into the udapl btl

This commit was SVN r13082.
Этот коммит содержится в:
Donald Kerr 2007-01-11 15:22:08 +00:00
родитель e5205657cf
Коммит 80f2cbb498
11 изменённых файлов: 1930 добавлений и 434 удалений

Просмотреть файл

@ -9,6 +9,8 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -21,7 +23,10 @@
CFLAGS = $(btl_udapl_CFLAGS)
AM_CPPFLAGS = $(btl_udapl_CPPFLAGS)
AM_CPPFLAGS = $(btl_udapl_CPPFLAGS) -DPKGDATADIR=\"$(pkgdatadir)\"
dist_pkgdata_DATA = \
help-mpi-btl-udapl.txt
udapl_sources = \
btl_udapl.c \
@ -31,6 +36,7 @@ udapl_sources = \
btl_udapl_endpoint.h \
btl_udapl_frag.c \
btl_udapl_frag.h \
btl_udapl_eager_rdma.h \
btl_udapl_proc.c \
btl_udapl_proc.h

Просмотреть файл

@ -25,6 +25,7 @@
#include <string.h>
#include "opal/util/output.h"
#include "opal/util/if.h"
#include "opal/util/show_help.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
@ -36,6 +37,7 @@
#include "ompi/datatype/datatype.h"
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/proc/proc.h"
static int udapl_reg_mr(void *reg_data, void *base, size_t size,
@ -61,9 +63,9 @@ mca_btl_udapl_module_t mca_btl_udapl_module = {
mca_btl_udapl_alloc,
mca_btl_udapl_free,
mca_btl_udapl_prepare_src,
NULL, /* prepare_dst */
mca_btl_udapl_prepare_dst,
mca_btl_udapl_send,
NULL, /* put */
mca_btl_udapl_put,
NULL, /* get */
mca_btl_base_dump,
NULL, /* mpool */
@ -106,8 +108,13 @@ static int udapl_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg)
if(udapl_reg->lmr != NULL) {
rc = dat_lmr_free(udapl_reg->lmr);
if(rc != DAT_SUCCESS) {
opal_output(0, "%s: error unpinning dapl memory errno says %s\n",
__func__, strerror(errno));
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_lmr_free",
major, minor));
return OMPI_ERROR;
}
}
@ -132,14 +139,26 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
rc = dat_ia_open(ia_name, mca_btl_udapl_component.udapl_evd_qlen,
&btl->udapl_evd_async, &btl->udapl_ia);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ia_open");
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ia_open",
major, minor));
return OMPI_ERROR;
}
/* create a protection zone */
rc = dat_pz_create(btl->udapl_ia, &btl->udapl_pz);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_pz_create");
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_pz_create",
major, minor));
goto failure;
}
@ -148,7 +167,13 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
rc = dat_ia_query(btl->udapl_ia, &btl->udapl_evd_async,
DAT_IA_FIELD_IA_ADDRESS_PTR, &attr, 0, NULL);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ia_query");
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ia_query",
major, minor));
goto failure;
}
@ -159,7 +184,13 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
mca_btl_udapl_component.udapl_evd_qlen, DAT_HANDLE_NULL,
DAT_EVD_DTO_FLAG | DAT_EVD_RMR_BIND_FLAG, &btl->udapl_evd_dto);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_evd_create (dto)");
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_evd_create (dto)",
major, minor));
goto failure;
}
@ -167,16 +198,39 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
mca_btl_udapl_component.udapl_evd_qlen, DAT_HANDLE_NULL,
DAT_EVD_CR_FLAG | DAT_EVD_CONNECTION_FLAG, &btl->udapl_evd_conn);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_evd_create (conn)");
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_evd_create (conn)",
major, minor));
goto failure;
}
/* create our public service point */
rc = dat_psp_create_any(btl->udapl_ia, &port, btl->udapl_evd_conn,
DAT_PSP_CONSUMER_FLAG, &btl->udapl_psp);
DAT_PSP_CONSUMER_FLAG, &btl->udapl_psp);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_psp_create_any");
goto failure;
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_psp_create_any",
major, minor));
goto failure;
}
/* establish endpoint parameters */
rc = mca_btl_udapl_endpoint_get_params(btl, &(btl->udapl_ep_param));
if(OMPI_SUCCESS != rc) {
/* by not erroring out here we can try to continue with
* the default endpoint parameter values
*/
opal_show_help("help-mpi-btl-udapl.txt",
"use default endpoint params",
true);
}
/* Save the port with the address information */
@ -211,6 +265,7 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
OBJ_CONSTRUCT(&btl->udapl_frag_eager, ompi_free_list_t);
OBJ_CONSTRUCT(&btl->udapl_frag_max, ompi_free_list_t);
OBJ_CONSTRUCT(&btl->udapl_frag_user, ompi_free_list_t);
OBJ_CONSTRUCT(&btl->udapl_frag_control, ompi_free_list_t);
OBJ_CONSTRUCT(&btl->udapl_lock, opal_mutex_t);
/* initialize free lists */
@ -240,6 +295,23 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl)
mca_btl_udapl_component.udapl_free_list_inc,
NULL);
ompi_free_list_init(&btl->udapl_frag_control,
sizeof(mca_btl_udapl_frag_eager_t) +
mca_btl_udapl_component.udapl_eager_frag_size,
OBJ_CLASS(mca_btl_udapl_frag_eager_t),
mca_btl_udapl_component.udapl_free_list_num,
-1,
mca_btl_udapl_component.udapl_free_list_inc,
btl->super.btl_mpool);
/* initialize eager rdma buffer info */
orte_pointer_array_init(&btl->udapl_eager_rdma_endpoints,
mca_btl_udapl_component.udapl_max_eager_rdma_peers,
mca_btl_udapl_component.udapl_max_eager_rdma_peers,
0);
btl->udapl_eager_rdma_endpoint_count = 0;
OBJ_CONSTRUCT(&btl->udapl_eager_rdma_lock, opal_mutex_t);
/* TODO - Set up SRQ when it is supported */
return OMPI_SUCCESS;
@ -256,6 +328,20 @@ int mca_btl_udapl_finalize(struct mca_btl_base_module_t* base_btl)
{
mca_btl_udapl_module_t* udapl_btl = (mca_btl_udapl_module_t*) base_btl;
/*
* Cleaning up the endpoints here because mca_btl_udapl_del_procs
* is never called by upper layers.
* Note: this is only looking at those endpoints which are available
* off of the btl module rdma list.
*/
for (int i=0; i < udapl_btl->udapl_eager_rdma_endpoint_count; i++) {
mca_btl_udapl_endpoint_t* endpoint =
orte_pointer_array_get_item(udapl_btl->udapl_eager_rdma_endpoints,
i);
OBJ_DESTRUCT(endpoint);
}
/* release uDAPL resources */
dat_evd_free(udapl_btl->udapl_evd_dto);
dat_evd_free(udapl_btl->udapl_evd_conn);
@ -267,7 +353,9 @@ int mca_btl_udapl_finalize(struct mca_btl_base_module_t* base_btl)
OBJ_DESTRUCT(&udapl_btl->udapl_frag_eager);
OBJ_DESTRUCT(&udapl_btl->udapl_frag_max);
OBJ_DESTRUCT(&udapl_btl->udapl_frag_user);
OBJ_DESTRUCT(&udapl_btl->udapl_frag_control);
OBJ_DESTRUCT(&udapl_btl->udapl_eager_rdma_lock);
free(udapl_btl);
return OMPI_SUCCESS;
}
@ -377,29 +465,33 @@ mca_btl_base_descriptor_t* mca_btl_udapl_alloc(
mca_btl_udapl_module_t* udapl_btl = (mca_btl_udapl_module_t*) btl;
mca_btl_udapl_frag_t* frag;
int rc;
int pad = 0;
/* compute pad as needed */
MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad,
(size + sizeof(mca_btl_udapl_footer_t)));
if(size <= btl->btl_eager_limit) {
if((size + pad) <= btl->btl_eager_limit) {
MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(udapl_btl, frag, rc);
frag->segment.seg_len =
size <= btl->btl_eager_limit ?
size : btl->btl_eager_limit;
} else if(size <= btl->btl_max_send_size) {
MCA_BTL_UDAPL_FRAG_ALLOC_MAX(udapl_btl, frag, rc);
frag->segment.seg_len =
size <= btl->btl_max_send_size ?
size : btl->btl_max_send_size;
} else {
return NULL;
}
/* Set up the LMR triplet from the frag segment */
/* Note that this triplet defines a sub-region of a registered LMR */
frag->segment.seg_len = size;
/* Set up the LMR triplet from the frag segment.
* Note: The triplet.segment_len is set to what is required for
* actually sending the fragment, if later it is determined
* that rdma can be used to transfer the fragment the
* triplet.segment_len will have to change.
*/
frag->triplet.virtual_address = (DAT_VADDR)frag->segment.seg_addr.pval;
frag->ftr = (mca_btl_udapl_footer_t *)
((char *)frag->segment.seg_addr.pval + frag->segment.seg_len);
frag->triplet.segment_length =
frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t);
assert(frag->triplet.lmr_context == frag->registration->lmr_triplet.lmr_context);
assert(frag->triplet.lmr_context ==
frag->registration->lmr_triplet.lmr_context);
frag->btl = udapl_btl;
frag->base.des_src = &frag->segment;
@ -424,13 +516,13 @@ int mca_btl_udapl_free(
if(frag->size == 0 && frag->registration != NULL) {
btl->btl_mpool->mpool_deregister(btl->btl_mpool,
(mca_mpool_base_registration_t*)frag->registration);
(mca_mpool_base_registration_t*)frag->registration);
MCA_BTL_UDAPL_FRAG_RETURN_USER(btl, frag);
} else if(frag->size == mca_btl_udapl_component.udapl_eager_frag_size) {
MCA_BTL_UDAPL_FRAG_RETURN_EAGER(btl, frag);
} else if(frag->size == mca_btl_udapl_component.udapl_max_frag_size) {
MCA_BTL_UDAPL_FRAG_RETURN_MAX(btl, frag);
} else {
} else {
OPAL_OUTPUT((0, "[%s:%d] mca_btl_udapl_free: invalid descriptor\n", __FILE__,__LINE__));
return OMPI_ERR_BAD_PARAM;
}
@ -453,152 +545,99 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src(
size_t* size
)
{
mca_btl_udapl_frag_t* frag;
mca_btl_udapl_frag_t* frag = NULL;
struct iovec iov;
uint32_t iov_count = 1;
size_t max_data = *size;
int rc;
int pad = 0;
#if 0
/*
* If the data has already been pinned and is contiguous then we can
* use it in place.
*/
if (NULL != registration && 0 == ompi_convertor_need_buffers(convertor)) {
size_t reg_len;
OPAL_OUTPUT((0, "udapl_prepare_src 1\n"));
/* compute pad as needed */
MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad,
(max_data + reserve + sizeof(mca_btl_udapl_footer_t)));
MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc);
if(NULL == frag){
return NULL;
}
if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) {
if(registration != NULL || max_data > btl->btl_max_send_size) {
iov.iov_len = max_data;
iov.iov_base = NULL;
MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc);
if(NULL == frag){
return NULL;
}
ompi_convertor_pack(convertor, &iov,
iov.iov_len = max_data;
iov.iov_base = NULL;
ompi_convertor_pack(convertor, &iov,
&iov_count, &max_data );
frag->segment.seg_len = max_data;
frag->segment.seg_addr.pval = iov.iov_base;
frag->triplet.segment_length = max_data;
frag->triplet.virtual_address = (DAT_VADDR)iov.iov_base;
reg_len = (unsigned char*)registration->bound -
(unsigned char*)iov.iov_base + 1;
*size = max_data;
/* bump reference count as so that the registration
* doesn't go away when the operation completes
*/
btl->btl_mpool->mpool_retain(btl->btl_mpool, registration);
frag->registration = registration;
frag->triplet.lmr_context =
((mca_mpool_udapl_registration_t*)registration)->lmr_triplet.lmr_context;
if(NULL == registration) {
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, iov.iov_base,
max_data, 0,
&registration);
/*
* if the data is not already pinned - but the leave pinned option is set,
* then go ahead and pin contigous data. however, if a reserve is required
* then we must allocate a fragment w/ buffer space
*/
} else if (max_data > btl->btl_max_send_size &&
ompi_convertor_need_buffers(convertor) == 0 &&
reserve == 0) {
if(rc != OMPI_SUCCESS) {
MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag);
return NULL;
}
/* keep track of the registration we did */
frag->registration = (mca_btl_udapl_reg_t*)registration;
}
mca_mpool_base_module_t* mpool = btl->btl_mpool;
MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc);
if(NULL == frag){
return NULL;
}
OPAL_OUTPUT((0, "udapl_prepare_src 2\n"));
frag->segment.seg_len = max_data;
frag->segment.seg_addr.pval = iov.iov_base;
frag->triplet.segment_length = max_data;
frag->triplet.virtual_address = (DAT_VADDR)iov.iov_base;
frag->triplet.lmr_context =
((mca_btl_udapl_reg_t*)registration)->lmr_triplet.lmr_context;
iov.iov_len = max_data;
iov.iov_base = NULL;
ompi_convertor_pack(convertor, &iov,
&iov_count, &max_data );
rc = mpool->mpool_register(
mpool,
iov.iov_base,
max_data,
0,
&registration);
if(rc != OMPI_SUCCESS) {
MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag);
return NULL;
}
frag->registration = registration;
frag->triplet.lmr_context =
((mca_mpool_udapl_registration_t*)registration)->lmr_triplet.lmr_context;
/* TODO - should our base addr be frag->ftr? */
frag->segment.seg_len = max_data;
frag->segment.seg_addr.pval = iov.iov_base;
frag->triplet.segment_length = max_data;
frag->triplet.virtual_address = (DAT_VADDR)iov.iov_base;
}
/*
* if we aren't pinning the data and the requested size is less
* than the eager limit pack into a fragment from the eager pool
*/
else
#endif
if(max_data + reserve <= btl->btl_eager_limit) {
MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
iov.iov_len = max_data;
iov.iov_base = (char *) frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor,
&iov, &iov_count, &max_data );
*size = max_data;
if(rc < 0) {
MCA_BTL_UDAPL_FRAG_RETURN_EAGER(btl, frag);
return NULL;
/* initialize base descriptor */
frag->base.des_src = &frag->segment;
frag->base.des_src_cnt = 1;
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0;
return &frag->base;
}
}
/*
* otherwise pack as much data as we can into a fragment
* that is the max send size.
*/
else {
if(max_data + pad + reserve <= btl->btl_eager_limit) {
/* the data is small enough to fit in the eager frag and
* memory is not prepinned */
MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(btl, frag, rc);
}
if(NULL == frag) {
/* the data doesn't fit into eager frag or eager frag is
* not available */
MCA_BTL_UDAPL_FRAG_ALLOC_MAX(btl, frag, rc);
if(NULL == frag) {
return NULL;
}
if(max_data + reserve > btl->btl_max_send_size){
if(max_data + reserve > btl->btl_max_send_size) {
max_data = btl->btl_max_send_size - reserve;
}
iov.iov_len = max_data;
iov.iov_base = (char *) frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor,
&iov, &iov_count, &max_data );
*size = max_data;
if(rc < 0) {
MCA_BTL_UDAPL_FRAG_RETURN_MAX(btl, frag);
return NULL;
}
}
iov.iov_len = max_data;
iov.iov_base = (char *) frag->segment.seg_addr.pval + reserve;
rc = ompi_convertor_pack(convertor,
&iov, &iov_count, &max_data );
if(rc < 0) {
MCA_BTL_UDAPL_FRAG_RETURN_MAX(btl, frag);
return NULL;
}
*size = max_data;
/* setup lengths and addresses to send out data */
frag->segment.seg_len = max_data + reserve;
frag->triplet.segment_length =
max_data + reserve + sizeof(mca_btl_udapl_footer_t);
max_data + reserve + sizeof(mca_btl_udapl_footer_t);
frag->triplet.virtual_address = (DAT_VADDR)frag->segment.seg_addr.pval;
frag->ftr = (mca_btl_udapl_footer_t *)
((char *)frag->segment.seg_addr.pval + frag->segment.seg_len);
/* initialize base descriptor */
frag->base.des_src = &frag->segment;
@ -606,13 +645,14 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src(
frag->base.des_dst = NULL;
frag->base.des_dst_cnt = 0;
frag->base.des_flags = 0;
return &frag->base;
}
/**
* Prepare a descriptor for send/rdma using the supplied
* convertor. If the convertor references data that is contigous,
* convertor. If the convertor references data that is contiguous,
* the descriptor may simply point to the user buffer. Otherwise,
* this routine is responsible for allocating buffer space and
* packing if required.
@ -623,7 +663,6 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src(
* @param reserve (IN) Additional bytes requested by upper layer to precede user data
* @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT)
*/
#if 0
mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst(
struct mca_btl_base_module_t* btl,
struct mca_btl_base_endpoint_t* endpoint,
@ -633,12 +672,9 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst(
size_t* size)
{
mca_btl_udapl_frag_t* frag;
mca_mpool_base_module_t* mpool = btl->btl_mpool;
ptrdiff_t lb;
int rc;
OPAL_OUTPUT((0, "udapl_prepare_dst\n"));
MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc);
if(NULL == frag) {
return NULL;
@ -648,46 +684,40 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst(
frag->segment.seg_len = *size;
frag->segment.seg_addr.pval = convertor->pBaseBuf + lb + convertor->bConverted;
if(NULL == registration) {
/* didn't get a memory registration passed in, so must
* register the region now
*/
rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
frag->segment.seg_addr.pval,
frag->segment.seg_len,
0,
&registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag);
return NULL;
}
frag->registration = (mca_btl_udapl_reg_t*)registration;
}
frag->base.des_src = NULL;
frag->base.des_src_cnt = 0;
frag->base.des_dst = &frag->segment;
frag->base.des_dst_cnt = 1;
frag->base.des_flags = 0;
if(NULL != registration) {
/* bump reference count as so that the registration
* doesn't go away when the operation completes
*/
mpool->mpool_retain(mpool,
(mca_mpool_base_registration_t*) registration);
frag->registration = registration;
} else {
rc = mpool->mpool_register(
mpool,
frag->segment.seg_addr.pval,
frag->segment.seg_len,
0,
&registration);
if(rc != OMPI_SUCCESS) {
MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag);
return NULL;
}
frag->registration = registration;
}
frag->segment.seg_key.key32[0] =
((mca_btl_udapl_reg_t*)registration)->rmr_context;
return &frag->base;
}
#endif
/**
* Initiate an asynchronous send.
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transfered
* @param descriptor (IN) Description of the data to be transferred
* @param tag (IN) The tag value used to notify the peer.
*/
@ -700,10 +730,9 @@ int mca_btl_udapl_send(
{
mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)des;
frag->btl = (mca_btl_udapl_module_t*)btl;
frag->endpoint = endpoint;
frag->ftr = (mca_btl_udapl_footer_t *)
((char *)frag->segment.seg_addr.pval + frag->segment.seg_len);
((char *)frag->segment.seg_addr.pval + frag->segment.seg_len);
frag->ftr->tag = tag;
frag->type = MCA_BTL_UDAPL_SEND;
@ -726,8 +755,57 @@ int mca_btl_udapl_put(
mca_btl_base_endpoint_t* endpoint,
mca_btl_base_descriptor_t* des)
{
OPAL_OUTPUT((0, "udapl_put\n"));
return OMPI_ERR_NOT_IMPLEMENTED;
DAT_RMR_TRIPLET remote_buffer;
DAT_DTO_COOKIE cookie;
int rc = OMPI_SUCCESS;
mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)des;
mca_btl_base_segment_t *src_segment = des->des_src;
mca_btl_base_segment_t *dst_segment = des->des_dst;
frag->btl = (mca_btl_udapl_module_t *)btl;
frag->endpoint = endpoint;
frag->type = MCA_BTL_UDAPL_PUT;
if(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], -1) < 0) {
OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], 1);
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
opal_list_append(&endpoint->endpoint_max_frags,
(opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
opal_progress();
} else {
frag->triplet.segment_length = frag->segment.seg_len;
remote_buffer.rmr_context =
(DAT_RMR_CONTEXT)dst_segment->seg_key.key32[0];
remote_buffer.target_address =
(DAT_VADDR)dst_segment->seg_addr.pval;
remote_buffer.segment_length = dst_segment->seg_len;
cookie.as_ptr = frag;
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
rc = dat_ep_post_rdma_write(endpoint->endpoint_max,
1,
&frag->triplet,
cookie,
&remote_buffer,
DAT_COMPLETION_DEFAULT_FLAG);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
if(DAT_SUCCESS != rc) {
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write",
major, minor));
rc = OMPI_ERROR;
}
}
return rc;
}

Просмотреть файл

@ -29,6 +29,7 @@
#include <dat/udat.h>
/* Open MPI includes */
#include "orte/class/orte_pointer_array.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/class/ompi_bitmap.h"
#include "opal/event/event.h"
@ -56,17 +57,45 @@ struct mca_btl_udapl_component_t {
size_t udapl_max_btls; /**< maximum number of supported hcas */
struct mca_btl_udapl_module_t **udapl_btls; /**< array of available BTL modules */
size_t udapl_evd_qlen;
size_t udapl_max_request_dtos; /**< maximum number of outstanding consumer
submitted sends and rdma operations, see
section 6.6.6 of uDAPL Spec */
size_t udapl_max_recv_dtos; /**< maximum number of outstanding consumer
submitted recv operations, see section 6.6.6
of uDAPL Spec */
int32_t udapl_num_recvs; /**< number of recv buffers to keep posted */
int32_t udapl_num_sends; /**< number of sends to post on endpoint */
int32_t udapl_sr_win; /**< number of fragments received before
returning credits to sender */
int32_t udapl_timeout; /**< connection timeout, in microseconds */
int32_t udapl_eager_rdma_guarantee;/**< uDAPL does not guarantee
the order of data written to
buffer, if the interface
card in use guarantees front
to back order of data
written then this flag
should remain as set by
default (off) otherwise
latency overhead will
increase if turned on */
size_t udapl_eager_frag_size;
size_t udapl_max_frag_size;
size_t udapl_eager_rdma_frag_size; /* size of the rdma fragment including data
* payload space
*/
int udapl_free_list_num; /**< initial size of free lists */
int udapl_free_list_max; /**< maximum size of free lists */
int udapl_free_list_inc; /**< number of elements to alloc when growing */
int32_t udapl_eager_rdma_num; /**< number of rdma buffers allocated
for short messages */
int32_t udapl_max_eager_rdma_peers; /**< maximum number of peers allowed to
use RDMA for short messages (cap)
*/
int32_t udapl_eager_rdma_win; /**< number of eager RDMA fragments
received before returning credits to
sender */
opal_list_t udapl_procs; /**< list of udapl proc structures */
opal_mutex_t udapl_lock; /**< lock for accessing module state */
char* udapl_mpool_name; /**< name of memory pool */
@ -90,7 +119,8 @@ struct mca_btl_udapl_module_t {
DAT_IA_HANDLE udapl_ia;
DAT_PZ_HANDLE udapl_pz;
DAT_PSP_HANDLE udapl_psp;
DAT_EP_PARAM udapl_ep_param;
/* event dispatchers - async, data transfer, connection negotiation */
DAT_EVD_HANDLE udapl_evd_async;
DAT_EVD_HANDLE udapl_evd_dto;
@ -100,8 +130,19 @@ struct mca_btl_udapl_module_t {
ompi_free_list_t udapl_frag_eager;
ompi_free_list_t udapl_frag_max;
ompi_free_list_t udapl_frag_user;
ompi_free_list_t udapl_frag_control;
opal_mutex_t udapl_lock; /* lock for accessing module state */
opal_mutex_t udapl_eager_rdma_lock; /* eager rdma lock */
uint32_t udapl_eager_rdma_endpoint_count; /* count of the number of
* endpoints in
* udapl_eager_rdma_endpoints
*/
orte_pointer_array_t *udapl_eager_rdma_endpoints; /* array of endpoints
* with eager rdma
* connections
*/
};
typedef struct mca_btl_udapl_module_t mca_btl_udapl_module_t;
extern mca_btl_udapl_module_t mca_btl_udapl_module;
@ -231,7 +272,7 @@ extern int mca_btl_udapl_del_procs(
*
* @param btl (IN) BTL module
* @param endpoint (IN) BTL addressing information
* @param descriptor (IN) Description of the data to be transfered
* @param descriptor (IN) Description of the data to be transferred
* @param tag (IN) The tag value used to notify the peer.
*/

Просмотреть файл

@ -154,13 +154,27 @@ int mca_btl_udapl_component_open(void)
mca_btl_udapl_param_register_int("max_modules", 8);
mca_btl_udapl_component.udapl_evd_qlen =
mca_btl_udapl_param_register_int("evd_qlen", 32);
mca_btl_udapl_component.udapl_max_request_dtos =
mca_btl_udapl_param_register_int("max_request_dtos", 18);
mca_btl_udapl_component.udapl_max_recv_dtos =
mca_btl_udapl_param_register_int("max_recv_dtos", 18);
mca_btl_udapl_component.udapl_num_recvs =
mca_btl_udapl_param_register_int("num_recvs", 8);
mca_btl_udapl_component.udapl_num_sends =
mca_btl_udapl_param_register_int("num_sends", 8);
mca_btl_udapl_param_register_int("num_sends", 7);
mca_btl_udapl_component.udapl_sr_win =
mca_btl_udapl_param_register_int("sr_win", 4);
mca_btl_udapl_component.udapl_eager_rdma_num =
mca_btl_udapl_param_register_int("eager_rdma_num", 8);
mca_btl_udapl_component.udapl_max_eager_rdma_peers =
mca_btl_udapl_param_register_int("max_eager_rdma_peers", 16);
mca_btl_udapl_component.udapl_eager_rdma_win =
mca_btl_udapl_param_register_int("eager_rdma_win", 4);
mca_btl_udapl_component.udapl_timeout =
mca_btl_udapl_param_register_int("timeout", 10000000);
mca_btl_udapl_component.udapl_eager_rdma_guarantee =
mca_btl_udapl_param_register_int("eager_rdma_guarantee", 0);
/* register uDAPL module parameters */
mca_btl_udapl_module.super.btl_exclusivity =
mca_btl_udapl_param_register_int ("exclusivity",
@ -175,6 +189,8 @@ int mca_btl_udapl_component_open(void)
mca_btl_udapl_param_register_int("min_rdma_size", 512*1024);
mca_btl_udapl_module.super.btl_max_rdma_size =
mca_btl_udapl_param_register_int("max_rdma_size", 128*1024);
mca_btl_udapl_module.super.btl_flags =
mca_btl_udapl_param_register_int("flags", MCA_BTL_FLAGS_PUT);
mca_btl_udapl_module.super.btl_bandwidth =
mca_btl_udapl_param_register_int("bandwidth", 225);
@ -182,13 +198,17 @@ int mca_btl_udapl_component_open(void)
mca_btl_udapl_component.udapl_eager_frag_size =
mca_btl_udapl_module.super.btl_eager_limit;
mca_btl_udapl_module.super.btl_eager_limit -=
sizeof(mca_btl_udapl_footer_t);
(sizeof(mca_btl_udapl_footer_t) + sizeof(mca_btl_udapl_rdma_footer_t));
mca_btl_udapl_component.udapl_max_frag_size =
mca_btl_udapl_module.super.btl_max_send_size;
mca_btl_udapl_module.super.btl_max_send_size -=
sizeof(mca_btl_udapl_footer_t);
(sizeof(mca_btl_udapl_footer_t) + sizeof(mca_btl_udapl_rdma_footer_t));
/* compute udapl_eager_rdma_frag_size */
mca_btl_udapl_component.udapl_eager_rdma_frag_size =
sizeof(mca_btl_udapl_frag_eager_rdma_t) +
mca_btl_udapl_component.udapl_eager_frag_size;
/* leave pinned option */
value = 0;
@ -247,6 +267,81 @@ mca_btl_udapl_modex_send(void)
}
/*
* Callback function used for udapl btl internal control messages.
*
* @param btl (IN) BTL module
* @param tag (IN) Not used but part of callback interface
* @param descriptor (IN) Description of the data that was just transferred
* @param cbdata (IN) Data used by call back function. Not used.
*
*/
/*
 * Dispatch an incoming uDAPL BTL internal control message based on the
 * control header type found at the start of the received fragment.
 *
 * @param btl        (IN) BTL module (unused here; part of the callback API)
 * @param tag        (IN) Message tag (unused; part of the callback API)
 * @param descriptor (IN) Descriptor of the received fragment; its segment
 *                        payload begins with a mca_btl_udapl_control_header_t
 * @param cbdata     (IN) Callback data (unused)
 */
static void mca_btl_udapl_receive_control(struct mca_btl_base_module_t* btl,
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* descriptor,
                                          void* cbdata)
{
    mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)descriptor;
    mca_btl_udapl_endpoint_t* endpoint = frag->endpoint;
    /* control header lives at the front of the fragment payload */
    mca_btl_udapl_control_header_t* ctl_hdr =
        frag->segment.seg_addr.pval;

    switch (ctl_hdr->type) {
    case MCA_BTL_UDAPL_CONTROL_RDMA_CONNECT:
    {
        mca_btl_udapl_eager_rdma_connect_t* rdma_connect =
            frag->segment.seg_addr.pval;

        /* a non-NULL remote base means this peer already connected */
        if (endpoint->endpoint_eager_rdma_remote.base.pval) {
            BTL_ERROR(("ERROR: Received RDMA connect twice!"));
            return;
        }
        /* record the peer's RDMA key and buffer start address, then
         * grant the initial set of eager RDMA tokens
         */
        endpoint->endpoint_eager_rdma_remote.rkey = rdma_connect->rkey;
        endpoint->endpoint_eager_rdma_remote.base.pval =
            rdma_connect->rdma_start.pval;

        OPAL_THREAD_ADD32(&(endpoint->endpoint_eager_rdma_remote.tokens),
            mca_btl_udapl_component.udapl_eager_rdma_num);

        break;
    }
    case MCA_BTL_UDAPL_CONTROL_RDMA_CREDIT:
    {
        mca_btl_udapl_eager_rdma_credit_t* rdma_credit =
            frag->segment.seg_addr.pval;

        /* don't return credits used for rdma credit control message */
        OPAL_THREAD_ADD32(
            &(endpoint->endpoint_sr_credits[BTL_UDAPL_EAGER_CONNECTION]),
            -1);

        /* add credits the peer just returned to us */
        OPAL_THREAD_ADD32(&(endpoint->endpoint_eager_rdma_remote.tokens),
            rdma_credit->credits);

        break;
    }
    case MCA_BTL_UDAPL_CONTROL_SR_CREDIT:
    {
        mca_btl_udapl_sr_credit_t* sr_credit =
            frag->segment.seg_addr.pval;

        /* don't return credits used for sr credit control message */
        OPAL_THREAD_ADD32(
            &(endpoint->endpoint_sr_credits[sr_credit->connection]), -1);

        /* add send/recv tokens returned for the indicated connection */
        OPAL_THREAD_ADD32(
            &(endpoint->endpoint_sr_tokens[sr_credit->connection]),
            sr_credit->credits);

        break;
    }
    default:
        /* fixed typo in error message: "contrl" -> "control" */
        BTL_ERROR(("ERROR: Unknown control message type received by BTL"));
        break;
    }
}
/*
* Initialize the uDAPL component,
* check how many interfaces are available and create a btl module for each.
@ -316,6 +411,10 @@ mca_btl_udapl_component_init (int *num_btl_modules,
continue;
}
/* register internal control message callback */
btl->udapl_reg[MCA_BTL_TAG_BTL].cbfunc = mca_btl_udapl_receive_control;
btl->udapl_reg[MCA_BTL_TAG_BTL].cbdata = NULL;
/* successful btl creation */
mca_btl_udapl_component.udapl_btls[mca_btl_udapl_component.udapl_num_btls] = btl;
if(++mca_btl_udapl_component.udapl_num_btls >=
@ -365,17 +464,21 @@ static int mca_btl_udapl_accept_connect(mca_btl_udapl_module_t* btl,
DAT_EP_HANDLE endpoint;
int rc;
rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz,
btl->udapl_evd_dto, btl->udapl_evd_dto,
btl->udapl_evd_conn, NULL, &endpoint);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_create");
rc = mca_btl_udapl_endpoint_create(btl, &endpoint);
if(OMPI_SUCCESS != rc) {
BTL_ERROR(("ERROR: mca_btl_udapl_endpoint_create"));
return OMPI_ERROR;
}
rc = dat_cr_accept(cr_handle, endpoint, 0, NULL);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_cr_accept");
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_cr_accept",
major, minor));
return OMPI_ERROR;
}
@ -402,7 +505,13 @@ static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl,
rc = dat_ep_post_recv(endpoint, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_recv");
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_recv",
major, minor));
return OMPI_ERROR;
}
@ -424,13 +533,99 @@ static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl,
rc = dat_ep_post_send(endpoint, 1,
&frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG);
if(DAT_SUCCESS != rc) {
MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send");
char* major;
char* minor;
dat_strerror(rc, (const char**)&major,
(const char**)&minor);
BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send",
major, minor));
return OMPI_ERROR;
}
return OMPI_SUCCESS;
}
/*
 * Progress a single pending fragment by re-issuing the operation
 * recorded in its type field.
 *
 * @param udapl_btl (IN) BTL module owning the fragment
 * @param frag      (IN) Pending fragment to progress
 * @return OMPI_SUCCESS on success, otherwise an OMPI error code
 */
static inline int mca_btl_udapl_frag_progress_one(
    mca_btl_udapl_module_t* udapl_btl,
    mca_btl_udapl_frag_t* frag)
{
    int result;

    if (MCA_BTL_UDAPL_SEND == frag->type) {
        /* re-attempt the deferred send */
        result = mca_btl_udapl_endpoint_send(frag->endpoint, frag);
    } else if (MCA_BTL_UDAPL_PUT == frag->type) {
        /* re-attempt the deferred RDMA put */
        result = mca_btl_udapl_put((mca_btl_base_module_t*)udapl_btl,
                                   frag->endpoint,
                                   (mca_btl_base_descriptor_t*)frag);
    } else {
        /* fragment carries a type we do not know how to progress */
        BTL_ERROR(("Error : Progressing pending operation, invalid type %d\n",
            frag->type));
        result = OMPI_ERROR;
    }

    return result;
}
/*
 * Drain queued (pending) fragments for one endpoint connection while
 * flow-control tokens remain available on that connection.
 *
 * @param udapl_btl  (IN) BTL module owning the endpoint
 * @param endpoint   (IN) Endpoint whose pending fragment queue is drained
 * @param connection (IN) Which connection's queue to progress:
 *                        BTL_UDAPL_EAGER_CONNECTION or BTL_UDAPL_MAX_CONNECTION
 */
void mca_btl_udapl_frag_progress_pending(mca_btl_udapl_module_t* udapl_btl,
                                        mca_btl_base_endpoint_t* endpoint,
                                        uint32_t connection)
{
    int len;
    int i;
    mca_btl_udapl_frag_t* frag;
    
    if (BTL_UDAPL_EAGER_CONNECTION == connection) {
        /* snapshot the queue length; the loop also stops early when the
         * endpoint runs out of tokens for this connection
         */
        len = opal_list_get_size(&endpoint->endpoint_eager_frags);

        /* progress eager frag queue as needed */
        for(i = 0; i < len &&
                BTL_UDAPL_TOKENS(endpoint, connection) > 0; i++) {
            
            /* lock is held only around the list removal, not the send */
            OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
            frag = (mca_btl_udapl_frag_t*)opal_list_remove_first(&(endpoint->endpoint_eager_frags));
            OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
            if(NULL == frag) {
                /* queue emptied concurrently; nothing left to progress */
                return;
            }
            if(mca_btl_udapl_frag_progress_one(udapl_btl, frag) !=
                OMPI_SUCCESS) {
                BTL_ERROR(("ERROR: Not able to progress on connection(%d)\n",
                    BTL_UDAPL_EAGER_CONNECTION));
                return;
            }
        }
    } else if (BTL_UDAPL_MAX_CONNECTION == connection) {
        /* same strategy as above, applied to the max-size frag queue */
        len = opal_list_get_size(&endpoint->endpoint_max_frags);

        /* progress max frag queue as needed */
        for(i = 0; i < len &&
                BTL_UDAPL_TOKENS(endpoint, connection) > 0; i++) {
            OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
            frag = (mca_btl_udapl_frag_t*)opal_list_remove_first(&(endpoint->endpoint_max_frags));
            OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
            if(NULL == frag) {
                return;
            }
            if(mca_btl_udapl_frag_progress_one(udapl_btl, frag) !=
                OMPI_SUCCESS) {
                BTL_ERROR(("ERROR: Not able to progress on connection(%d)\n",
                    BTL_UDAPL_MAX_CONNECTION));
                return;
            }
        }
    } else {
        /* caller passed a connection id we do not recognize */
        BTL_ERROR(("ERROR: Can not progress pending fragment on unknown connection\n"));
    }

    return;
}
/*
* uDAPL component progress.
@ -444,8 +639,8 @@ int mca_btl_udapl_component_progress()
#if defined(__SVR4) && defined(__sun)
DAT_COUNT nmore; /* used by dat_evd_wait, see comment below */
#endif
int i, j, rdma_ep_count;
int count = 0;
size_t i;
/* prevent deadlock - only one thread should be 'progressing' at a time */
if(OPAL_THREAD_ADD32(&inprogress, 1) > 1) {
@ -465,152 +660,200 @@ int mca_btl_udapl_component_progress()
mca_btl_udapl_frag_t* frag;
switch(event.event_number) {
case DAT_DTO_COMPLETION_EVENT:
dto = &event.event_data.dto_completion_event_data;
case DAT_DTO_COMPLETION_EVENT:
dto = &event.event_data.dto_completion_event_data;
frag = dto->user_cookie.as_ptr;
/* Was the DTO successful? */
if(DAT_DTO_SUCCESS != dto->status) {
OPAL_OUTPUT((0,
"btl_udapl ***** DTO error %d %d %d %p*****\n",
dto->status, frag->type, frag->size, dto->ep_handle));
break;
}
frag = dto->user_cookie.as_ptr;
/* if we are using the "guarantee" rdma code path
* the extra write sets cookie to NULL, when this
* happens we ignore it because the completion
* write event is coming
*/
if (frag == NULL) break;
switch(frag->type) {
case MCA_BTL_UDAPL_SEND:
{
mca_btl_udapl_endpoint_t* endpoint = frag->endpoint;
/*OPAL_OUTPUT((0, "btl_udapl UDAPL_SEND %d",
dto->transfered_length));*/
/* Was the DTO successful? */
if(DAT_DTO_SUCCESS != dto->status) {
OPAL_OUTPUT((0,
"btl_udapl ***** DTO error %d %d %d %p*****\n",
dto->status, frag->type, frag->size, dto->ep_handle));
break;
}
assert(frag->base.des_src == &frag->segment);
assert(frag->base.des_src_cnt == 1);
assert(frag->base.des_dst == NULL);
assert(frag->base.des_dst_cnt == 0);
assert(frag->type == MCA_BTL_UDAPL_SEND);
frag->base.des_cbfunc(&btl->super, frag->endpoint,
&frag->base, OMPI_SUCCESS);
if(frag->size ==
mca_btl_udapl_component.udapl_eager_frag_size) {
if(!opal_list_is_empty(
&endpoint->endpoint_eager_frags)) {
DAT_DTO_COOKIE cookie;
frag = (mca_btl_udapl_frag_t*)
opal_list_remove_first(
&endpoint->endpoint_eager_frags);
switch(frag->type) {
case MCA_BTL_UDAPL_RDMA_WRITE:
{
mca_btl_udapl_endpoint_t* endpoint = frag->endpoint;
assert(frag->triplet.segment_length ==
frag->segment.seg_len +
sizeof(mca_btl_udapl_footer_t));
assert(frag->base.des_src == &frag->segment);
assert(frag->base.des_src_cnt == 1);
assert(frag->base.des_dst == NULL);
assert(frag->base.des_dst_cnt == 0);
assert(frag->type == MCA_BTL_UDAPL_RDMA_WRITE);
frag->base.des_cbfunc(&btl->super, frag->endpoint,
&frag->base, OMPI_SUCCESS);
mca_btl_udapl_frag_progress_pending(btl,
frag->endpoint,
BTL_UDAPL_EAGER_CONNECTION);
cookie.as_ptr = frag;
dat_ep_post_send(endpoint->endpoint_eager,
1, &frag->triplet, cookie,
DAT_COMPLETION_DEFAULT_FLAG);
} else {
OPAL_THREAD_ADD32(
&endpoint->endpoint_eager_sends, 1);
}
} else {
assert(frag->size ==
mca_btl_udapl_component.udapl_max_frag_size);
if(!opal_list_is_empty(
&endpoint->endpoint_max_frags)) {
DAT_DTO_COOKIE cookie;
frag = (mca_btl_udapl_frag_t*)
opal_list_remove_first(
&endpoint->endpoint_max_frags);
assert(frag->triplet.segment_length ==
frag->segment.seg_len +
sizeof(mca_btl_udapl_footer_t));
break;
}
case MCA_BTL_UDAPL_SEND:
{
mca_btl_udapl_endpoint_t* endpoint = frag->endpoint;
cookie.as_ptr = frag;
dat_ep_post_send(endpoint->endpoint_max,
1, &frag->triplet, cookie,
DAT_COMPLETION_DEFAULT_FLAG);
} else {
OPAL_THREAD_ADD32(
&endpoint->endpoint_max_sends, 1);
}
}
assert(frag->base.des_src == &frag->segment);
assert(frag->base.des_src_cnt == 1);
assert(frag->base.des_dst == NULL);
assert(frag->base.des_dst_cnt == 0);
assert(frag->type == MCA_BTL_UDAPL_SEND);
break;
frag->base.des_cbfunc(&btl->super, frag->endpoint,
&frag->base, OMPI_SUCCESS);
if(frag->size ==
mca_btl_udapl_component.udapl_eager_frag_size) {
mca_btl_udapl_frag_progress_pending(btl,
frag->endpoint,
BTL_UDAPL_EAGER_CONNECTION);
} else {
assert(frag->size ==
mca_btl_udapl_component.udapl_max_frag_size);
mca_btl_udapl_frag_progress_pending(btl,
frag->endpoint,
BTL_UDAPL_MAX_CONNECTION);
}
case MCA_BTL_UDAPL_RECV:
{
mca_btl_base_recv_reg_t* reg;
break;
}
case MCA_BTL_UDAPL_RECV:
{
mca_btl_base_recv_reg_t* reg;
int cntrl_msg = -1;
assert(frag->base.des_dst == &frag->segment);
assert(frag->base.des_dst_cnt == 1);
assert(frag->base.des_src == NULL);
assert(frag->base.des_src_cnt == 0);
assert(frag->type == MCA_BTL_UDAPL_RECV);
assert(frag->triplet.virtual_address ==
(DAT_VADDR)frag->segment.seg_addr.pval);
assert(frag->triplet.segment_length == frag->size);
assert(frag->btl == btl);
assert(frag->base.des_dst == &frag->segment);
assert(frag->base.des_dst_cnt == 1);
assert(frag->base.des_src == NULL);
assert(frag->base.des_src_cnt == 0);
assert(frag->type == MCA_BTL_UDAPL_RECV);
assert(frag->triplet.virtual_address ==
(DAT_VADDR)frag->segment.seg_addr.pval);
assert(frag->triplet.segment_length == frag->size);
assert(frag->btl == btl);
/* setup frag ftr location and do callback */
frag->segment.seg_len = dto->transfered_length -
/* setup frag ftr location and do callback */
frag->segment.seg_len = dto->transfered_length -
sizeof(mca_btl_udapl_footer_t);
frag->ftr = (mca_btl_udapl_footer_t *)
((char *)frag->segment.seg_addr.pval +
frag->segment.seg_len);
reg = &btl->udapl_reg[frag->ftr->tag];
OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock);
reg->cbfunc(&btl->super,
frag->ftr->tag, &frag->base, reg->cbdata);
OPAL_THREAD_LOCK(&mca_btl_udapl_component.udapl_lock);
frag->ftr = (mca_btl_udapl_footer_t *)
((char *)frag->segment.seg_addr.pval +
frag->segment.seg_len);
/* Repost the frag */
frag->ftr = frag->segment.seg_addr.pval;
frag->segment.seg_len =
frag->size - sizeof(mca_btl_udapl_footer_t);
frag->base.des_flags = 0;
cntrl_msg = frag->ftr->tag;
if(frag->size ==
reg = &btl->udapl_reg[frag->ftr->tag];
OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock);
reg->cbfunc(&btl->super,
frag->ftr->tag, &frag->base, reg->cbdata);
OPAL_THREAD_LOCK(&mca_btl_udapl_component.udapl_lock);
/* Repost the frag */
frag->ftr = frag->segment.seg_addr.pval;
frag->segment.seg_len =
(frag->size - sizeof(mca_btl_udapl_footer_t) -
sizeof(mca_btl_udapl_rdma_footer_t));
frag->base.des_flags = 0;
if(frag->size ==
mca_btl_udapl_component.udapl_eager_frag_size) {
dat_ep_post_recv(frag->endpoint->endpoint_eager,
1, &frag->triplet, dto->user_cookie,
DAT_COMPLETION_DEFAULT_FLAG);
} else {
assert(frag->size ==
mca_btl_udapl_component.udapl_max_frag_size);
dat_ep_post_recv(frag->endpoint->endpoint_max,
1, &frag->triplet, dto->user_cookie,
DAT_COMPLETION_DEFAULT_FLAG);
OPAL_THREAD_ADD32(&(frag->endpoint->endpoint_sr_credits[BTL_UDAPL_EAGER_CONNECTION]), 1);
dat_ep_post_recv(frag->endpoint->endpoint_eager,
1, &frag->triplet, dto->user_cookie,
DAT_COMPLETION_DEFAULT_FLAG);
if (frag->endpoint->endpoint_sr_credits[BTL_UDAPL_EAGER_CONNECTION] >=
mca_btl_udapl_component.udapl_sr_win) {
mca_btl_udapl_endpoint_send_sr_credits(frag->endpoint,
BTL_UDAPL_EAGER_CONNECTION);
}
break;
if (MCA_BTL_TAG_BTL == cntrl_msg) {
mca_btl_udapl_frag_progress_pending(btl,
frag->endpoint,
BTL_UDAPL_EAGER_CONNECTION);
}
} else {
assert(frag->size ==
mca_btl_udapl_component.udapl_max_frag_size);
OPAL_THREAD_ADD32(&(frag->endpoint->endpoint_sr_credits[BTL_UDAPL_MAX_CONNECTION]), 1);
dat_ep_post_recv(frag->endpoint->endpoint_max,
1, &frag->triplet, dto->user_cookie,
DAT_COMPLETION_DEFAULT_FLAG);
if (frag->endpoint->endpoint_sr_credits[BTL_UDAPL_MAX_CONNECTION] >=
mca_btl_udapl_component.udapl_sr_win) {
mca_btl_udapl_endpoint_send_sr_credits(frag->endpoint,
BTL_UDAPL_MAX_CONNECTION);
}
if (MCA_BTL_TAG_BTL == cntrl_msg) {
mca_btl_udapl_frag_progress_pending(btl,
frag->endpoint,
BTL_UDAPL_MAX_CONNECTION);
}
}
case MCA_BTL_UDAPL_CONN_RECV:
mca_btl_udapl_endpoint_finish_connect(btl,
frag->segment.seg_addr.pval,
(int32_t *)((char *)frag->segment.seg_addr.pval +
sizeof(mca_btl_udapl_addr_t)),
event.event_data.connect_event_data.ep_handle);
/* No break - fall through to free */
case MCA_BTL_UDAPL_CONN_SEND:
frag->segment.seg_len =
mca_btl_udapl_module.super.btl_eager_limit;
mca_btl_udapl_free((mca_btl_base_module_t*)btl,
(mca_btl_base_descriptor_t*)frag);
break;
default:
OPAL_OUTPUT((0, "WARNING unknown frag type: %d\n",
frag->type));
}
count++;
break;
}
case MCA_BTL_UDAPL_PUT:
{
mca_btl_udapl_endpoint_t* endpoint = frag->endpoint;
assert(frag->base.des_src == &frag->segment);
assert(frag->base.des_src_cnt == 1);
assert(frag->base.des_dst_cnt == 1);
assert(frag->type == MCA_BTL_UDAPL_PUT);
frag->base.des_cbfunc(&btl->super, frag->endpoint,
&frag->base, OMPI_SUCCESS);
OPAL_THREAD_ADD32(&(frag->endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION]), 1);
mca_btl_udapl_frag_progress_pending(btl,
frag->endpoint,
BTL_UDAPL_MAX_CONNECTION);
break;
}
case MCA_BTL_UDAPL_CONN_RECV:
mca_btl_udapl_endpoint_finish_connect(btl,
frag->segment.seg_addr.pval,
(int32_t *)((char *)frag->segment.seg_addr.pval +
sizeof(mca_btl_udapl_addr_t)),
event.event_data.connect_event_data.ep_handle);
/* No break - fall through to free */
case MCA_BTL_UDAPL_CONN_SEND:
frag->segment.seg_len =
mca_btl_udapl_module.super.btl_eager_limit;
mca_btl_udapl_free((mca_btl_base_module_t*)btl,
(mca_btl_base_descriptor_t*)frag);
break;
default:
OPAL_OUTPUT((0, "WARNING unknown dto event: %d\n",
event.event_number));
OPAL_OUTPUT((0, "WARNING unknown frag type: %d\n",
frag->type));
}
count++;
break;
default:
OPAL_OUTPUT((0, "WARNING unknown dto event: %d\n",
event.event_number));
}
}
@ -622,9 +865,9 @@ int mca_btl_udapl_component_progress()
* DAT_CONNECTION_REQUEST_EVENT. Workaround is to use
* wait. This should be removed when fix available.
*/
dat_evd_wait(btl->udapl_evd_conn, 0, 1, &event, &nmore)) {
dat_evd_wait(btl->udapl_evd_conn, 0, 1, &event, &nmore)) {
#else
dat_evd_dequeue(btl->udapl_evd_conn, &event)) {
dat_evd_dequeue(btl->udapl_evd_conn, &event)) {
#endif
switch(event.event_number) {
case DAT_CONNECTION_REQUEST_EVENT:
@ -653,28 +896,119 @@ int mca_btl_udapl_component_progress()
case DAT_CONNECTION_EVENT_UNREACHABLE:
/* Need to set the BTL endpoint to MCA_BTL_UDAPL_FAILED
See dat_ep_connect documentation pdf pg 198 */
break;
BTL_OUTPUT(("WARNING : Connection event not handled : %d\n",
event.event_number));
break;
default:
OPAL_OUTPUT((0, "WARNING unknown conn event: %d\n",
event.event_number));
BTL_ERROR(("ERROR: unknown connection event : %d",
event.event_number));
}
}
/* Check async EVD */
while(DAT_SUCCESS ==
dat_evd_dequeue(btl->udapl_evd_async, &event)) {
switch(event.event_number) {
case DAT_ASYNC_ERROR_EVD_OVERFLOW:
case DAT_ASYNC_ERROR_IA_CATASTROPHIC:
case DAT_ASYNC_ERROR_EP_BROKEN:
case DAT_ASYNC_ERROR_TIMED_OUT:
case DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR:
break;
default:
OPAL_OUTPUT((0, "WARNING unknown async event: %d\n",
event.event_number));
case DAT_ASYNC_ERROR_EVD_OVERFLOW:
case DAT_ASYNC_ERROR_IA_CATASTROPHIC:
case DAT_ASYNC_ERROR_EP_BROKEN:
case DAT_ASYNC_ERROR_TIMED_OUT:
case DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR:
BTL_OUTPUT(("WARNING: async event ignored : %d",
event.event_number));
break;
default:
BTL_OUTPUT(("WARNING unknown async event: %d\n",
event.event_number));
}
}
/*
* Check eager rdma segments
*/
/* find the number of endpoints with rdma buffers */
rdma_ep_count = btl->udapl_eager_rdma_endpoint_count;
for (j = 0; j < rdma_ep_count; j++) {
mca_btl_udapl_endpoint_t* endpoint;
mca_btl_udapl_frag_t *local_rdma_frag;
DAT_LMR_TRIPLET local_rdma_segment;
endpoint =
orte_pointer_array_get_item(btl->udapl_eager_rdma_endpoints, j);
OPAL_THREAD_LOCK(&endpoint->endpoint_eager_rdma_local.lock);
local_rdma_frag =
MCA_BTL_UDAPL_GET_LOCAL_RDMA_FRAG(endpoint,
endpoint->endpoint_eager_rdma_local.head);
/* sync local memory before checking if active
* Question, will narrowing sync area to just the active byte
* one, work and two, improve performance
*/
local_rdma_segment.lmr_context =
local_rdma_frag->triplet.lmr_context;
local_rdma_segment.virtual_address =
(DAT_VADDR)local_rdma_frag->segment.seg_addr.pval;
local_rdma_segment.segment_length = local_rdma_frag->size;
dat_lmr_sync_rdma_write(endpoint->endpoint_btl->udapl_ia,
&local_rdma_segment, 1);
if (local_rdma_frag->rdma_ftr->active == 1) {
int pad = 0;
mca_btl_base_recv_reg_t* reg;
MCA_BTL_UDAPL_RDMA_NEXT_INDEX(endpoint->endpoint_eager_rdma_local.head);
OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock);
/* compute pad as needed */
MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad,
(local_rdma_frag->rdma_ftr->size +
sizeof(mca_btl_udapl_footer_t)));
/* set fragment information */
local_rdma_frag->ftr = (mca_btl_udapl_footer_t *)
((char *)local_rdma_frag->rdma_ftr -
pad -
sizeof(mca_btl_udapl_footer_t));
local_rdma_frag->segment.seg_len =
local_rdma_frag->rdma_ftr->size;
local_rdma_frag->segment.seg_addr.pval = (unsigned char *)
((char *)local_rdma_frag->ftr -
local_rdma_frag->segment.seg_len);
/* retrieve callback and callback */
reg = &btl->udapl_reg[local_rdma_frag->ftr->tag];
reg->cbfunc(&btl->super,
local_rdma_frag->ftr->tag, &local_rdma_frag->base, reg->cbdata);
/* repost */
local_rdma_frag->rdma_ftr->active = 0;
local_rdma_frag->segment.seg_addr.pval =
(unsigned char*)(local_rdma_frag + 1);
local_rdma_frag->segment.seg_len =
mca_btl_udapl_module.super.btl_eager_limit;
local_rdma_frag->base.des_flags = 0;
/* increment local rdma credits */
OPAL_THREAD_ADD32(&(endpoint->endpoint_eager_rdma_local.credits),
1);
if (endpoint->endpoint_eager_rdma_local.credits >=
mca_btl_udapl_component.udapl_eager_rdma_win) {
mca_btl_udapl_endpoint_send_eager_rdma_credits(endpoint);
}
count++;
} else {
OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock);
}
} /* end of rdma_count loop */
}
/* unlock and return */

108
ompi/mca/btl/udapl/btl_udapl_eager_rdma.h Обычный файл
Просмотреть файл

@ -0,0 +1,108 @@
/*
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef MCA_BTL_UDAPL_EAGER_RDMA_H
#define MCA_BTL_UDAPL_EAGER_RDMA_H
/* Open MPI includes */
#include "ompi/mca/btl/udapl/btl_udapl_endpoint.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Describe endpoint local memory region: the eager RDMA buffers this
 * process polls for data written by its peer.
 */
struct mca_btl_udapl_eager_rdma_local_t {
    ompi_ptr_t base;                  /* start of the local RDMA buffer region */
    struct mca_btl_udapl_reg_t* reg;  /* memory registration covering the region */
    uint8_t head; /**< RDMA buffer to poll */
    int32_t credits; /**< number of local rdma buffers ready to be reclaimed,
                        reused. Initially equal to 0. */
    opal_mutex_t lock; /**< protect access to RDMA buffer */
};
typedef struct mca_btl_udapl_eager_rdma_local_t mca_btl_udapl_eager_rdma_local_t;
/*
 * Describe endpoint remote memory region: the peer's eager RDMA
 * buffers that this process writes into.
 */
struct mca_btl_udapl_eager_rdma_remote_t {
    ompi_ptr_t base;      /* start of the peer's RDMA buffer region */
    DAT_RMR_CONTEXT rkey; /**< key required to access remote memory */
    uint8_t head; /**< RDMA buffer to use */
    int32_t tokens; /**< number of available rdma buffers, initially equal
                       to mca parameter eager_rdma_num */
    opal_mutex_t lock; /**< protect access to RDMA buffer */
};
typedef struct mca_btl_udapl_eager_rdma_remote_t mca_btl_udapl_eager_rdma_remote_t;
/*
 * Encapsulate data that describes a remote memory region.  Sent as the
 * payload of an RDMA-connect control message so the peer can address
 * our local region.
 */
struct mca_btl_udapl_eager_rdma_connect_t {
    mca_btl_udapl_control_header_t control; /* control message type header */
    uint32_t rkey;                          /* access key for the advertised region */
    ompi_ptr_t rdma_start;                  /* base address of the advertised region */
};
typedef struct mca_btl_udapl_eager_rdma_connect_t mca_btl_udapl_eager_rdma_connect_t;
/*
 * Encapsulate data that describes rdma credit information, sent back
 * to the peer so it can reuse our buffers.
 */
struct mca_btl_udapl_eager_rdma_credit_t {
    mca_btl_udapl_control_header_t control; /* control message type header */
    uint32_t credits;                       /* number of credits being returned */
};
typedef struct mca_btl_udapl_eager_rdma_credit_t mca_btl_udapl_eager_rdma_credit_t;
/* Values stored in the flag byte kept at the tail of each eager RDMA
 * fragment.
 * NOTE(review): IN_USE is 0xff here, while the receive-side progress
 * code compares the footer "active" byte against 1 — confirm which
 * value the sender actually writes.
 */
#define EAGER_RDMA_BUFFER_AVAILABLE (0)
#define EAGER_RDMA_BUFFER_IN_USE (0xff)

/* Address of the flag byte: the fragment's last footer-sized slot. */
#define MCA_BTL_UDAPL_RDMA_FRAG_FLAG_PTR(F) \
    ((volatile uint8_t*) ((char*)(F) + \
        (mca_btl_udapl_component.udapl_eager_rdma_frag_size - \
        (sizeof(mca_btl_udapl_footer_t)))))

/* Read the in-use flag of fragment F.
 * Fixed: the original wrapped the read in do { ... } while (0), which
 * discarded the value and made the macro unusable as a predicate; it
 * is now an expression yielding the flag byte.
 */
#define MCA_BTL_UDAPL_RDMA_FRAG_IN_USE(F) \
    (*MCA_BTL_UDAPL_RDMA_FRAG_FLAG_PTR(F))

/* Mark fragment F as holding data for the receiver. */
#define MCA_BTL_UDAPL_RDMA_FRAG_ASSIGN_IN_USE(F) do { \
        *MCA_BTL_UDAPL_RDMA_FRAG_FLAG_PTR(F) = EAGER_RDMA_BUFFER_IN_USE; \
} while (0)

/* Mark fragment F as free for reuse by the sender. */
#define MCA_BTL_UDAPL_RDMA_FRAG_ASSIGN_AVAILABLE(F) do { \
        *MCA_BTL_UDAPL_RDMA_FRAG_FLAG_PTR(F) = EAGER_RDMA_BUFFER_AVAILABLE; \
} while (0)
/* Address of the I-th eager rdma fragment inside the endpoint's
 * contiguous local buffer region.
 */
#define MCA_BTL_UDAPL_GET_LOCAL_RDMA_FRAG(E, I)                         \
    ((mca_btl_udapl_frag_t*)                                            \
        ((char*)(E)->endpoint_eager_rdma_local.base.pval +              \
            ((I) * mca_btl_udapl_component.udapl_eager_rdma_frag_size)))

/*
 * Advance the circular index I over the ring of eager rdma fragments,
 * wrapping back to 0 after the last one.
 */
#define MCA_BTL_UDAPL_RDMA_NEXT_INDEX(I) do {                           \
        (I) += 1;                                                       \
        if (mca_btl_udapl_component.udapl_eager_rdma_num == (I)) {      \
            (I) = 0;                                                    \
        }                                                               \
} while (0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -27,13 +27,19 @@
#include "opal/event/event.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "btl_udapl_frag.h"
#include "btl_udapl.h"
#include "btl_udapl_eager_rdma.h"
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/* Total send tokens currently available to endpoint E on connection C:
 * the send/recv tokens for that connection plus, on the eager
 * connection only, the remote eager RDMA tokens — presumably because
 * eager-sized traffic may also go over the RDMA path (TODO confirm
 * against the send path).
 */
#define BTL_UDAPL_TOKENS(E, C) ((E)->endpoint_sr_tokens[(C)] + \
    (((C) == BTL_UDAPL_EAGER_CONNECTION)?(E)->endpoint_eager_rdma_remote.tokens:0))
/**
* Structure used to publish uDAPL id information to peers.
*/
@ -43,7 +49,7 @@ struct mca_btl_udapl_addr_t {
};
typedef struct mca_btl_udapl_addr_t mca_btl_udapl_addr_t;
/**
* State of uDAPL endpoint connection.
*/
@ -56,6 +62,25 @@ typedef enum {
MCA_BTL_UDAPL_FAILED
} mca_btl_udapl_endpoint_state_t;
/*
 * Establish a name for the 2 connections opened per peer.
 */
typedef enum {
    BTL_UDAPL_EAGER_CONNECTION,   /**< connection carrying eager-sized fragments */
    BTL_UDAPL_MAX_CONNECTION,     /**< connection carrying max-sized fragments */
    BTL_UDAPL_NUM_CONNECTION      /**< sentinel: count, used to size per-connection arrays */
} mca_btl_udapl_endpoint_conn_t;
/*
 * Encapsulate data that describes sendrecv credit information, sent to
 * the peer as a control message.
 */
struct mca_btl_udapl_sr_credit_t {
    mca_btl_udapl_control_header_t control; /* common control message header */
    uint32_t credits;    /* number of credits being returned */
    uint32_t connection; /* 0 == BTL_UDAPL_EAGER_CONNECTION;
                            1 == BTL_UDAPL_MAX_CONNECTION */
};
typedef struct mca_btl_udapl_sr_credit_t mca_btl_udapl_sr_credit_t;
/**
* An abstraction that represents a connection to a endpoint process.
@ -84,6 +109,12 @@ struct mca_btl_base_endpoint_t {
int32_t endpoint_max_sends;
/**< number of sends that may be posted */
int32_t endpoint_sr_tokens[BTL_UDAPL_NUM_CONNECTION];
/**< number of sends that may be posted */
int32_t endpoint_sr_credits[BTL_UDAPL_NUM_CONNECTION];
/**< number of recvs that are now available */
int32_t endpoint_connection_seq;
/**< sequence number of sendrecv message for the connection est */
@ -96,6 +127,13 @@ struct mca_btl_base_endpoint_t {
DAT_EP_HANDLE endpoint_eager;
DAT_EP_HANDLE endpoint_max;
/**< uDAPL endpoint handle */
int32_t endpoint_eager_rdma_index;
/**< index into array of endpoints with RDMA buffers */
mca_btl_udapl_eager_rdma_local_t endpoint_eager_rdma_local;
/**< info about local RDMA buffer */
mca_btl_udapl_eager_rdma_remote_t endpoint_eager_rdma_remote;
/**< info about remote RDMA buffer */
};
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
@ -126,6 +164,29 @@ int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl,
int32_t* seq,
DAT_EP_HANDLE endpoint);
/*
* Send number of eager rdma credits
*/
int mca_btl_udapl_endpoint_send_eager_rdma_credits(mca_btl_base_endpoint_t* endpoint);
/*
* Establish uDAPL endpoint parameters
*/
int mca_btl_udapl_endpoint_get_params(struct mca_btl_udapl_module_t* btl,
DAT_EP_PARAM* ep_param);
/*
* Create uDAPL endpoint
*/
int mca_btl_udapl_endpoint_create(struct mca_btl_udapl_module_t* btl,
DAT_EP_HANDLE* udapl_endpoint);
/*
* Send number of send recv credits
*/
int mca_btl_udapl_endpoint_send_sr_credits(mca_btl_base_endpoint_t* endpoint,
uint32_t connection);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -73,6 +73,18 @@ static void mca_btl_udapl_frag_user_constructor(mca_btl_udapl_frag_t* frag)
frag->registration = NULL;
}
/* Constructor for eager rdma fragments: initialize like a plain eager
 * fragment, then place the rdma footer in the last
 * sizeof(mca_btl_udapl_rdma_footer_t) bytes of the buffer and mark the
 * fragment inactive (no data available to the receiver yet).
 */
static void mca_btl_udapl_frag_eager_rdma_constructor(mca_btl_udapl_frag_t* frag)
{
    /* shared base initialization with the regular eager fragment class */
    mca_btl_udapl_frag_eager_constructor(frag);
    frag->segment.seg_len = mca_btl_udapl_module.super.btl_eager_limit;
    frag->size = mca_btl_udapl_component.udapl_eager_frag_size;
    /* rdma footer occupies the very end of the fragment buffer */
    frag->rdma_ftr = (mca_btl_udapl_rdma_footer_t *)
        ((char *)(frag->segment.seg_addr.pval) +
        frag->size -
        sizeof(mca_btl_udapl_rdma_footer_t));
    /* 0 == not in use; the receive side polls this flag */
    frag->rdma_ftr->active=0;
}
static void mca_btl_udapl_frag_common_destructor(mca_btl_udapl_frag_t* frag)
{
#if OMPI_ENABLE_DEBUG
@ -115,3 +127,8 @@ OBJ_CLASS_INSTANCE(
mca_btl_udapl_frag_user_constructor,
NULL);
/* Register the eager rdma fragment class with the object system.
 * NOTE(review): the parent class is given as mca_btl_base_descriptor_t;
 * the sibling frag class registrations are not fully visible here —
 * confirm this matches the parent they use.
 */
OBJ_CLASS_INSTANCE(
    mca_btl_udapl_frag_eager_rdma_t,
    mca_btl_base_descriptor_t,
    mca_btl_udapl_frag_eager_rdma_constructor,
    mca_btl_udapl_frag_common_destructor);

Просмотреть файл

@ -37,18 +37,49 @@ typedef enum {
MCA_BTL_UDAPL_PUT,
MCA_BTL_UDAPL_GET,
MCA_BTL_UDAPL_CONN_RECV,
MCA_BTL_UDAPL_CONN_SEND
MCA_BTL_UDAPL_CONN_SEND,
MCA_BTL_UDAPL_RDMA_WRITE,
MCA_BTL_UDAPL_FRAG_EAGER_RDMA,
MCA_BTL_UDAPL_IGNORE
} mca_btl_udapl_frag_type_t;
/*
 * Types of BTL-level control messages exchanged between peers.
 */
typedef enum {
    MCA_BTL_UDAPL_CONTROL_NOOP,          /**< no operation */
    MCA_BTL_UDAPL_CONTROL_RDMA_CONNECT,  /**< advertise an eager RDMA region to the peer */
    MCA_BTL_UDAPL_CONTROL_RDMA_CREDIT,   /**< return eager RDMA credits */
    MCA_BTL_UDAPL_CONTROL_SR_CREDIT      /**< return send/recv credits */
} mca_btl_udapl_control_t;
/* Control message header; embedded first in every control message
 * payload so the receiver can dispatch on its type.
 */
struct mca_btl_udapl_control_header_t {
    mca_btl_udapl_control_t type;
};
typedef struct mca_btl_udapl_control_header_t mca_btl_udapl_control_header_t;
/**
* uDAPL btl footer.
* This is put after the payload packet so the PML header can be aligned.
* Must be aligned on MCA_BTL_UDAPL_FRAG_ALIGN byte boundary.
*/
struct mca_btl_udapl_footer_t {
mca_btl_base_tag_t tag;
};
typedef struct mca_btl_udapl_footer_t mca_btl_udapl_footer_t;
/**
 * uDAPL BTL rdma footer.
 * This is used in addition to the uDAPL BTL footer.  The two are
 * separate to allow for any padding that may be required between them.
 */
struct mca_btl_udapl_rdma_footer_t {
    uint32_t size;  /* length of the payload preceding this footer */
    uint8_t active; /* 0 = not in use; 1 = data is available to be received;
                     * this should always be the last entry in this structure.
                     * NOTE(review): eager_rdma.h defines
                     * EAGER_RDMA_BUFFER_IN_USE as 0xff while this comment
                     * and the progress loop use 1 — confirm which value is
                     * actually written.
                     */
    char pad[3];    /* pad out to be aligned on MCA_BTL_UDAPL_FRAG_ALIGN byte boundary */
};
typedef struct mca_btl_udapl_rdma_footer_t mca_btl_udapl_rdma_footer_t;
/**
* uDAPL fragment derived type.
*/
@ -57,13 +88,15 @@ struct mca_btl_udapl_frag_t {
mca_btl_base_segment_t segment;
struct mca_btl_udapl_module_t* btl;
struct mca_btl_base_endpoint_t *endpoint;
struct mca_btl_udapl_reg_t* registration;
struct mca_btl_base_endpoint_t* endpoint;
DAT_LMR_TRIPLET triplet;
struct mca_btl_udapl_reg_t* registration;
mca_btl_udapl_footer_t *ftr;
mca_btl_udapl_footer_t* ftr;
mca_btl_udapl_rdma_footer_t* rdma_ftr;
size_t size;
mca_btl_udapl_frag_type_t type;
uint32_t pad; /* Padding the structure to be evenly divisble by MCA_BTL_UDAPL_FRAG_ALIGN */
};
typedef struct mca_btl_udapl_frag_t mca_btl_udapl_frag_t;
OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_t);
@ -78,7 +111,10 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_max_t);
typedef struct mca_btl_udapl_frag_t mca_btl_udapl_frag_user_t;
OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t);
typedef struct mca_btl_udapl_frag_t mca_btl_udapl_frag_eager_rdma_t;
OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_eager_rdma_t);
/*
* Macros to allocate/return descriptors from module specific
* free list(s).
@ -86,7 +122,6 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t);
#define MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(btl, frag, rc) \
{ \
\
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_udapl_module_t*)btl)->udapl_frag_eager, item, rc); \
frag = (mca_btl_udapl_frag_t*) item; \
@ -100,7 +135,6 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t);
#define MCA_BTL_UDAPL_FRAG_ALLOC_MAX(btl, frag, rc) \
{ \
\
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_udapl_module_t*)btl)->udapl_frag_max, item, rc); \
frag = (mca_btl_udapl_frag_t*) item; \
@ -112,7 +146,6 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t);
(ompi_free_list_item_t*)(frag)); \
}
#define MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc) \
{ \
ompi_free_list_item_t *item; \
@ -126,6 +159,26 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t);
(ompi_free_list_item_t*)(frag)); \
}
#define MCA_BTL_UDAPL_FRAG_ALLOC_CONTROL(btl, frag, rc) \
{ \
ompi_free_list_item_t *item; \
OMPI_FREE_LIST_WAIT(&((mca_btl_udapl_module_t*)btl)->udapl_frag_control, item, rc); \
frag = (mca_btl_udapl_frag_t*) item; \
}
#define MCA_BTL_UDAPL_FRAG_RETURN_CONTROL(btl, frag) \
{ \
OMPI_FREE_LIST_RETURN(&((mca_btl_udapl_module_t*)btl)->udapl_frag_control, \
(ompi_free_list_item_t*)(frag)); \
}
/*
 * Calculate the pad value P required to align the given size S up to a
 * multiple of MCA_BTL_UDAPL_FRAG_ALIGN (P is 0 when S is already
 * aligned).
 *
 * Fixed: dropped the stray semicolon that followed "while (0)" in the
 * macro definition; with it, invoking the macro as the body of an
 * if/else statement would not compile (CERT PRE11-C).
 */
#define MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(P,S) do {                 \
    (P) = ((S) % MCA_BTL_UDAPL_FRAG_ALIGN) == 0 ?                       \
        0 : (MCA_BTL_UDAPL_FRAG_ALIGN - ((S) % MCA_BTL_UDAPL_FRAG_ALIGN)); \
} while (0)
#if defined(c_plusplus) || defined(__cplusplus)
}

32
ompi/mca/btl/udapl/help-mpi-btl-udapl.txt Обычный файл
Просмотреть файл

@ -0,0 +1,32 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open MPI.
#
[invalid num rdma segments]
WARNING: MCA parameter [btl_udapl_eager_rdma_num = %d] is not valid.
RDMA will not be used for short messages. Try setting to positive
value, e.g. 16.
[use default endpoint params]
WARNING: Using the default uDAPL endpoint parameters rather than the
values that would have been set by the corresponding MCA parameters.

Просмотреть файл

@ -11,6 +11,8 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -71,6 +73,8 @@ void* mca_mpool_udapl_alloc(
free(addr);
return NULL;
}
(*registration)->alloc_base = addr;
return addr;
}
@ -139,8 +143,6 @@ int mca_mpool_udapl_register(
int mca_mpool_udapl_deregister(mca_mpool_base_module_t* mpool,
mca_mpool_base_registration_t* reg)
{
int rc;
if(reg->flags & (MCA_MPOOL_FLAGS_CACHE | MCA_MPOOL_FLAGS_PERSIST)) {
mpool->rcache->rcache_delete(mpool->rcache, reg, reg->flags);
reg->flags = 0;