diff --git a/ompi/mca/btl/udapl/Makefile.am b/ompi/mca/btl/udapl/Makefile.am index 19f30bf214..2873c00c93 100644 --- a/ompi/mca/btl/udapl/Makefile.am +++ b/ompi/mca/btl/udapl/Makefile.am @@ -9,6 +9,8 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved. +# # $COPYRIGHT$ # # Additional copyrights may follow @@ -21,7 +23,10 @@ CFLAGS = $(btl_udapl_CFLAGS) -AM_CPPFLAGS = $(btl_udapl_CPPFLAGS) +AM_CPPFLAGS = $(btl_udapl_CPPFLAGS) -DPKGDATADIR=\"$(pkgdatadir)\" + +dist_pkgdata_DATA = \ + help-mpi-btl-udapl.txt udapl_sources = \ btl_udapl.c \ @@ -31,6 +36,7 @@ udapl_sources = \ btl_udapl_endpoint.h \ btl_udapl_frag.c \ btl_udapl_frag.h \ + btl_udapl_eager_rdma.h \ btl_udapl_proc.c \ btl_udapl_proc.h diff --git a/ompi/mca/btl/udapl/btl_udapl.c b/ompi/mca/btl/udapl/btl_udapl.c index a95c97142e..67caef1299 100644 --- a/ompi/mca/btl/udapl/btl_udapl.c +++ b/ompi/mca/btl/udapl/btl_udapl.c @@ -25,6 +25,7 @@ #include #include "opal/util/output.h" #include "opal/util/if.h" +#include "opal/util/show_help.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/btl/btl.h" @@ -36,6 +37,7 @@ #include "ompi/datatype/datatype.h" #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/rdma/mpool_rdma.h" +#include "ompi/mca/btl/base/btl_base_error.h" #include "ompi/proc/proc.h" static int udapl_reg_mr(void *reg_data, void *base, size_t size, @@ -61,9 +63,9 @@ mca_btl_udapl_module_t mca_btl_udapl_module = { mca_btl_udapl_alloc, mca_btl_udapl_free, mca_btl_udapl_prepare_src, - NULL, /* prepare_dst */ + mca_btl_udapl_prepare_dst, mca_btl_udapl_send, - NULL, /* put */ + mca_btl_udapl_put, NULL, /* get */ mca_btl_base_dump, NULL, /* mpool */ @@ -106,8 +108,13 @@ static int udapl_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg) if(udapl_reg->lmr != NULL) { rc = dat_lmr_free(udapl_reg->lmr); if(rc != DAT_SUCCESS) { - 
opal_output(0, "%s: error unpinning dapl memory errno says %s\n", - __func__, strerror(errno)); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_lmr_free", + major, minor)); return OMPI_ERROR; } } @@ -132,14 +139,26 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl) rc = dat_ia_open(ia_name, mca_btl_udapl_component.udapl_evd_qlen, &btl->udapl_evd_async, &btl->udapl_ia); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ia_open"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ia_open", + major, minor)); return OMPI_ERROR; } /* create a protection zone */ rc = dat_pz_create(btl->udapl_ia, &btl->udapl_pz); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_pz_create"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_pz_create", + major, minor)); goto failure; } @@ -148,7 +167,13 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl) rc = dat_ia_query(btl->udapl_ia, &btl->udapl_evd_async, DAT_IA_FIELD_IA_ADDRESS_PTR, &attr, 0, NULL); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ia_query"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ia_query", + major, minor)); goto failure; } @@ -159,7 +184,13 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl) mca_btl_udapl_component.udapl_evd_qlen, DAT_HANDLE_NULL, DAT_EVD_DTO_FLAG | DAT_EVD_RMR_BIND_FLAG, &btl->udapl_evd_dto); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_evd_create (dto)"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_evd_create (dto)", + major, minor)); goto failure; } @@ -167,16 
+198,39 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl) mca_btl_udapl_component.udapl_evd_qlen, DAT_HANDLE_NULL, DAT_EVD_CR_FLAG | DAT_EVD_CONNECTION_FLAG, &btl->udapl_evd_conn); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_evd_create (conn)"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_evd_create (conn)", + major, minor)); goto failure; } /* create our public service point */ rc = dat_psp_create_any(btl->udapl_ia, &port, btl->udapl_evd_conn, - DAT_PSP_CONSUMER_FLAG, &btl->udapl_psp); + DAT_PSP_CONSUMER_FLAG, &btl->udapl_psp); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_psp_create_any"); - goto failure; + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_psp_create_any", + major, minor)); + goto failure; + } + + /* establish endpoint parameters */ + rc = mca_btl_udapl_endpoint_get_params(btl, &(btl->udapl_ep_param)); + if(OMPI_SUCCESS != rc) { + /* by not erroring out here we can try to continue with + * the default endpoint parameter values + */ + opal_show_help("help-mpi-btl-udapl.txt", + "use default endpoint params", + true); } /* Save the port with the address information */ @@ -211,6 +265,7 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl) OBJ_CONSTRUCT(&btl->udapl_frag_eager, ompi_free_list_t); OBJ_CONSTRUCT(&btl->udapl_frag_max, ompi_free_list_t); OBJ_CONSTRUCT(&btl->udapl_frag_user, ompi_free_list_t); + OBJ_CONSTRUCT(&btl->udapl_frag_control, ompi_free_list_t); OBJ_CONSTRUCT(&btl->udapl_lock, opal_mutex_t); /* initialize free lists */ @@ -240,6 +295,23 @@ mca_btl_udapl_init(DAT_NAME_PTR ia_name, mca_btl_udapl_module_t* btl) mca_btl_udapl_component.udapl_free_list_inc, NULL); + ompi_free_list_init(&btl->udapl_frag_control, + sizeof(mca_btl_udapl_frag_eager_t) + + mca_btl_udapl_component.udapl_eager_frag_size, + 
OBJ_CLASS(mca_btl_udapl_frag_eager_t), + mca_btl_udapl_component.udapl_free_list_num, + -1, + mca_btl_udapl_component.udapl_free_list_inc, + btl->super.btl_mpool); + + /* initialize eager rdma buffer info */ + orte_pointer_array_init(&btl->udapl_eager_rdma_endpoints, + mca_btl_udapl_component.udapl_max_eager_rdma_peers, + mca_btl_udapl_component.udapl_max_eager_rdma_peers, + 0); + btl->udapl_eager_rdma_endpoint_count = 0; + OBJ_CONSTRUCT(&btl->udapl_eager_rdma_lock, opal_mutex_t); + /* TODO - Set up SRQ when it is supported */ return OMPI_SUCCESS; @@ -256,6 +328,20 @@ int mca_btl_udapl_finalize(struct mca_btl_base_module_t* base_btl) { mca_btl_udapl_module_t* udapl_btl = (mca_btl_udapl_module_t*) base_btl; + /* + * Cleaning up the endpoints here because mca_btl_udapl_del_procs + * is never called by upper layers. + * Note: this is only looking at those endpoints which are available + * off of the btl module rdma list. + */ + for (int i=0; i < udapl_btl->udapl_eager_rdma_endpoint_count; i++) { + mca_btl_udapl_endpoint_t* endpoint = + orte_pointer_array_get_item(udapl_btl->udapl_eager_rdma_endpoints, + i); + + OBJ_DESTRUCT(endpoint); + } + /* release uDAPL resources */ dat_evd_free(udapl_btl->udapl_evd_dto); dat_evd_free(udapl_btl->udapl_evd_conn); @@ -267,7 +353,9 @@ int mca_btl_udapl_finalize(struct mca_btl_base_module_t* base_btl) OBJ_DESTRUCT(&udapl_btl->udapl_frag_eager); OBJ_DESTRUCT(&udapl_btl->udapl_frag_max); OBJ_DESTRUCT(&udapl_btl->udapl_frag_user); - + OBJ_DESTRUCT(&udapl_btl->udapl_frag_control); + OBJ_DESTRUCT(&udapl_btl->udapl_eager_rdma_lock); + free(udapl_btl); return OMPI_SUCCESS; } @@ -377,29 +465,33 @@ mca_btl_base_descriptor_t* mca_btl_udapl_alloc( mca_btl_udapl_module_t* udapl_btl = (mca_btl_udapl_module_t*) btl; mca_btl_udapl_frag_t* frag; int rc; + int pad = 0; + + /* compute pad as needed */ + MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad, + (size + sizeof(mca_btl_udapl_footer_t))); - if(size <= btl->btl_eager_limit) { + if((size + pad) <= 
btl->btl_eager_limit) { MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(udapl_btl, frag, rc); - frag->segment.seg_len = - size <= btl->btl_eager_limit ? - size : btl->btl_eager_limit; } else if(size <= btl->btl_max_send_size) { MCA_BTL_UDAPL_FRAG_ALLOC_MAX(udapl_btl, frag, rc); - frag->segment.seg_len = - size <= btl->btl_max_send_size ? - size : btl->btl_max_send_size; } else { return NULL; } - /* Set up the LMR triplet from the frag segment */ - /* Note that this triplet defines a sub-region of a registered LMR */ + frag->segment.seg_len = size; + + /* Set up the LMR triplet from the frag segment. + * Note: The triplet.segment_len is set to what is required for + * actually sending the fragment, if later it is determined + * that rdma can be used to transfer the fragment the + * triplet.segment_len will have to change. + */ frag->triplet.virtual_address = (DAT_VADDR)frag->segment.seg_addr.pval; - frag->ftr = (mca_btl_udapl_footer_t *) - ((char *)frag->segment.seg_addr.pval + frag->segment.seg_len); frag->triplet.segment_length = frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t); - assert(frag->triplet.lmr_context == frag->registration->lmr_triplet.lmr_context); + assert(frag->triplet.lmr_context == + frag->registration->lmr_triplet.lmr_context); frag->btl = udapl_btl; frag->base.des_src = &frag->segment; @@ -424,13 +516,13 @@ int mca_btl_udapl_free( if(frag->size == 0 && frag->registration != NULL) { btl->btl_mpool->mpool_deregister(btl->btl_mpool, - (mca_mpool_base_registration_t*)frag->registration); + (mca_mpool_base_registration_t*)frag->registration); MCA_BTL_UDAPL_FRAG_RETURN_USER(btl, frag); } else if(frag->size == mca_btl_udapl_component.udapl_eager_frag_size) { MCA_BTL_UDAPL_FRAG_RETURN_EAGER(btl, frag); } else if(frag->size == mca_btl_udapl_component.udapl_max_frag_size) { MCA_BTL_UDAPL_FRAG_RETURN_MAX(btl, frag); - } else { + } else { OPAL_OUTPUT((0, "[%s:%d] mca_btl_udapl_free: invalid descriptor\n", __FILE__,__LINE__)); return OMPI_ERR_BAD_PARAM; } @@ -453,152 
+545,99 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src( size_t* size ) { - mca_btl_udapl_frag_t* frag; + mca_btl_udapl_frag_t* frag = NULL; struct iovec iov; uint32_t iov_count = 1; size_t max_data = *size; int rc; + int pad = 0; -#if 0 - /* - * If the data has already been pinned and is contigous than we can - * use it in place. - */ - if (NULL != registration && 0 == ompi_convertor_need_buffers(convertor)) { - size_t reg_len; - OPAL_OUTPUT((0, "udapl_prepare_src 1\n")); + /* compute pad as needed */ + MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad, + (max_data + reserve + sizeof(mca_btl_udapl_footer_t))); - MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc); - if(NULL == frag){ - return NULL; - } + if(ompi_convertor_need_buffers(convertor) == false && 0 == reserve) { + if(registration != NULL || max_data > btl->btl_max_send_size) { - iov.iov_len = max_data; - iov.iov_base = NULL; + MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc); + if(NULL == frag){ + return NULL; + } - ompi_convertor_pack(convertor, &iov, + iov.iov_len = max_data; + iov.iov_base = NULL; + + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - frag->triplet.segment_length = max_data; - frag->triplet.virtual_address = (DAT_VADDR)iov.iov_base; - - reg_len = (unsigned char*)registration->bound - - (unsigned char*)iov.iov_base + 1; + *size = max_data; - /* bump reference count as so that the registration - * doesn't go away when the operation completes - */ - btl->btl_mpool->mpool_retain(btl->btl_mpool, registration); - frag->registration = registration; - frag->triplet.lmr_context = - ((mca_mpool_udapl_registration_t*)registration)->lmr_triplet.lmr_context; + if(NULL == registration) { + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, iov.iov_base, + max_data, 0, + &registration); - /* - * if the data is not already pinned - but the leave pinned option is set, - * then go ahead and pin contigous data. 
however, if a reserve is required - * then we must allocate a fragment w/ buffer space - */ - } else if (max_data > btl->btl_max_send_size && - ompi_convertor_need_buffers(convertor) == 0 && - reserve == 0) { + if(rc != OMPI_SUCCESS) { + MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag); + return NULL; + } + /* keep track of the registration we did */ + frag->registration = (mca_btl_udapl_reg_t*)registration; + } - mca_mpool_base_module_t* mpool = btl->btl_mpool; - MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc); - if(NULL == frag){ - return NULL; - } - - OPAL_OUTPUT((0, "udapl_prepare_src 2\n")); + frag->segment.seg_len = max_data; + frag->segment.seg_addr.pval = iov.iov_base; + frag->triplet.segment_length = max_data; + frag->triplet.virtual_address = (DAT_VADDR)iov.iov_base; + frag->triplet.lmr_context = + ((mca_btl_udapl_reg_t*)registration)->lmr_triplet.lmr_context; - iov.iov_len = max_data; - iov.iov_base = NULL; - - ompi_convertor_pack(convertor, &iov, - &iov_count, &max_data ); - - - rc = mpool->mpool_register( - mpool, - iov.iov_base, - max_data, - 0, - &registration); - - if(rc != OMPI_SUCCESS) { - MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag); - return NULL; - } - - frag->registration = registration; - frag->triplet.lmr_context = - ((mca_mpool_udapl_registration_t*)registration)->lmr_triplet.lmr_context; - /* TODO - should our base addr be frag->ftr? 
*/ - frag->segment.seg_len = max_data; - frag->segment.seg_addr.pval = iov.iov_base; - frag->triplet.segment_length = max_data; - frag->triplet.virtual_address = (DAT_VADDR)iov.iov_base; - } - - /* - * if we aren't pinning the data and the requested size is less - * than the eager limit pack into a fragment from the eager pool - */ - else -#endif - if(max_data + reserve <= btl->btl_eager_limit) { - MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(btl, frag, rc); - if(NULL == frag) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = (char *) frag->segment.seg_addr.pval + reserve; - - rc = ompi_convertor_pack(convertor, - &iov, &iov_count, &max_data ); - *size = max_data; - if(rc < 0) { - MCA_BTL_UDAPL_FRAG_RETURN_EAGER(btl, frag); - return NULL; + /* initialize base descriptor */ + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + + return &frag->base; } } - /* - * otherwise pack as much data as we can into a fragment - * that is the max send size. 
- */ - else { + if(max_data + pad + reserve <= btl->btl_eager_limit) { + /* the data is small enough to fit in the eager frag and + * memory is not prepinned */ + MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(btl, frag, rc); + } + + if(NULL == frag) { + /* the data doesn't fit into eager frag or eager frag is + * not available */ MCA_BTL_UDAPL_FRAG_ALLOC_MAX(btl, frag, rc); if(NULL == frag) { return NULL; } - - if(max_data + reserve > btl->btl_max_send_size){ + if(max_data + reserve > btl->btl_max_send_size) { max_data = btl->btl_max_send_size - reserve; } - - iov.iov_len = max_data; - iov.iov_base = (char *) frag->segment.seg_addr.pval + reserve; - - rc = ompi_convertor_pack(convertor, - &iov, &iov_count, &max_data ); - *size = max_data; - - if(rc < 0) { - MCA_BTL_UDAPL_FRAG_RETURN_MAX(btl, frag); - return NULL; - } } + + iov.iov_len = max_data; + iov.iov_base = (char *) frag->segment.seg_addr.pval + reserve; + + rc = ompi_convertor_pack(convertor, + &iov, &iov_count, &max_data ); + if(rc < 0) { + MCA_BTL_UDAPL_FRAG_RETURN_MAX(btl, frag); + return NULL; + } + + *size = max_data; /* setup lengths and addresses to send out data */ frag->segment.seg_len = max_data + reserve; frag->triplet.segment_length = - max_data + reserve + sizeof(mca_btl_udapl_footer_t); + max_data + reserve + sizeof(mca_btl_udapl_footer_t); frag->triplet.virtual_address = (DAT_VADDR)frag->segment.seg_addr.pval; - frag->ftr = (mca_btl_udapl_footer_t *) - ((char *)frag->segment.seg_addr.pval + frag->segment.seg_len); /* initialize base descriptor */ frag->base.des_src = &frag->segment; @@ -606,13 +645,14 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src( frag->base.des_dst = NULL; frag->base.des_dst_cnt = 0; frag->base.des_flags = 0; + return &frag->base; } /** * Prepare a descriptor for send/rdma using the supplied - * convertor. If the convertor references data that is contigous, + * convertor. If the convertor references data that is contiguous, * the descriptor may simply point to the user buffer. 
Otherwise, * this routine is responsible for allocating buffer space and * packing if required. @@ -623,7 +663,6 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_src( * @param reserve (IN) Additional bytes requested by upper layer to precede user data * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT) */ -#if 0 mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, @@ -633,12 +672,9 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst( size_t* size) { mca_btl_udapl_frag_t* frag; - mca_mpool_base_module_t* mpool = btl->btl_mpool; ptrdiff_t lb; int rc; - OPAL_OUTPUT((0, "udapl_prepare_dst\n")); - MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc); if(NULL == frag) { return NULL; @@ -648,46 +684,40 @@ mca_btl_base_descriptor_t* mca_btl_udapl_prepare_dst( frag->segment.seg_len = *size; frag->segment.seg_addr.pval = convertor->pBaseBuf + lb + convertor->bConverted; + if(NULL == registration) { + /* didn't get a memory registration passed in, so must + * register the region now + */ + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, + frag->segment.seg_addr.pval, + frag->segment.seg_len, + 0, + &registration); + if(OMPI_SUCCESS != rc || NULL == registration) { + MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag); + return NULL; + } + frag->registration = (mca_btl_udapl_reg_t*)registration; + } + frag->base.des_src = NULL; frag->base.des_src_cnt = 0; frag->base.des_dst = &frag->segment; frag->base.des_dst_cnt = 1; frag->base.des_flags = 0; - if(NULL != registration) { - /* bump reference count as so that the registration - * doesn't go away when the operation completes - */ - - mpool->mpool_retain(mpool, - (mca_mpool_base_registration_t*) registration); - - frag->registration = registration; - } else { - - rc = mpool->mpool_register( - mpool, - frag->segment.seg_addr.pval, - frag->segment.seg_len, - 0, - &registration); - if(rc != OMPI_SUCCESS) { - 
MCA_BTL_UDAPL_FRAG_RETURN_USER(btl,frag); - return NULL; - } - - frag->registration = registration; - } + frag->segment.seg_key.key32[0] = + ((mca_btl_udapl_reg_t*)registration)->rmr_context; + return &frag->base; } -#endif /** * Initiate an asynchronous send. * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transfered + * @param descriptor (IN) Description of the data to be transferred * @param tag (IN) The tag value used to notify the peer. */ @@ -700,10 +730,9 @@ int mca_btl_udapl_send( { mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)des; - frag->btl = (mca_btl_udapl_module_t*)btl; frag->endpoint = endpoint; frag->ftr = (mca_btl_udapl_footer_t *) - ((char *)frag->segment.seg_addr.pval + frag->segment.seg_len); + ((char *)frag->segment.seg_addr.pval + frag->segment.seg_len); frag->ftr->tag = tag; frag->type = MCA_BTL_UDAPL_SEND; @@ -726,8 +755,57 @@ int mca_btl_udapl_put( mca_btl_base_endpoint_t* endpoint, mca_btl_base_descriptor_t* des) { - OPAL_OUTPUT((0, "udapl_put\n")); - return OMPI_ERR_NOT_IMPLEMENTED; + DAT_RMR_TRIPLET remote_buffer; + DAT_DTO_COOKIE cookie; + int rc = OMPI_SUCCESS; + + mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)des; + mca_btl_base_segment_t *src_segment = des->des_src; + mca_btl_base_segment_t *dst_segment = des->des_dst; + + frag->btl = (mca_btl_udapl_module_t *)btl; + frag->endpoint = endpoint; + frag->type = MCA_BTL_UDAPL_PUT; + + if(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], -1) < 0) { + OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], 1); + OPAL_THREAD_LOCK(&endpoint->endpoint_lock); + opal_list_append(&endpoint->endpoint_max_frags, + (opal_list_item_t*)frag); + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + opal_progress(); + } else { + frag->triplet.segment_length = frag->segment.seg_len; + + remote_buffer.rmr_context = + (DAT_RMR_CONTEXT)dst_segment->seg_key.key32[0]; + 
remote_buffer.target_address = + (DAT_VADDR)dst_segment->seg_addr.pval; + remote_buffer.segment_length = dst_segment->seg_len; + + cookie.as_ptr = frag; + + OPAL_THREAD_LOCK(&endpoint->endpoint_lock); + rc = dat_ep_post_rdma_write(endpoint->endpoint_max, + 1, + &frag->triplet, + cookie, + &remote_buffer, + DAT_COMPLETION_DEFAULT_FLAG); + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + if(DAT_SUCCESS != rc) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write", + major, minor)); + rc = OMPI_ERROR; + } + } + + return rc; } diff --git a/ompi/mca/btl/udapl/btl_udapl.h b/ompi/mca/btl/udapl/btl_udapl.h index 2f1fdc3da7..2a73c69198 100644 --- a/ompi/mca/btl/udapl/btl_udapl.h +++ b/ompi/mca/btl/udapl/btl_udapl.h @@ -29,6 +29,7 @@ #include /* Open MPI includes */ +#include "orte/class/orte_pointer_array.h" #include "ompi/class/ompi_free_list.h" #include "ompi/class/ompi_bitmap.h" #include "opal/event/event.h" @@ -56,17 +57,45 @@ struct mca_btl_udapl_component_t { size_t udapl_max_btls; /**< maximum number of supported hcas */ struct mca_btl_udapl_module_t **udapl_btls; /**< array of available BTL modules */ size_t udapl_evd_qlen; + size_t udapl_max_request_dtos; /**< maximum number of outstanding consumer + submitted sends and rdma operations, see + section 6.6.6 of uDAPL Spec */ + size_t udapl_max_recv_dtos; /**< maximum number of outstanding consumer + submitted recv operations, see section 6.6.6 + of uDAPL Spec */ int32_t udapl_num_recvs; /**< number of recv buffers to keep posted */ int32_t udapl_num_sends; /**< number of sends to post on endpoint */ + int32_t udapl_sr_win; /**< number of fragments received before + returning credits to sender */ int32_t udapl_timeout; /**< connection timeout, in microseconds */ - + int32_t udapl_eager_rdma_guarantee;/**< uDAPL does not guarantee + the order of data written to + buffer, if the interface + card in use guarantees 
front + to back order of data + written then this flag + should remain as set by + default (off) otherwise + latency overhead will + increase if turned on */ size_t udapl_eager_frag_size; size_t udapl_max_frag_size; - + size_t udapl_eager_rdma_frag_size; /* size of the rdma fragment including data + * payload space + */ + int udapl_free_list_num; /**< initial size of free lists */ int udapl_free_list_max; /**< maximum size of free lists */ int udapl_free_list_inc; /**< number of elements to alloc when growing */ - + int32_t udapl_eager_rdma_num; /**< number of rdma buffers allocated + for short messages */ + int32_t udapl_max_eager_rdma_peers; /**< maximum number of peers allowed to + use RDMA for short messages (cap) + */ + int32_t udapl_eager_rdma_win; /**< number of eager RDMA fragments + received before returning credits to + sender */ + opal_list_t udapl_procs; /**< list of udapl proc structures */ opal_mutex_t udapl_lock; /**< lock for accessing module state */ char* udapl_mpool_name; /**< name of memory pool */ @@ -90,7 +119,8 @@ struct mca_btl_udapl_module_t { DAT_IA_HANDLE udapl_ia; DAT_PZ_HANDLE udapl_pz; DAT_PSP_HANDLE udapl_psp; - + DAT_EP_PARAM udapl_ep_param; + /* event dispatchers - async, data transfer, connection negotiation */ DAT_EVD_HANDLE udapl_evd_async; DAT_EVD_HANDLE udapl_evd_dto; @@ -100,8 +130,19 @@ struct mca_btl_udapl_module_t { ompi_free_list_t udapl_frag_eager; ompi_free_list_t udapl_frag_max; ompi_free_list_t udapl_frag_user; - + ompi_free_list_t udapl_frag_control; + opal_mutex_t udapl_lock; /* lock for accessing module state */ + + opal_mutex_t udapl_eager_rdma_lock; /* eager rdma lock */ + uint32_t udapl_eager_rdma_endpoint_count; /* count of the number of + * endpoints in + * udapl_eager_rdma_endpoints + */ + orte_pointer_array_t *udapl_eager_rdma_endpoints; /* array of endpoints + * with eager rdma + * connections + */ }; typedef struct mca_btl_udapl_module_t mca_btl_udapl_module_t; extern mca_btl_udapl_module_t 
mca_btl_udapl_module; @@ -231,7 +272,7 @@ extern int mca_btl_udapl_del_procs( * * @param btl (IN) BTL module * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transfered + * @param descriptor (IN) Description of the data to be transferred * @param tag (IN) The tag value used to notify the peer. */ diff --git a/ompi/mca/btl/udapl/btl_udapl_component.c b/ompi/mca/btl/udapl/btl_udapl_component.c index 18e6684a6e..f55d5c919e 100644 --- a/ompi/mca/btl/udapl/btl_udapl_component.c +++ b/ompi/mca/btl/udapl/btl_udapl_component.c @@ -154,13 +154,27 @@ int mca_btl_udapl_component_open(void) mca_btl_udapl_param_register_int("max_modules", 8); mca_btl_udapl_component.udapl_evd_qlen = mca_btl_udapl_param_register_int("evd_qlen", 32); + mca_btl_udapl_component.udapl_max_request_dtos = + mca_btl_udapl_param_register_int("max_request_dtos", 18); + mca_btl_udapl_component.udapl_max_recv_dtos = + mca_btl_udapl_param_register_int("max_recv_dtos", 18); mca_btl_udapl_component.udapl_num_recvs = mca_btl_udapl_param_register_int("num_recvs", 8); mca_btl_udapl_component.udapl_num_sends = - mca_btl_udapl_param_register_int("num_sends", 8); + mca_btl_udapl_param_register_int("num_sends", 7); + mca_btl_udapl_component.udapl_sr_win = + mca_btl_udapl_param_register_int("sr_win", 4); + mca_btl_udapl_component.udapl_eager_rdma_num = + mca_btl_udapl_param_register_int("eager_rdma_num", 8); + mca_btl_udapl_component.udapl_max_eager_rdma_peers = + mca_btl_udapl_param_register_int("max_eager_rdma_peers", 16); + mca_btl_udapl_component.udapl_eager_rdma_win = + mca_btl_udapl_param_register_int("eager_rdma_win", 4); mca_btl_udapl_component.udapl_timeout = mca_btl_udapl_param_register_int("timeout", 10000000); - + mca_btl_udapl_component.udapl_eager_rdma_guarantee = + mca_btl_udapl_param_register_int("eager_rdma_guarantee", 0); + /* register uDAPL module parameters */ mca_btl_udapl_module.super.btl_exclusivity = mca_btl_udapl_param_register_int 
("exclusivity", @@ -175,6 +189,8 @@ int mca_btl_udapl_component_open(void) mca_btl_udapl_param_register_int("min_rdma_size", 512*1024); mca_btl_udapl_module.super.btl_max_rdma_size = mca_btl_udapl_param_register_int("max_rdma_size", 128*1024); + mca_btl_udapl_module.super.btl_flags = + mca_btl_udapl_param_register_int("flags", MCA_BTL_FLAGS_PUT); mca_btl_udapl_module.super.btl_bandwidth = mca_btl_udapl_param_register_int("bandwidth", 225); @@ -182,13 +198,17 @@ int mca_btl_udapl_component_open(void) mca_btl_udapl_component.udapl_eager_frag_size = mca_btl_udapl_module.super.btl_eager_limit; mca_btl_udapl_module.super.btl_eager_limit -= - sizeof(mca_btl_udapl_footer_t); + (sizeof(mca_btl_udapl_footer_t) + sizeof(mca_btl_udapl_rdma_footer_t)); mca_btl_udapl_component.udapl_max_frag_size = mca_btl_udapl_module.super.btl_max_send_size; mca_btl_udapl_module.super.btl_max_send_size -= - sizeof(mca_btl_udapl_footer_t); + (sizeof(mca_btl_udapl_footer_t) + sizeof(mca_btl_udapl_rdma_footer_t)); + /* compute udapl_eager_rdma_frag_size */ + mca_btl_udapl_component.udapl_eager_rdma_frag_size = + sizeof(mca_btl_udapl_frag_eager_rdma_t) + + mca_btl_udapl_component.udapl_eager_frag_size; /* leave pinned option */ value = 0; @@ -247,6 +267,81 @@ mca_btl_udapl_modex_send(void) } +/* + * Callback function used for udapl btl internal control messages. + * + * @param btl (IN) BTL module + * @param tag (IN) Not used but part of callback interface + * @param descriptor (IN) Description of the data that was just transferred + * @param cbdata (IN) Data used by call back function. Not used. 
+ * + */ +static void mca_btl_udapl_receive_control(struct mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* descriptor, + void* cbdata) +{ + mca_btl_udapl_frag_t* frag = (mca_btl_udapl_frag_t*)descriptor; + mca_btl_udapl_endpoint_t* endpoint = frag->endpoint; + mca_btl_udapl_control_header_t* ctl_hdr = + frag->segment.seg_addr.pval; + + switch (ctl_hdr->type) { + case MCA_BTL_UDAPL_CONTROL_RDMA_CONNECT: + { + mca_btl_udapl_eager_rdma_connect_t* rdma_connect = + frag->segment.seg_addr.pval; + + if (endpoint->endpoint_eager_rdma_remote.base.pval) { + BTL_ERROR(("ERROR: Received RDMA connect twice!")); + return; + } + endpoint->endpoint_eager_rdma_remote.rkey = rdma_connect->rkey; + endpoint->endpoint_eager_rdma_remote.base.pval = + rdma_connect->rdma_start.pval; + + OPAL_THREAD_ADD32(&(endpoint->endpoint_eager_rdma_remote.tokens), + mca_btl_udapl_component.udapl_eager_rdma_num); + + break; + } + case MCA_BTL_UDAPL_CONTROL_RDMA_CREDIT: + { + mca_btl_udapl_eager_rdma_credit_t* rdma_credit = + frag->segment.seg_addr.pval; + + /* don't return credits used for rdma credit control message */ + OPAL_THREAD_ADD32( + &(endpoint->endpoint_sr_credits[BTL_UDAPL_EAGER_CONNECTION]), + -1); + + OPAL_THREAD_ADD32(&(endpoint->endpoint_eager_rdma_remote.tokens), + rdma_credit->credits); + + break; + } + case MCA_BTL_UDAPL_CONTROL_SR_CREDIT: + { + mca_btl_udapl_sr_credit_t* sr_credit = + frag->segment.seg_addr.pval; + + /* don't return credits used for sr credit control message */ + OPAL_THREAD_ADD32( + &(endpoint->endpoint_sr_credits[sr_credit->connection]), -1); + + OPAL_THREAD_ADD32( + &(endpoint->endpoint_sr_tokens[sr_credit->connection]), + sr_credit->credits); + + break; + } + default: + BTL_ERROR(("ERROR: Unknown contrl message type received by BTL")); + break; + } +} + + /* * Initialize the uDAPL component, * check how many interfaces are available and create a btl module for each. 
@@ -316,6 +411,10 @@ mca_btl_udapl_component_init (int *num_btl_modules, continue; } + /* register internal control message callback */ + btl->udapl_reg[MCA_BTL_TAG_BTL].cbfunc = mca_btl_udapl_receive_control; + btl->udapl_reg[MCA_BTL_TAG_BTL].cbdata = NULL; + /* successful btl creation */ mca_btl_udapl_component.udapl_btls[mca_btl_udapl_component.udapl_num_btls] = btl; if(++mca_btl_udapl_component.udapl_num_btls >= @@ -365,17 +464,21 @@ static int mca_btl_udapl_accept_connect(mca_btl_udapl_module_t* btl, DAT_EP_HANDLE endpoint; int rc; - rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz, - btl->udapl_evd_dto, btl->udapl_evd_dto, - btl->udapl_evd_conn, NULL, &endpoint); - if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_create"); + rc = mca_btl_udapl_endpoint_create(btl, &endpoint); + if(OMPI_SUCCESS != rc) { + BTL_ERROR(("ERROR: mca_btl_udapl_endpoint_create")); return OMPI_ERROR; } - + rc = dat_cr_accept(cr_handle, endpoint, 0, NULL); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_cr_accept"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_cr_accept", + major, minor)); return OMPI_ERROR; } @@ -402,7 +505,13 @@ static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl, rc = dat_ep_post_recv(endpoint, 1, &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_recv"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_recv", + major, minor)); return OMPI_ERROR; } @@ -424,13 +533,99 @@ static inline int mca_btl_udapl_sendrecv(mca_btl_udapl_module_t* btl, rc = dat_ep_post_send(endpoint, 1, &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const 
char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", + major, minor)); return OMPI_ERROR; } return OMPI_SUCCESS; } +static inline int mca_btl_udapl_frag_progress_one( + mca_btl_udapl_module_t* udapl_btl, + mca_btl_udapl_frag_t* frag) +{ + int rc; + + switch(frag->type) { + case MCA_BTL_UDAPL_SEND: + rc = mca_btl_udapl_endpoint_send(frag->endpoint, frag); + break; + case MCA_BTL_UDAPL_PUT: + rc = mca_btl_udapl_put((mca_btl_base_module_t*)udapl_btl, + frag->endpoint, + (mca_btl_base_descriptor_t*)frag); + break; + default: + rc = OMPI_ERROR; + BTL_ERROR(("Error : Progressing pending operation, invalid type %d\n", + frag->type)); + break; + } + + return rc; +} + +void mca_btl_udapl_frag_progress_pending(mca_btl_udapl_module_t* udapl_btl, + mca_btl_base_endpoint_t* endpoint, + uint32_t connection) +{ + int len; + int i; + mca_btl_udapl_frag_t* frag; + + if (BTL_UDAPL_EAGER_CONNECTION == connection) { + len = opal_list_get_size(&endpoint->endpoint_eager_frags); + + /* progress eager frag queue as needed */ + for(i = 0; i < len && + BTL_UDAPL_TOKENS(endpoint, connection) > 0; i++) { + + OPAL_THREAD_LOCK(&endpoint->endpoint_lock); + frag = (mca_btl_udapl_frag_t*)opal_list_remove_first(&(endpoint->endpoint_eager_frags)); + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + if(NULL == frag) { + return; + } + if(mca_btl_udapl_frag_progress_one(udapl_btl, frag) != + OMPI_SUCCESS) { + BTL_ERROR(("ERROR: Not able to progress on connection(%d)\n", + BTL_UDAPL_EAGER_CONNECTION)); + return; + } + } + + } else if (BTL_UDAPL_MAX_CONNECTION == connection) { + len = opal_list_get_size(&endpoint->endpoint_max_frags); + + /* progress max frag queue as needed */ + for(i = 0; i < len && + BTL_UDAPL_TOKENS(endpoint, connection) > 0; i++) { + + OPAL_THREAD_LOCK(&endpoint->endpoint_lock); + frag = (mca_btl_udapl_frag_t*)opal_list_remove_first(&(endpoint->endpoint_max_frags)); + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + if(NULL == frag) { + return; + } + 
if(mca_btl_udapl_frag_progress_one(udapl_btl, frag) != + OMPI_SUCCESS) { + BTL_ERROR(("ERROR: Not able to progress on connection(%d)\n", + BTL_UDAPL_MAX_CONNECTION)); + return; + } + } + + } else { + BTL_ERROR(("ERROR: Can not progress pending fragment on unknown connection\n")); + } + return; +} /* * uDAPL component progress. @@ -444,8 +639,8 @@ int mca_btl_udapl_component_progress() #if defined(__SVR4) && defined(__sun) DAT_COUNT nmore; /* used by dat_evd_wait, see comment below */ #endif + int i, j, rdma_ep_count; int count = 0; - size_t i; /* prevent deadlock - only one thread should be 'progressing' at a time */ if(OPAL_THREAD_ADD32(&inprogress, 1) > 1) { @@ -465,152 +660,200 @@ int mca_btl_udapl_component_progress() mca_btl_udapl_frag_t* frag; switch(event.event_number) { - case DAT_DTO_COMPLETION_EVENT: - dto = &event.event_data.dto_completion_event_data; + case DAT_DTO_COMPLETION_EVENT: + dto = &event.event_data.dto_completion_event_data; - frag = dto->user_cookie.as_ptr; - /* Was the DTO successful? */ - if(DAT_DTO_SUCCESS != dto->status) { - OPAL_OUTPUT((0, - "btl_udapl ***** DTO error %d %d %d %p*****\n", - dto->status, frag->type, frag->size, dto->ep_handle)); - break; - } + frag = dto->user_cookie.as_ptr; + /* if we are using the "guarantee" rdma code path + * the extra write sets cookie to NULL, when this + * happens we ignore it because the completion + * write event is coming + */ + if (frag == NULL) break; - switch(frag->type) { - case MCA_BTL_UDAPL_SEND: - { - mca_btl_udapl_endpoint_t* endpoint = frag->endpoint; - /*OPAL_OUTPUT((0, "btl_udapl UDAPL_SEND %d", - dto->transfered_length));*/ + /* Was the DTO successful? 
*/ + if(DAT_DTO_SUCCESS != dto->status) { + OPAL_OUTPUT((0, + "btl_udapl ***** DTO error %d %d %d %p*****\n", + dto->status, frag->type, frag->size, dto->ep_handle)); + break; + } - assert(frag->base.des_src == &frag->segment); - assert(frag->base.des_src_cnt == 1); - assert(frag->base.des_dst == NULL); - assert(frag->base.des_dst_cnt == 0); - assert(frag->type == MCA_BTL_UDAPL_SEND); - - frag->base.des_cbfunc(&btl->super, frag->endpoint, - &frag->base, OMPI_SUCCESS); - - if(frag->size == - mca_btl_udapl_component.udapl_eager_frag_size) { - if(!opal_list_is_empty( - &endpoint->endpoint_eager_frags)) { - DAT_DTO_COOKIE cookie; - - frag = (mca_btl_udapl_frag_t*) - opal_list_remove_first( - &endpoint->endpoint_eager_frags); + switch(frag->type) { + case MCA_BTL_UDAPL_RDMA_WRITE: + { + mca_btl_udapl_endpoint_t* endpoint = frag->endpoint; - assert(frag->triplet.segment_length == - frag->segment.seg_len + - sizeof(mca_btl_udapl_footer_t)); + assert(frag->base.des_src == &frag->segment); + assert(frag->base.des_src_cnt == 1); + assert(frag->base.des_dst == NULL); + assert(frag->base.des_dst_cnt == 0); + assert(frag->type == MCA_BTL_UDAPL_RDMA_WRITE); + + frag->base.des_cbfunc(&btl->super, frag->endpoint, + &frag->base, OMPI_SUCCESS); + + mca_btl_udapl_frag_progress_pending(btl, + frag->endpoint, + BTL_UDAPL_EAGER_CONNECTION); - cookie.as_ptr = frag; - dat_ep_post_send(endpoint->endpoint_eager, - 1, &frag->triplet, cookie, - DAT_COMPLETION_DEFAULT_FLAG); - } else { - OPAL_THREAD_ADD32( - &endpoint->endpoint_eager_sends, 1); - } - } else { - assert(frag->size == - mca_btl_udapl_component.udapl_max_frag_size); - if(!opal_list_is_empty( - &endpoint->endpoint_max_frags)) { - DAT_DTO_COOKIE cookie; - - frag = (mca_btl_udapl_frag_t*) - opal_list_remove_first( - &endpoint->endpoint_max_frags); - - assert(frag->triplet.segment_length == - frag->segment.seg_len + - sizeof(mca_btl_udapl_footer_t)); + break; + } + case MCA_BTL_UDAPL_SEND: + { + mca_btl_udapl_endpoint_t* endpoint = 
frag->endpoint; - cookie.as_ptr = frag; - dat_ep_post_send(endpoint->endpoint_max, - 1, &frag->triplet, cookie, - DAT_COMPLETION_DEFAULT_FLAG); - } else { - OPAL_THREAD_ADD32( - &endpoint->endpoint_max_sends, 1); - } - } + assert(frag->base.des_src == &frag->segment); + assert(frag->base.des_src_cnt == 1); + assert(frag->base.des_dst == NULL); + assert(frag->base.des_dst_cnt == 0); + assert(frag->type == MCA_BTL_UDAPL_SEND); - break; + frag->base.des_cbfunc(&btl->super, frag->endpoint, + &frag->base, OMPI_SUCCESS); + + if(frag->size == + mca_btl_udapl_component.udapl_eager_frag_size) { + + mca_btl_udapl_frag_progress_pending(btl, + frag->endpoint, + BTL_UDAPL_EAGER_CONNECTION); + } else { + assert(frag->size == + mca_btl_udapl_component.udapl_max_frag_size); + + mca_btl_udapl_frag_progress_pending(btl, + frag->endpoint, + BTL_UDAPL_MAX_CONNECTION); } - case MCA_BTL_UDAPL_RECV: - { - mca_btl_base_recv_reg_t* reg; + break; + } + case MCA_BTL_UDAPL_RECV: + { + mca_btl_base_recv_reg_t* reg; + int cntrl_msg = -1; + + assert(frag->base.des_dst == &frag->segment); + assert(frag->base.des_dst_cnt == 1); + assert(frag->base.des_src == NULL); + assert(frag->base.des_src_cnt == 0); + assert(frag->type == MCA_BTL_UDAPL_RECV); + assert(frag->triplet.virtual_address == + (DAT_VADDR)frag->segment.seg_addr.pval); + assert(frag->triplet.segment_length == frag->size); + assert(frag->btl == btl); - assert(frag->base.des_dst == &frag->segment); - assert(frag->base.des_dst_cnt == 1); - assert(frag->base.des_src == NULL); - assert(frag->base.des_src_cnt == 0); - assert(frag->type == MCA_BTL_UDAPL_RECV); - assert(frag->triplet.virtual_address == - (DAT_VADDR)frag->segment.seg_addr.pval); - assert(frag->triplet.segment_length == frag->size); - assert(frag->btl == btl); - - /* setup frag ftr location and do callback */ - frag->segment.seg_len = dto->transfered_length - + /* setup frag ftr location and do callback */ + frag->segment.seg_len = dto->transfered_length - 
sizeof(mca_btl_udapl_footer_t); - frag->ftr = (mca_btl_udapl_footer_t *) - ((char *)frag->segment.seg_addr.pval + - frag->segment.seg_len); - reg = &btl->udapl_reg[frag->ftr->tag]; - OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock); - reg->cbfunc(&btl->super, - frag->ftr->tag, &frag->base, reg->cbdata); - OPAL_THREAD_LOCK(&mca_btl_udapl_component.udapl_lock); + frag->ftr = (mca_btl_udapl_footer_t *) + ((char *)frag->segment.seg_addr.pval + + frag->segment.seg_len); - /* Repost the frag */ - frag->ftr = frag->segment.seg_addr.pval; - frag->segment.seg_len = - frag->size - sizeof(mca_btl_udapl_footer_t); - frag->base.des_flags = 0; + cntrl_msg = frag->ftr->tag; - if(frag->size == + reg = &btl->udapl_reg[frag->ftr->tag]; + OPAL_THREAD_UNLOCK(&mca_btl_udapl_component.udapl_lock); + + reg->cbfunc(&btl->super, + frag->ftr->tag, &frag->base, reg->cbdata); + OPAL_THREAD_LOCK(&mca_btl_udapl_component.udapl_lock); + + /* Repost the frag */ + frag->ftr = frag->segment.seg_addr.pval; + frag->segment.seg_len = + (frag->size - sizeof(mca_btl_udapl_footer_t) - + sizeof(mca_btl_udapl_rdma_footer_t)); + frag->base.des_flags = 0; + + if(frag->size == mca_btl_udapl_component.udapl_eager_frag_size) { - dat_ep_post_recv(frag->endpoint->endpoint_eager, - 1, &frag->triplet, dto->user_cookie, - DAT_COMPLETION_DEFAULT_FLAG); - } else { - assert(frag->size == - mca_btl_udapl_component.udapl_max_frag_size); - dat_ep_post_recv(frag->endpoint->endpoint_max, - 1, &frag->triplet, dto->user_cookie, - DAT_COMPLETION_DEFAULT_FLAG); + + OPAL_THREAD_ADD32(&(frag->endpoint->endpoint_sr_credits[BTL_UDAPL_EAGER_CONNECTION]), 1); + + dat_ep_post_recv(frag->endpoint->endpoint_eager, + 1, &frag->triplet, dto->user_cookie, + DAT_COMPLETION_DEFAULT_FLAG); + + if (frag->endpoint->endpoint_sr_credits[BTL_UDAPL_EAGER_CONNECTION] >= + mca_btl_udapl_component.udapl_sr_win) { + mca_btl_udapl_endpoint_send_sr_credits(frag->endpoint, + BTL_UDAPL_EAGER_CONNECTION); } - break; + if (MCA_BTL_TAG_BTL == 
cntrl_msg) { + mca_btl_udapl_frag_progress_pending(btl, + frag->endpoint, + BTL_UDAPL_EAGER_CONNECTION); + } + + } else { + assert(frag->size == + mca_btl_udapl_component.udapl_max_frag_size); + + OPAL_THREAD_ADD32(&(frag->endpoint->endpoint_sr_credits[BTL_UDAPL_MAX_CONNECTION]), 1); + + dat_ep_post_recv(frag->endpoint->endpoint_max, + 1, &frag->triplet, dto->user_cookie, + DAT_COMPLETION_DEFAULT_FLAG); + + if (frag->endpoint->endpoint_sr_credits[BTL_UDAPL_MAX_CONNECTION] >= + mca_btl_udapl_component.udapl_sr_win) { + mca_btl_udapl_endpoint_send_sr_credits(frag->endpoint, + BTL_UDAPL_MAX_CONNECTION); + } + + if (MCA_BTL_TAG_BTL == cntrl_msg) { + mca_btl_udapl_frag_progress_pending(btl, + frag->endpoint, + BTL_UDAPL_MAX_CONNECTION); + } } - case MCA_BTL_UDAPL_CONN_RECV: - mca_btl_udapl_endpoint_finish_connect(btl, - frag->segment.seg_addr.pval, - (int32_t *)((char *)frag->segment.seg_addr.pval + - sizeof(mca_btl_udapl_addr_t)), - event.event_data.connect_event_data.ep_handle); - /* No break - fall through to free */ - case MCA_BTL_UDAPL_CONN_SEND: - frag->segment.seg_len = - mca_btl_udapl_module.super.btl_eager_limit; - mca_btl_udapl_free((mca_btl_base_module_t*)btl, - (mca_btl_base_descriptor_t*)frag); - break; - default: - OPAL_OUTPUT((0, "WARNING unknown frag type: %d\n", - frag->type)); - } - count++; + + break; + } + case MCA_BTL_UDAPL_PUT: + { + mca_btl_udapl_endpoint_t* endpoint = frag->endpoint; + + assert(frag->base.des_src == &frag->segment); + assert(frag->base.des_src_cnt == 1); + assert(frag->base.des_dst_cnt == 1); + assert(frag->type == MCA_BTL_UDAPL_PUT); + + frag->base.des_cbfunc(&btl->super, frag->endpoint, + &frag->base, OMPI_SUCCESS); + + OPAL_THREAD_ADD32(&(frag->endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION]), 1); + + mca_btl_udapl_frag_progress_pending(btl, + frag->endpoint, + BTL_UDAPL_MAX_CONNECTION); + + break; + } + case MCA_BTL_UDAPL_CONN_RECV: + mca_btl_udapl_endpoint_finish_connect(btl, + frag->segment.seg_addr.pval, + (int32_t 
*)((char *)frag->segment.seg_addr.pval + + sizeof(mca_btl_udapl_addr_t)), + event.event_data.connect_event_data.ep_handle); + /* No break - fall through to free */ + case MCA_BTL_UDAPL_CONN_SEND: + frag->segment.seg_len = + mca_btl_udapl_module.super.btl_eager_limit; + mca_btl_udapl_free((mca_btl_base_module_t*)btl, + (mca_btl_base_descriptor_t*)frag); break; default: - OPAL_OUTPUT((0, "WARNING unknown dto event: %d\n", - event.event_number)); + OPAL_OUTPUT((0, "WARNING unknown frag type: %d\n", + frag->type)); + } + count++; + break; + default: + OPAL_OUTPUT((0, "WARNING unknown dto event: %d\n", + event.event_number)); } } @@ -622,9 +865,9 @@ int mca_btl_udapl_component_progress() * DAT_CONNECTION_REQUEST_EVENT. Workaround is to use * wait. This should be removed when fix available. */ - dat_evd_wait(btl->udapl_evd_conn, 0, 1, &event, &nmore)) { + dat_evd_wait(btl->udapl_evd_conn, 0, 1, &event, &nmore)) { #else - dat_evd_dequeue(btl->udapl_evd_conn, &event)) { + dat_evd_dequeue(btl->udapl_evd_conn, &event)) { #endif switch(event.event_number) { case DAT_CONNECTION_REQUEST_EVENT: @@ -653,28 +896,119 @@ int mca_btl_udapl_component_progress() case DAT_CONNECTION_EVENT_UNREACHABLE: /* Need to set the BTL endpoint to MCA_BTL_UDAPL_FAILED See dat_ep_connect documentation pdf pg 198 */ - break; + BTL_OUTPUT(("WARNING : Connection event not handled : %d\n", + event.event_number)); + break; default: - OPAL_OUTPUT((0, "WARNING unknown conn event: %d\n", - event.event_number)); + BTL_ERROR(("ERROR: unknown connection event : %d", + event.event_number)); } } /* Check async EVD */ while(DAT_SUCCESS == dat_evd_dequeue(btl->udapl_evd_async, &event)) { + switch(event.event_number) { - case DAT_ASYNC_ERROR_EVD_OVERFLOW: - case DAT_ASYNC_ERROR_IA_CATASTROPHIC: - case DAT_ASYNC_ERROR_EP_BROKEN: - case DAT_ASYNC_ERROR_TIMED_OUT: - case DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR: - break; - default: - OPAL_OUTPUT((0, "WARNING unknown async event: %d\n", - event.event_number)); + case 
DAT_ASYNC_ERROR_EVD_OVERFLOW: + case DAT_ASYNC_ERROR_IA_CATASTROPHIC: + case DAT_ASYNC_ERROR_EP_BROKEN: + case DAT_ASYNC_ERROR_TIMED_OUT: + case DAT_ASYNC_ERROR_PROVIDER_INTERNAL_ERROR: + BTL_OUTPUT(("WARNING: async event ignored : %d", + event.event_number)); + break; + default: + BTL_OUTPUT(("WARNING unknown async event: %d\n", + event.event_number)); } } + + /* + * Check eager rdma segments + */ + + /* find the number of endpoints with rdma buffers */ + rdma_ep_count = btl->udapl_eager_rdma_endpoint_count; + + for (j = 0; j < rdma_ep_count; j++) { + mca_btl_udapl_endpoint_t* endpoint; + mca_btl_udapl_frag_t *local_rdma_frag; + DAT_LMR_TRIPLET local_rdma_segment; + + endpoint = + orte_pointer_array_get_item(btl->udapl_eager_rdma_endpoints, j); + + OPAL_THREAD_LOCK(&endpoint->endpoint_eager_rdma_local.lock); + + local_rdma_frag = + MCA_BTL_UDAPL_GET_LOCAL_RDMA_FRAG(endpoint, + endpoint->endpoint_eager_rdma_local.head); + + /* sync local memory before checking if active + * Question, will narrowing sync area to just the active byte + * one, work and two, improve performance + */ + local_rdma_segment.lmr_context = + local_rdma_frag->triplet.lmr_context; + local_rdma_segment.virtual_address = + (DAT_VADDR)local_rdma_frag->segment.seg_addr.pval; + local_rdma_segment.segment_length = local_rdma_frag->size; + + dat_lmr_sync_rdma_write(endpoint->endpoint_btl->udapl_ia, + &local_rdma_segment, 1); + + if (local_rdma_frag->rdma_ftr->active == 1) { + int pad = 0; + mca_btl_base_recv_reg_t* reg; + + MCA_BTL_UDAPL_RDMA_NEXT_INDEX(endpoint->endpoint_eager_rdma_local.head); + OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock); + + /* compute pad as needed */ + MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad, + (local_rdma_frag->rdma_ftr->size + + sizeof(mca_btl_udapl_footer_t))); + + /* set fragment information */ + local_rdma_frag->ftr = (mca_btl_udapl_footer_t *) + ((char *)local_rdma_frag->rdma_ftr - + pad - + sizeof(mca_btl_udapl_footer_t)); + 
local_rdma_frag->segment.seg_len = + local_rdma_frag->rdma_ftr->size; + local_rdma_frag->segment.seg_addr.pval = (unsigned char *) + ((char *)local_rdma_frag->ftr - + local_rdma_frag->segment.seg_len); + + /* look up the callback registered for this tag and invoke it */ + reg = &btl->udapl_reg[local_rdma_frag->ftr->tag]; + reg->cbfunc(&btl->super, + local_rdma_frag->ftr->tag, &local_rdma_frag->base, reg->cbdata); + + /* repost: clear the active flag and point the segment back at the memory following the fragment header */ + local_rdma_frag->rdma_ftr->active = 0; + local_rdma_frag->segment.seg_addr.pval = + (unsigned char*)(local_rdma_frag + 1); + local_rdma_frag->segment.seg_len = + mca_btl_udapl_module.super.btl_eager_limit; + local_rdma_frag->base.des_flags = 0; + + /* increment local rdma credits; once udapl_eager_rdma_win of them have accumulated they are sent back to the peer */ + OPAL_THREAD_ADD32(&(endpoint->endpoint_eager_rdma_local.credits), + 1); + + if (endpoint->endpoint_eager_rdma_local.credits >= + mca_btl_udapl_component.udapl_eager_rdma_win) { + mca_btl_udapl_endpoint_send_eager_rdma_credits(endpoint); + } + + count++; + + } else { + OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock); + } + } /* end of rdma_ep_count loop */ } /* unlock and return */ diff --git a/ompi/mca/btl/udapl/btl_udapl_eager_rdma.h b/ompi/mca/btl/udapl/btl_udapl_eager_rdma.h new file mode 100644 index 0000000000..b1ea4c985e --- /dev/null +++ b/ompi/mca/btl/udapl/btl_udapl_eager_rdma.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_UDAPL_EAGER_RDMA_H +#define MCA_BTL_UDAPL_EAGER_RDMA_H + +/* Open MPI includes */ +#include "ompi/mca/btl/udapl/btl_udapl_endpoint.h" + + +#if defined(c_plusplus) || defined(__cplusplus) +extern "C" { +#endif + +/* + * Describe endpoint local memory region. + */ +struct mca_btl_udapl_eager_rdma_local_t { + ompi_ptr_t base; + struct mca_btl_udapl_reg_t* reg; + uint8_t head; /**< RDMA buffer to poll */ + int32_t credits; /**< number of local rdma buffers ready to be reclaimed, + reused.
Initially equal to 0. */ + opal_mutex_t lock; /**< protect access to RDMA buffer */ +}; +typedef struct mca_btl_udapl_eager_rdma_local_t mca_btl_udapl_eager_rdma_local_t; + +/* + * Describe endpoint remote memory region. + */ +struct mca_btl_udapl_eager_rdma_remote_t { + ompi_ptr_t base; + DAT_RMR_CONTEXT rkey; /**< key required to access remote memory */ + uint8_t head; /**< RDMA buffer to use */ + int32_t tokens; /**< number of available rdma buffers, initially equal + to mca parameter eager_rdma_num */ + opal_mutex_t lock; /**< protect access to RDMA buffer */ +}; +typedef struct mca_btl_udapl_eager_rdma_remote_t mca_btl_udapl_eager_rdma_remote_t; + +/* + * Encapsulate data that describes a remote memory region. + */ +struct mca_btl_udapl_eager_rdma_connect_t { + mca_btl_udapl_control_header_t control; + uint32_t rkey; + ompi_ptr_t rdma_start; +}; +typedef struct mca_btl_udapl_eager_rdma_connect_t mca_btl_udapl_eager_rdma_connect_t; + +/* + * Encapsulate data that describes rdma credit information. 
+ */ +struct mca_btl_udapl_eager_rdma_credit_t { + mca_btl_udapl_control_header_t control; + uint32_t credits; +}; +typedef struct mca_btl_udapl_eager_rdma_credit_t mca_btl_udapl_eager_rdma_credit_t; + +#define EAGER_RDMA_BUFFER_AVAILABLE (0) +#define EAGER_RDMA_BUFFER_IN_USE (0xff) + +#define MCA_BTL_UDAPL_RDMA_FRAG_IN_USE(F) do { \ + *(volatile uint8_t*) ((char*)(F) + \ + (mca_btl_udapl_component.udapl_eager_rdma_frag_size - \ + (sizeof(mca_btl_udapl_footer_t)))); \ + } while (0) + +#define MCA_BTL_UDAPL_RDMA_FRAG_ASSIGN_IN_USE(F) do { \ + *(volatile uint8_t*) ((char*)(F) + \ + (mca_btl_udapl_component.udapl_eager_rdma_frag_size- \ + (sizeof(mca_btl_udapl_footer_t)))) = EAGER_RDMA_BUFFER_IN_USE; \ + } while (0) + +#define MCA_BTL_UDAPL_RDMA_FRAG_ASSIGN_AVAILABLE(F) do { \ + *(volatile uint8_t*) ((char*)(F) + \ + (mca_btl_udapl_component.udapl_eager_rdma_frag_size - \ + (sizeof(mca_btl_udapl_footer_t)))) = EAGER_RDMA_BUFFER_AVAILABLE; \ + } while (0) + +/* Retrieve the rdma fragment at location I of endpoint E; the expansion is fully parenthesized so the result can be used directly as an operand, e.g. followed by -> */ +#define MCA_BTL_UDAPL_GET_LOCAL_RDMA_FRAG(E, I) \ + ((mca_btl_udapl_frag_t*) \ + ((char*)(E)->endpoint_eager_rdma_local.base.pval + \ + (I) * mca_btl_udapl_component.udapl_eager_rdma_frag_size)) + +/* + * Increment the index I by one while not exceeding the total number of + * available eager rdma fragments + */ +#define MCA_BTL_UDAPL_RDMA_NEXT_INDEX(I) do { \ + (I) = ((I) + 1); \ + if((I) == \ + mca_btl_udapl_component.udapl_eager_rdma_num) \ + (I) = 0; \ + } while (0) + +#if defined(c_plusplus) || defined(__cplusplus) +} +#endif +#endif diff --git a/ompi/mca/btl/udapl/btl_udapl_endpoint.c b/ompi/mca/btl/udapl/btl_udapl_endpoint.c index 70bb0bd162..fc582f1a5a 100644 --- a/ompi/mca/btl/udapl/btl_udapl_endpoint.c +++ b/ompi/mca/btl/udapl/btl_udapl_endpoint.c @@ -25,108 +25,343 @@ #include #include #include "ompi/types.h" +#include "opal/util/show_help.h" #include "orte/mca/ns/base/base.h" #include "orte/mca/oob/base/base.h" #include "orte/mca/rml/rml.h" #include
"orte/mca/errmgr/errmgr.h" #include "orte/dss/dss.h" +#include "orte/class/orte_pointer_array.h" +#include "ompi/class/ompi_free_list.h" #include "ompi/mca/mpool/rdma/mpool_rdma.h" +#include "ompi/mca/btl/base/btl_base_error.h" #include "btl_udapl.h" #include "btl_udapl_endpoint.h" #include "btl_udapl_proc.h" #include "btl_udapl_frag.h" - static void mca_btl_udapl_endpoint_send_cb(int status, orte_process_name_t* endpoint, - orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); + orte_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); static int mca_btl_udapl_start_connect(mca_btl_base_endpoint_t* endpoint); static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint, size_t size); void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint); void mca_btl_udapl_endpoint_recv(int status, orte_process_name_t* endpoint, - orte_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata); + orte_buffer_t* buffer, orte_rml_tag_t tag, + void* cbdata); static int mca_btl_udapl_endpoint_finish_eager(mca_btl_udapl_endpoint_t*); static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t*); +static void mca_btl_udapl_endpoint_connect_eager_rdma(mca_btl_udapl_endpoint_t* endpoint); +static int mca_btl_udapl_endpoint_write_eager(mca_btl_base_endpoint_t* endpoint, + mca_btl_udapl_frag_t* frag); +static int mca_btl_udapl_endpoint_eager_rdma_set_remote(void); +static void mca_btl_udapl_endpoint_control_send_cb(mca_btl_base_module_t* btl, + mca_btl_base_endpoint_t* endpoint, + mca_btl_base_descriptor_t* descriptor, + int status); +static int mca_btl_udapl_endpoint_send_eager_rdma(mca_btl_base_endpoint_t* endpoint); + + +/* + * Write a fragment + * + * @param endpoint (IN) BTL addressing information + * @param frag (IN) Fragment to be transferred + * + * @return OMPI_SUCCESS or OMPI_ERROR + */ +int mca_btl_udapl_endpoint_write_eager(mca_btl_base_endpoint_t* endpoint, + mca_btl_udapl_frag_t* frag) +{ + DAT_DTO_COOKIE cookie; + 
mca_btl_udapl_frag_eager_rdma_t* remote_frag; + char* remote_buf; + DAT_RMR_TRIPLET remote_buffer; + DAT_LMR_TRIPLET local_iov; /* one contiguous write */ + int rc = OMPI_SUCCESS; + int pad = 0; + uint8_t head = endpoint->endpoint_eager_rdma_remote.head; + + /* now that we have the head update it */ + MCA_BTL_UDAPL_RDMA_NEXT_INDEX(endpoint->endpoint_eager_rdma_remote.head); + + MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad, + (frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t))); + + /* set the rdma footer information */ + frag->rdma_ftr = (mca_btl_udapl_rdma_footer_t *) + ((char *)frag->segment.seg_addr.pval + + frag->segment.seg_len + + sizeof(mca_btl_udapl_footer_t) + + pad); + frag->rdma_ftr->active = 1; + frag->rdma_ftr->size = frag->segment.seg_len; /* this is size PML wants; + * will have to calc + * alignment + * at the other end + */ + + /* find remote fragment to be used */ + remote_frag = (mca_btl_udapl_frag_eager_rdma_t *) + ((char *)(endpoint->endpoint_eager_rdma_remote.base.pval) + + (head * mca_btl_udapl_component.udapl_eager_rdma_frag_size)); + + /* prep the fragment to be written out */ + frag->type = MCA_BTL_UDAPL_RDMA_WRITE; + frag->triplet.segment_length = frag->segment.seg_len + + sizeof(mca_btl_udapl_footer_t) + + pad + + sizeof(mca_btl_udapl_rdma_footer_t); + + /* set remote_buf to start of the remote write location; + * compute by first finding the end of the entire fragment + * and then working way back + */ + remote_buf = (char *)remote_frag + + (sizeof(mca_btl_udapl_frag_eager_rdma_t) + frag->size) - + frag->triplet.segment_length; + + if (mca_btl_udapl_component.udapl_eager_rdma_guarantee == 0) { + /* execute transfer with one contiguous write */ + + /* establish remote memory region */ + remote_buffer.rmr_context = + (DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey; + remote_buffer.target_address = (DAT_VADDR)remote_buf; + remote_buffer.segment_length = frag->triplet.segment_length; + + /* write the data out */ + cookie.as_ptr 
= frag; + rc = dat_ep_post_rdma_write(endpoint->endpoint_eager, + 1, + &(frag->triplet), + cookie, + &remote_buffer, + DAT_COMPLETION_DEFAULT_FLAG); + if(DAT_SUCCESS != rc) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write", + major, minor)); + return OMPI_ERROR; + } + } else { + /* One must perform a few extra steps to guarantee that the last + * byte written is indeed the "active" value; This is + * acomplished by doing write-read-write; See Sections + * 6.6.21.0.1 and 6.8.2.1 in Verion 1.2 9/15/2004 of the UDAPL Spec. + * + * Since the frag->triplet is already prep'ed for the non + * guarantee single write case above, here we perform 2 writes: + * first the data and the udapl footer, skipping the pad, + * and then writing just the rdma footer. With the read + * inbetween as required to guarantee delivery of the + * second write after the first. + */ + + /* establish remote memory region for data and udapl footer */ + remote_buffer.rmr_context = + (DAT_RMR_CONTEXT)endpoint->endpoint_eager_rdma_remote.rkey; + remote_buffer.target_address = (DAT_VADDR)remote_buf; + remote_buffer.segment_length = (frag->triplet.segment_length - + sizeof(mca_btl_udapl_rdma_footer_t) - pad); + + /* establish local memory region for data and udapl footer */ + local_iov.lmr_context = frag->triplet.lmr_context; + local_iov.virtual_address = (DAT_VADDR)frag->triplet.virtual_address; + local_iov.segment_length = (frag->triplet.segment_length - + sizeof(mca_btl_udapl_rdma_footer_t) - pad); + + /* write the data */ + cookie.as_ptr = NULL; + rc = dat_ep_post_rdma_write(endpoint->endpoint_eager, + 1, + &local_iov, + cookie, + &remote_buffer, + DAT_COMPLETION_DEFAULT_FLAG); + if(DAT_SUCCESS != rc) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write", + major, minor)); + return OMPI_ERROR; 
+ } + + /* perform zero byte read of the remote memory region */ + remote_buffer.target_address = (DAT_VADDR)remote_buf; + remote_buffer.segment_length = frag->triplet.segment_length; + local_iov.virtual_address = NULL; + local_iov.segment_length = 0; + + cookie.as_ptr = NULL; + rc = dat_ep_post_rdma_read(endpoint->endpoint_eager, + 0, + &local_iov, + cookie, + &remote_buffer, + DAT_COMPLETION_DEFAULT_FLAG); + if(DAT_SUCCESS != rc) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_read", + major, minor)); + return OMPI_ERROR; + } + + /* establish remote memory region for rdma footer */ + remote_buffer.target_address = (DAT_VADDR)((char *)remote_buf + + frag->triplet.segment_length - sizeof(mca_btl_udapl_rdma_footer_t)); + remote_buffer.segment_length = sizeof(mca_btl_udapl_rdma_footer_t); + + /* establish local memory region for rdma footer */ + local_iov.virtual_address = (DAT_VADDR)(frag->rdma_ftr); + local_iov.segment_length = sizeof(mca_btl_udapl_rdma_footer_t); + + /* write the footer */ + cookie.as_ptr = frag; + rc = dat_ep_post_rdma_write(endpoint->endpoint_eager, + 1, + &local_iov, + cookie, + &remote_buffer, + DAT_COMPLETION_DEFAULT_FLAG); + if(DAT_SUCCESS != rc) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_rdma_write", + major, minor)); + return OMPI_ERROR; + } + } + return rc; +} int mca_btl_udapl_endpoint_send(mca_btl_base_endpoint_t* endpoint, mca_btl_udapl_frag_t* frag) { int rc = OMPI_SUCCESS; + DAT_RETURN dat_rc; DAT_DTO_COOKIE cookie; - + bool call_progress = false; + /* Fix up the segment length before we do anything with the frag */ frag->triplet.segment_length = frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t); OPAL_THREAD_LOCK(&endpoint->endpoint_lock); switch(endpoint->endpoint_state) { - case MCA_BTL_UDAPL_CONNECTED: - /* just send it 
already.. */ - cookie.as_ptr = frag; - if(frag->size == - mca_btl_udapl_component.udapl_eager_frag_size) { + case MCA_BTL_UDAPL_CONNECTED: + /* just send it already.. */ + if(frag->size == + mca_btl_udapl_component.udapl_eager_frag_size) { + if(OPAL_THREAD_ADD32(&endpoint->endpoint_eager_rdma_remote.tokens, -1) < 0) { + /* no rdma segment available so either send or queue */ + OPAL_THREAD_ADD32(&endpoint->endpoint_eager_rdma_remote.tokens, 1); - if(OPAL_THREAD_ADD32(&endpoint->endpoint_eager_sends, -1) < 0) { - OPAL_THREAD_ADD32(&endpoint->endpoint_eager_sends, 1); + if(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION], -1) < 0) { + OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION], 1); opal_list_append(&endpoint->endpoint_eager_frags, - (opal_list_item_t*)frag); - } else { - rc = dat_ep_post_send(endpoint->endpoint_eager, 1, - &frag->triplet, cookie, - DAT_COMPLETION_DEFAULT_FLAG); - } - } else { - assert(frag->size == - mca_btl_udapl_component.udapl_max_frag_size); - if(OPAL_THREAD_ADD32(&endpoint->endpoint_max_sends, -1) < 0) { - OPAL_THREAD_ADD32(&endpoint->endpoint_max_sends, 1); - opal_list_append(&endpoint->endpoint_max_frags, - (opal_list_item_t*)frag); - } else { - rc = dat_ep_post_send(endpoint->endpoint_max, 1, - &frag->triplet, cookie, - DAT_COMPLETION_DEFAULT_FLAG); - } - } - - if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send"); - rc = OMPI_ERROR; - } - - break; - case MCA_BTL_UDAPL_CLOSED: - /* Initiate a new connection, add this send to a queue */ - rc = mca_btl_udapl_start_connect(endpoint); - if(OMPI_SUCCESS != rc) { - endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; - break; - } - - /* Fall through on purpose to queue the send */ - case MCA_BTL_UDAPL_CONN_EAGER: - case MCA_BTL_UDAPL_CONN_MAX: - /* Add this send to a queue */ - if(frag->size == - mca_btl_udapl_component.udapl_eager_frag_size) { - opal_list_append(&endpoint->endpoint_eager_frags, (opal_list_item_t*)frag); + 
call_progress = true; + + } else { + cookie.as_ptr = frag; + + dat_rc = dat_ep_post_send(endpoint->endpoint_eager, 1, + &frag->triplet, cookie, + DAT_COMPLETION_DEFAULT_FLAG); + + if(DAT_SUCCESS != dat_rc) { + char* major; + char* minor; + + /* decode the DAT status; rc still holds OMPI_SUCCESS here */ dat_strerror(dat_rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", + major, minor)); + endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; + rc = OMPI_ERROR; + } + } + } else { - assert(frag->size == - mca_btl_udapl_component.udapl_max_frag_size); - OPAL_THREAD_ADD32(&endpoint->endpoint_max_sends, -1); + rc = mca_btl_udapl_endpoint_write_eager(endpoint, frag); + } + + } else { + assert(frag->size == + mca_btl_udapl_component.udapl_max_frag_size); + if(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], -1) < 0) { + OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], 1); opal_list_append(&endpoint->endpoint_max_frags, - (opal_list_item_t*)frag); - } + (opal_list_item_t*)frag); + call_progress = true; + } else { + cookie.as_ptr = frag; + + dat_rc = dat_ep_post_send(endpoint->endpoint_max, 1, + &frag->triplet, cookie, + DAT_COMPLETION_DEFAULT_FLAG); + + if(DAT_SUCCESS != dat_rc) { + char* major; + char* minor; + + /* decode the DAT status; rc still holds OMPI_SUCCESS here */ dat_strerror(dat_rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", + major, minor)); + rc = OMPI_ERROR; + } + } + } + + break; + case MCA_BTL_UDAPL_CLOSED: + /* Initiate a new connection, add this send to a queue */ + rc = mca_btl_udapl_start_connect(endpoint); + if(OMPI_SUCCESS != rc) { + endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; break; - case MCA_BTL_UDAPL_FAILED: - rc = OMPI_ERR_UNREACH; - break; + } + + /* Fall through on purpose to queue the send */ + case MCA_BTL_UDAPL_CONN_EAGER: + case MCA_BTL_UDAPL_CONN_MAX: + /* Add this send to a queue */ + if(frag->size == + mca_btl_udapl_component.udapl_eager_frag_size) { + opal_list_append(&endpoint->endpoint_eager_frags, + 
(opal_list_item_t*)frag); + } else { + assert(frag->size == + mca_btl_udapl_component.udapl_max_frag_size); + OPAL_THREAD_ADD32(&endpoint->endpoint_max_sends, -1); + opal_list_append(&endpoint->endpoint_max_frags, + (opal_list_item_t*)frag); + } + + break; + case MCA_BTL_UDAPL_FAILED: + rc = OMPI_ERR_UNREACH; + break; } OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + if(call_progress) opal_progress(); + return rc; } @@ -138,6 +373,119 @@ static void mca_btl_udapl_endpoint_send_cb(int status, orte_process_name_t* endp } +/* + * Set uDAPL endpoint parameters as required in ep_param. Accomplished + * by retrieving the default set of parameters from a temporary (dummy) + * endpoint and then setting any other parameters as required by + * this BTL. + * + * @param btl (IN) BTL module + * @param ep_param (IN/OUT) Pointer to a valid endpoint parameter location + * + * @return OMPI_SUCCESS or error status on failure + */ +int mca_btl_udapl_endpoint_get_params(mca_btl_udapl_module_t* btl, + DAT_EP_PARAM* ep_param) +{ + int rc = OMPI_SUCCESS; + DAT_EP_HANDLE dummy_ep; + DAT_EP_ATTR* ep_attr = &((*ep_param).ep_attr); + + /* open dummy endpoint, used to find default endpoint parameters */ + rc = dat_ep_create(btl->udapl_ia, + btl->udapl_pz, + btl->udapl_evd_dto, + btl->udapl_evd_dto, + btl->udapl_evd_conn, + NULL, + &dummy_ep); + if (rc != DAT_SUCCESS) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_create", + major, minor)); + /* this could be recoverable, by just using defaults */ + ep_attr = NULL; + return OMPI_ERROR; + } + + rc = dat_ep_query(dummy_ep, + DAT_EP_FIELD_ALL, + ep_param); + if (rc != DAT_SUCCESS) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_query", + major, minor)); + + /* release the dummy endpoint before bailing out so it is not leaked */ + dat_ep_free(dummy_ep); + return OMPI_ERROR; + } + + /* 
Set values from mca parameters */ + (*ep_attr).max_recv_dtos = + mca_btl_udapl_component.udapl_max_recv_dtos; + (*ep_attr).max_request_dtos = + mca_btl_udapl_component.udapl_max_request_dtos; + + /* close the dummy endpoint */ + rc = dat_ep_free(dummy_ep); + if (rc != DAT_SUCCESS) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("WARNING: %s %s %s\n", "dat_ep_free", + major, minor)); + /* this could be recoverable, by just using defaults */ + } + + return rc; +} + +/* + * Create a uDAPL endpoint using the endpoint attributes stored in the BTL module + * + * @param btl (IN) BTL module + * @param udapl_endpoint (OUT) Receives the new endpoint handle; DAT_HANDLE_NULL on failure + * + * @return DAT_SUCCESS or the error status returned by dat_ep_create + */ +int mca_btl_udapl_endpoint_create(mca_btl_udapl_module_t* btl, + DAT_EP_HANDLE* udapl_endpoint) +{ + int rc = OMPI_SUCCESS; + DAT_EP_PARAM ep_param; + + /* Create a new uDAPL endpoint; the caller starts the connection process */ + rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz, + btl->udapl_evd_dto, btl->udapl_evd_dto, btl->udapl_evd_conn, + &(btl->udapl_ep_param.ep_attr), udapl_endpoint); + + if(DAT_SUCCESS != rc) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_create", + major, minor)); + /* creation failed, so there is no endpoint to free; just clear the caller's handle */ + *udapl_endpoint = DAT_HANDLE_NULL; + } + + return rc; +} + + static int mca_btl_udapl_start_connect(mca_btl_base_endpoint_t* endpoint) { mca_btl_udapl_addr_t* addr = &endpoint->endpoint_btl->udapl_addr; @@ -256,11 +604,9 @@ void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint) } /* Create a new uDAPL endpoint and start the connection process */ - rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz, - btl->udapl_evd_dto, btl->udapl_evd_dto, btl->udapl_evd_conn, - NULL, &endpoint->endpoint_eager); + rc = mca_btl_udapl_endpoint_create(btl, &endpoint->endpoint_eager); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_create (eager)"); + 
BTL_ERROR(("mca_btl_udapl_endpoint_create")); goto failure_create; } @@ -268,7 +614,13 @@ void mca_btl_udapl_endpoint_connect(mca_btl_udapl_endpoint_t* endpoint) endpoint->endpoint_addr.port, mca_btl_udapl_component.udapl_timeout, 0, NULL, 0, DAT_CONNECT_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_connect (eager)"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_connect", + major, minor)); goto failure; } @@ -369,16 +721,21 @@ static int mca_btl_udapl_endpoint_finish_eager( endpoint->endpoint_state = MCA_BTL_UDAPL_CONN_MAX; OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + /* establish eager rdma connection */ + if (btl->udapl_eager_rdma_endpoint_count < + mca_btl_udapl_component.udapl_max_eager_rdma_peers) { + mca_btl_udapl_endpoint_connect_eager_rdma(endpoint); + } + /* Only one side does dat_ep_connect() */ if(0 < orte_ns.compare_fields(ORTE_NS_CMP_ALL, &endpoint->endpoint_proc->proc_guid, &ompi_proc_local()->proc_name)) { - rc = dat_ep_create(btl->udapl_ia, btl->udapl_pz, - btl->udapl_evd_dto, btl->udapl_evd_dto, btl->udapl_evd_conn, - NULL, &endpoint->endpoint_max); + rc = mca_btl_udapl_endpoint_create(btl, &endpoint->endpoint_max); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_create (max)"); + endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; + OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return OMPI_ERROR; } @@ -387,7 +744,13 @@ static int mca_btl_udapl_endpoint_finish_eager( mca_btl_udapl_component.udapl_timeout, 0, NULL, 0, DAT_CONNECT_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_connect (max)"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_connect", + major, minor)); dat_ep_free(endpoint->endpoint_max); return OMPI_ERROR; } @@ -412,17 +775,16 @@ static int 
mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t* endpoint) mca_btl_udapl_endpoint_post_recv(endpoint, mca_btl_udapl_component.udapl_max_frag_size); - /* post queued sends */ assert(endpoint->endpoint_eager_sends == mca_btl_udapl_component.udapl_num_sends); - while(OPAL_THREAD_ADD32(&endpoint->endpoint_eager_sends, -1) >= 0 && + while(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION], -1) >= 0 && NULL != (frag = (mca_btl_udapl_frag_t*) opal_list_remove_first(&endpoint->endpoint_eager_frags))) { cookie.as_ptr = frag; assert(frag->triplet.virtual_address == - (DAT_VADDR)frag->segment.seg_addr.pval); + (DAT_VADDR)frag->segment.seg_addr.pval); assert(frag->triplet.segment_length == frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t)); assert(frag->size == @@ -430,20 +792,26 @@ static int mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t* endpoint) rc = dat_ep_post_send(endpoint->endpoint_eager, 1, &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send (eager)"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", + major, minor)); endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; ret = OMPI_ERROR; break; } } - if(endpoint->endpoint_eager_sends < 0) { - OPAL_THREAD_ADD32(&endpoint->endpoint_eager_sends, 1); + if(endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION] < 0) { + OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION], 1); } assert(endpoint->endpoint_max_sends == mca_btl_udapl_component.udapl_num_sends); - while(OPAL_THREAD_ADD32(&endpoint->endpoint_max_sends, -1) >= 0 && + while(OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], -1) >= 0 && NULL != (frag = (mca_btl_udapl_frag_t*) opal_list_remove_first(&endpoint->endpoint_max_frags))) { cookie.as_ptr = frag; @@ -457,15 +825,21 @@ static int 
mca_btl_udapl_endpoint_finish_max(mca_btl_udapl_endpoint_t* endpoint) rc = dat_ep_post_send(endpoint->endpoint_max, 1, &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_send (max)"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", + major, minor)); endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; ret = OMPI_ERROR; break; } } - if(endpoint->endpoint_max_sends < 0) { - OPAL_THREAD_ADD32(&endpoint->endpoint_max_sends, 1); + if(endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION] < 0) { + OPAL_THREAD_ADD32(&endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION], 1); } OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); return ret; @@ -515,7 +889,13 @@ static int mca_btl_udapl_endpoint_post_recv(mca_btl_udapl_endpoint_t* endpoint, rc = dat_ep_post_recv(ep, 1, &frag->triplet, cookie, DAT_COMPLETION_DEFAULT_FLAG); if(DAT_SUCCESS != rc) { - MCA_BTL_UDAPL_ERROR(rc, "dat_ep_post_recv"); + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_recv", + major, minor)); return OMPI_ERROR; } } @@ -542,11 +922,25 @@ static void mca_btl_udapl_endpoint_construct(mca_btl_base_endpoint_t* endpoint) endpoint->endpoint_eager = DAT_HANDLE_NULL; endpoint->endpoint_max = DAT_HANDLE_NULL; + endpoint->endpoint_sr_tokens[BTL_UDAPL_EAGER_CONNECTION] = + endpoint->endpoint_eager_sends; + endpoint->endpoint_sr_tokens[BTL_UDAPL_MAX_CONNECTION] = + endpoint->endpoint_max_sends; + endpoint->endpoint_sr_credits[BTL_UDAPL_EAGER_CONNECTION] = 0; + endpoint->endpoint_sr_credits[BTL_UDAPL_MAX_CONNECTION] = 0; + OBJ_CONSTRUCT(&endpoint->endpoint_eager_frags, opal_list_t); OBJ_CONSTRUCT(&endpoint->endpoint_max_frags, opal_list_t); OBJ_CONSTRUCT(&endpoint->endpoint_lock, opal_mutex_t); -} + /* initialize eager RDMA */ + 
memset(&endpoint->endpoint_eager_rdma_local, 0, + sizeof(mca_btl_udapl_eager_rdma_local_t)); + memset (&endpoint->endpoint_eager_rdma_remote, 0, + sizeof(mca_btl_udapl_eager_rdma_remote_t)); + OBJ_CONSTRUCT(&endpoint->endpoint_eager_rdma_local.lock, opal_mutex_t); + OBJ_CONSTRUCT(&endpoint->endpoint_eager_rdma_remote.lock, opal_mutex_t); +} /* * Destroy a endpoint @@ -555,9 +949,379 @@ static void mca_btl_udapl_endpoint_construct(mca_btl_base_endpoint_t* endpoint) static void mca_btl_udapl_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) { + mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; + mca_mpool_base_registration_t *reg = + (mca_mpool_base_registration_t*)endpoint->endpoint_eager_rdma_local.reg; + OBJ_DESTRUCT(&endpoint->endpoint_eager_frags); OBJ_DESTRUCT(&endpoint->endpoint_max_frags); OBJ_DESTRUCT(&endpoint->endpoint_lock); + + /* release eager rdma resources */ + udapl_btl->super.btl_mpool->mpool_free(udapl_btl->super.btl_mpool, + NULL, + reg); +} + + +/* + * Release the fragment used to send the eager rdma control message. + * Callback to be executed upon receiving local completion event + * from sending a control message operation. Should essentially do + * the same thing as mca_btl_udapl_free(). + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param descriptor (IN) Description of the data to be transferred + * @param status (IN/OUT) + */ +static void mca_btl_udapl_endpoint_control_send_cb( + mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct mca_btl_base_descriptor_t* descriptor, + int status) +{ + MCA_BTL_UDAPL_FRAG_RETURN_CONTROL(((mca_btl_udapl_module_t*)btl), + ((mca_btl_udapl_frag_t*)descriptor)); +} + +/* + * Allocate and initialize descriptor to be used in sending uDAPL BTL + * control messages. Should essentially accomplish same as would be + * from calling mca_btl_udapl_alloc(). 
+ * + * @param btl (IN) BTL module + * @param size (IN) Size of segment required to be transferred + * + * @return descriptor (IN) Description of the data to be transferred + */ +static mca_btl_base_descriptor_t* mca_btl_udapl_endpoint_initialize_control_message( + struct mca_btl_base_module_t* btl, + size_t size) +{ + mca_btl_udapl_module_t* udapl_btl = (mca_btl_udapl_module_t*) btl; + mca_btl_udapl_frag_t* frag; + int rc; + int pad = 0; + + /* compute pad as needed */ + MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(pad, + (size + sizeof(mca_btl_udapl_footer_t))); + + /* control messages size should never be greater than eager message size */ + assert((size+pad) <= btl->btl_eager_limit); + + MCA_BTL_UDAPL_FRAG_ALLOC_CONTROL(udapl_btl, frag, rc); + + /* Set up the LMR triplet from the frag segment */ + frag->segment.seg_len = (uint32_t)size; + frag->triplet.virtual_address = (DAT_VADDR)frag->segment.seg_addr.pval; + + /* assume send/recv as default when computing segment_length */ + frag->triplet.segment_length = + frag->segment.seg_len + sizeof(mca_btl_udapl_footer_t); + + assert(frag->triplet.lmr_context == + ((mca_btl_udapl_reg_t*)frag->registration)->lmr_triplet.lmr_context); + + frag->btl = udapl_btl; + frag->base.des_src = &frag->segment; + frag->base.des_src_cnt = 1; + frag->base.des_dst = NULL; + frag->base.des_dst_cnt = 0; + frag->base.des_flags = 0; + frag->base.des_cbfunc = mca_btl_udapl_endpoint_control_send_cb; + frag->base.des_cbdata = NULL; + + return &frag->base; +} + +/* + * Transfer the given endpoint's rdma segment information. Expects that + * the endpoint's rdma segment has previously been created and + * registered as required. 
+ * + * @param endpoint (IN) BTL addressing information + * + * @return OMPI_SUCCESS or error status on failure + */ +static int mca_btl_udapl_endpoint_send_eager_rdma( + mca_btl_base_endpoint_t* endpoint) +{ + mca_btl_udapl_eager_rdma_connect_t* rdma_connect; + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; + size_t cntrl_msg_size = sizeof(mca_btl_udapl_eager_rdma_connect_t); + int rc = OMPI_SUCCESS; + + des = mca_btl_udapl_endpoint_initialize_control_message( + (mca_btl_base_module_t *)udapl_btl, cntrl_msg_size); + + des->des_flags = 0; + des->des_cbfunc = mca_btl_udapl_endpoint_control_send_cb; + des->des_cbdata = NULL; + + /* fill in data */ + segment = des->des_src; + rdma_connect = + (mca_btl_udapl_eager_rdma_connect_t*)segment->seg_addr.pval; + rdma_connect->control.type = + MCA_BTL_UDAPL_CONTROL_RDMA_CONNECT; + rdma_connect->rkey = + endpoint->endpoint_eager_rdma_local.reg->rmr_context; + rdma_connect->rdma_start.pval = + endpoint->endpoint_eager_rdma_local.base.pval; + + /* send fragment */ + rc = mca_btl_udapl_send((mca_btl_base_module_t *)udapl_btl, endpoint, + des, MCA_BTL_TAG_BTL); + + return rc; +} + +/* + * Endpoint handed in is the local process peer. This routine + * creates and initializes a local memory region which will be used for + * reading from locally. This memory region will be made available to peer + * for writing into by sending a description of the area to the given + * endpoint. 
+ * + * @param endpoint (IN) BTL addressing information + */ +void mca_btl_udapl_endpoint_connect_eager_rdma( + mca_btl_udapl_endpoint_t* endpoint) +{ + char* buf; + size_t size; + int i; + mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; + + OPAL_THREAD_LOCK(&endpoint->endpoint_eager_rdma_local.lock); + if (endpoint->endpoint_eager_rdma_local.base.pval) + goto unlock_rdma_local; + + if (mca_btl_udapl_component.udapl_eager_rdma_num <= 0) { + /* NOTE: Need to find a more generic way to check ranges + * for all mca parameters. + */ + opal_show_help("help-mpi-btl-udapl.txt", + "invalid num rdma segments", + true, + mca_btl_udapl_component.udapl_eager_rdma_num); + goto unlock_rdma_local; + } + + /* determine total size of buffer region */ + size = mca_btl_udapl_component.udapl_eager_rdma_frag_size * + mca_btl_udapl_component.udapl_eager_rdma_num; + + /* create and register memory */ + buf = udapl_btl->super.btl_mpool->mpool_alloc(udapl_btl->super.btl_mpool, + size, 0, 0, + (mca_mpool_base_registration_t**)&endpoint->endpoint_eager_rdma_local.reg); + + if(!buf) + goto unlock_rdma_local; + + /* initialize the rdma segments */ + for(i = 0; i < mca_btl_udapl_component.udapl_eager_rdma_num; i++) { + mca_btl_udapl_frag_eager_rdma_t* local_rdma_frag; + ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf + + i*mca_btl_udapl_component.udapl_eager_rdma_frag_size); + item->user_data = endpoint->endpoint_eager_rdma_local.reg; + OBJ_CONSTRUCT(item, mca_btl_udapl_frag_eager_rdma_t); + + local_rdma_frag = ((mca_btl_udapl_frag_eager_rdma_t*)item); + + local_rdma_frag->base.des_dst = &local_rdma_frag->segment; + local_rdma_frag->base.des_dst_cnt = 1; + local_rdma_frag->base.des_src = NULL; + local_rdma_frag->base.des_src_cnt = 0; + local_rdma_frag->btl = endpoint->endpoint_btl; + + + local_rdma_frag->endpoint = endpoint; + local_rdma_frag->type = MCA_BTL_UDAPL_FRAG_EAGER_RDMA; + local_rdma_frag->triplet.segment_length = local_rdma_frag->size; + } + + 
OPAL_THREAD_LOCK(&udapl_btl->udapl_eager_rdma_lock); + if(orte_pointer_array_add (&endpoint->endpoint_eager_rdma_index, + udapl_btl->udapl_eager_rdma_endpoints, endpoint) < 0) + goto cleanup; + + endpoint->endpoint_eager_rdma_local.base.pval = buf; + udapl_btl->udapl_eager_rdma_endpoint_count++; + + /* send the relevant data describing the registered space to the endpoint */ + if (mca_btl_udapl_endpoint_send_eager_rdma(endpoint) == 0) { + OPAL_THREAD_UNLOCK(&udapl_btl->udapl_eager_rdma_lock); + OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock); + return; + } + + udapl_btl->udapl_eager_rdma_endpoint_count--; + endpoint->endpoint_eager_rdma_local.base.pval = NULL; + orte_pointer_array_set_item(udapl_btl->udapl_eager_rdma_endpoints, + endpoint->endpoint_eager_rdma_index, NULL); + +cleanup: + /* this would fail if we hit the max and can not add anymore to the array + * and this could happen because we do not lock before checking if max has + * been reached + */ + OPAL_THREAD_UNLOCK(&udapl_btl->udapl_eager_rdma_lock); + udapl_btl->super.btl_mpool->mpool_free(udapl_btl->super.btl_mpool, + buf, + (mca_mpool_base_registration_t*)endpoint->endpoint_eager_rdma_local.reg); + + unlock_rdma_local: + OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock); + +} + +/* + * Send control message with the number of credits available on the + * endpoint. Update the credit value accordingly. 
+ * + * @param endpoint (IN) BTL addressing information + * + * @return OMPI_SUCCESS or error status on failure + */ +int mca_btl_udapl_endpoint_send_eager_rdma_credits( + mca_btl_base_endpoint_t* endpoint) +{ + mca_btl_udapl_eager_rdma_credit_t *rdma_credit; + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + DAT_DTO_COOKIE cookie; + mca_btl_udapl_frag_t* frag; + mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; + size_t cntrl_msg_size = sizeof(mca_btl_udapl_eager_rdma_credit_t); + int rc = OMPI_SUCCESS; + + des = mca_btl_udapl_endpoint_initialize_control_message( + (mca_btl_base_module_t *)udapl_btl, cntrl_msg_size); + + /* fill in data */ + segment = des->des_src; + rdma_credit = (mca_btl_udapl_eager_rdma_credit_t*)segment->seg_addr.pval; + rdma_credit->control.type = MCA_BTL_UDAPL_CONTROL_RDMA_CREDIT; + rdma_credit->credits = endpoint->endpoint_eager_rdma_local.credits; + + /* reset local credits value */ + OPAL_THREAD_LOCK(&endpoint->endpoint_eager_rdma_local.lock); + endpoint->endpoint_eager_rdma_local.credits -= rdma_credit->credits; + + /* prep and send fragment : control messages do not count + * against the token/credit number so do not subtract from tokens + * with this send + */ + frag = (mca_btl_udapl_frag_t*)des; + frag->endpoint = endpoint; + frag->ftr = (mca_btl_udapl_footer_t *) + ((char *)frag->segment.seg_addr.pval + frag->segment.seg_len); + frag->ftr->tag = MCA_BTL_TAG_BTL; + frag->type = MCA_BTL_UDAPL_SEND; + cookie.as_ptr = frag; + + rc = dat_ep_post_send(endpoint->endpoint_eager, 1, + &frag->triplet, cookie, + DAT_COMPLETION_DEFAULT_FLAG); + + /* release the same lock that was taken above */ OPAL_THREAD_UNLOCK(&endpoint->endpoint_eager_rdma_local.lock); + + if(DAT_SUCCESS != rc) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", + major, minor)); + endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; + rc = OMPI_ERROR; + } + + return rc; +} + +/* + * Send control message with the 
number of credits available on the + * endpoint. Update the credit value accordingly. + * + * @param endpoint (IN) BTL addressing information + * + * @param connection (IN) 0 for eager and 1 for max connection + * + * @return OMPI_SUCCESS or error status on failure + */ +int mca_btl_udapl_endpoint_send_sr_credits( + mca_btl_base_endpoint_t* endpoint, uint32_t connection) +{ + mca_btl_udapl_sr_credit_t *sr_credit; + mca_btl_base_descriptor_t* des; + mca_btl_base_segment_t* segment; + DAT_DTO_COOKIE cookie; + mca_btl_udapl_frag_t* frag; + mca_btl_udapl_module_t* udapl_btl = endpoint->endpoint_btl; + size_t cntrl_msg_size = sizeof(mca_btl_udapl_sr_credit_t); + int rc = OMPI_SUCCESS; + + des = mca_btl_udapl_endpoint_initialize_control_message( + (mca_btl_base_module_t *)udapl_btl, cntrl_msg_size); + + /* fill in data */ + segment = des->des_src; + sr_credit = (mca_btl_udapl_sr_credit_t*)segment->seg_addr.pval; + sr_credit->control.type = MCA_BTL_UDAPL_CONTROL_SR_CREDIT; + OPAL_THREAD_LOCK(&endpoint->endpoint_lock); + sr_credit->credits = endpoint->endpoint_sr_credits[connection]; + sr_credit->connection = connection; + + /* reset local credits value */ + endpoint->endpoint_sr_credits[connection] = 0; + + /* prep and send fragment : control messages do not count + * against the token/credit count so do not subtract from tokens + * with this send + */ + frag = (mca_btl_udapl_frag_t*)des; + frag->endpoint = endpoint; + frag->ftr = (mca_btl_udapl_footer_t *) + ((char *)frag->segment.seg_addr.pval + frag->segment.seg_len); + frag->ftr->tag = MCA_BTL_TAG_BTL; + frag->type = MCA_BTL_UDAPL_SEND; + cookie.as_ptr = frag; + + if (BTL_UDAPL_EAGER_CONNECTION == connection) { + rc = dat_ep_post_send(endpoint->endpoint_eager, 1, + &frag->triplet, cookie, + DAT_COMPLETION_DEFAULT_FLAG); + + } else { + assert(BTL_UDAPL_MAX_CONNECTION == connection); + rc = dat_ep_post_send(endpoint->endpoint_max, 1, + &frag->triplet, cookie, + DAT_COMPLETION_DEFAULT_FLAG); + } + + 
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock); + + if(DAT_SUCCESS != rc) { + char* major; + char* minor; + + dat_strerror(rc, (const char**)&major, + (const char**)&minor); + BTL_ERROR(("ERROR: %s %s %s\n", "dat_ep_post_send", + major, minor)); + endpoint->endpoint_state = MCA_BTL_UDAPL_FAILED; + rc = OMPI_ERROR; + } + + return rc; } diff --git a/ompi/mca/btl/udapl/btl_udapl_endpoint.h b/ompi/mca/btl/udapl/btl_udapl_endpoint.h index 0ddd1bdbe3..6f2e64b8df 100644 --- a/ompi/mca/btl/udapl/btl_udapl_endpoint.h +++ b/ompi/mca/btl/udapl/btl_udapl_endpoint.h @@ -27,13 +27,19 @@ #include "opal/event/event.h" #include "ompi/mca/pml/pml.h" #include "ompi/mca/btl/btl.h" +#include "ompi/mca/btl/base/btl_base_error.h" #include "btl_udapl_frag.h" #include "btl_udapl.h" +#include "btl_udapl_eager_rdma.h" + #if defined(c_plusplus) || defined(__cplusplus) extern "C" { #endif +#define BTL_UDAPL_TOKENS(E, C) ((E)->endpoint_sr_tokens[(C)] + \ + (((C) == BTL_UDAPL_EAGER_CONNECTION)?(E)->endpoint_eager_rdma_remote.tokens:0)) + /** * Structure used to publish uDAPL id information to peers. */ @@ -43,7 +49,7 @@ struct mca_btl_udapl_addr_t { }; typedef struct mca_btl_udapl_addr_t mca_btl_udapl_addr_t; - + /** * State of uDAPL endpoint connection. */ @@ -56,6 +62,25 @@ typedef enum { MCA_BTL_UDAPL_FAILED } mca_btl_udapl_endpoint_state_t; +/* + * Establish a name for the 2 connections opened per peer + */ +typedef enum { + BTL_UDAPL_EAGER_CONNECTION, + BTL_UDAPL_MAX_CONNECTION, + BTL_UDAPL_NUM_CONNECTION +} mca_btl_udapl_endpoint_conn_t; + +/* + * Encapsulate data that describes sendrecv credit information. + */ +struct mca_btl_udapl_sr_credit_t { + mca_btl_udapl_control_header_t control; + uint32_t credits; + uint32_t connection; /* 0 == BTL_UDAPL_EAGER_CONNECTION; + 1 == BTL_UDAPL_MAX_CONNECTION */ +}; +typedef struct mca_btl_udapl_sr_credit_t mca_btl_udapl_sr_credit_t; /** * An abstraction that represents a connection to a endpoint process. 
@@ -84,6 +109,12 @@ struct mca_btl_base_endpoint_t { int32_t endpoint_max_sends; /**< number of sends that may be posted */ + int32_t endpoint_sr_tokens[BTL_UDAPL_NUM_CONNECTION]; + /**< number of sends that may be posted */ + + int32_t endpoint_sr_credits[BTL_UDAPL_NUM_CONNECTION]; + /**< number of recvs that are now available */ + int32_t endpoint_connection_seq; /**< sequence number of sendrecv message for the connection est */ @@ -96,6 +127,13 @@ struct mca_btl_base_endpoint_t { DAT_EP_HANDLE endpoint_eager; DAT_EP_HANDLE endpoint_max; /**< uDAPL endpoint handle */ + + int32_t endpoint_eager_rdma_index; + /**< index into array of endpoints with RDMA buffers */ + mca_btl_udapl_eager_rdma_local_t endpoint_eager_rdma_local; + /**< info about local RDMA buffer */ + mca_btl_udapl_eager_rdma_remote_t endpoint_eager_rdma_remote; + /**< info about remote RDMA buffer */ }; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; @@ -126,6 +164,29 @@ int mca_btl_udapl_endpoint_finish_connect(struct mca_btl_udapl_module_t* btl, int32_t* seq, DAT_EP_HANDLE endpoint); +/* + * Send number of eager rdma credits + */ +int mca_btl_udapl_endpoint_send_eager_rdma_credits(mca_btl_base_endpoint_t* endpoint); + +/* + * Establish uDAPL endpoint parameters + */ +int mca_btl_udapl_endpoint_get_params(struct mca_btl_udapl_module_t* btl, + DAT_EP_PARAM* ep_param); + +/* + * Create uDAPL endpoint + */ +int mca_btl_udapl_endpoint_create(struct mca_btl_udapl_module_t* btl, + DAT_EP_HANDLE* udapl_endpoint); + + /* + * Send number of send recv credits + */ +int mca_btl_udapl_endpoint_send_sr_credits(mca_btl_base_endpoint_t* endpoint, + uint32_t connection); + #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/ompi/mca/btl/udapl/btl_udapl_frag.c b/ompi/mca/btl/udapl/btl_udapl_frag.c index 302c095b24..4c67dc139d 100644 --- a/ompi/mca/btl/udapl/btl_udapl_frag.c +++ b/ompi/mca/btl/udapl/btl_udapl_frag.c @@ -73,6 +73,18 @@ static void 
mca_btl_udapl_frag_user_constructor(mca_btl_udapl_frag_t* frag) frag->registration = NULL; } +static void mca_btl_udapl_frag_eager_rdma_constructor(mca_btl_udapl_frag_t* frag) +{ + mca_btl_udapl_frag_eager_constructor(frag); + frag->segment.seg_len = mca_btl_udapl_module.super.btl_eager_limit; + frag->size = mca_btl_udapl_component.udapl_eager_frag_size; + frag->rdma_ftr = (mca_btl_udapl_rdma_footer_t *) + ((char *)(frag->segment.seg_addr.pval) + + frag->size - + sizeof(mca_btl_udapl_rdma_footer_t)); + frag->rdma_ftr->active=0; +} + static void mca_btl_udapl_frag_common_destructor(mca_btl_udapl_frag_t* frag) { #if OMPI_ENABLE_DEBUG @@ -115,3 +127,8 @@ OBJ_CLASS_INSTANCE( mca_btl_udapl_frag_user_constructor, NULL); +OBJ_CLASS_INSTANCE( + mca_btl_udapl_frag_eager_rdma_t, + mca_btl_base_descriptor_t, + mca_btl_udapl_frag_eager_rdma_constructor, + mca_btl_udapl_frag_common_destructor); diff --git a/ompi/mca/btl/udapl/btl_udapl_frag.h b/ompi/mca/btl/udapl/btl_udapl_frag.h index 14f9366e78..ce089ae6bf 100644 --- a/ompi/mca/btl/udapl/btl_udapl_frag.h +++ b/ompi/mca/btl/udapl/btl_udapl_frag.h @@ -37,18 +37,49 @@ typedef enum { MCA_BTL_UDAPL_PUT, MCA_BTL_UDAPL_GET, MCA_BTL_UDAPL_CONN_RECV, - MCA_BTL_UDAPL_CONN_SEND + MCA_BTL_UDAPL_CONN_SEND, + MCA_BTL_UDAPL_RDMA_WRITE, + MCA_BTL_UDAPL_FRAG_EAGER_RDMA, + MCA_BTL_UDAPL_IGNORE } mca_btl_udapl_frag_type_t; +typedef enum { + MCA_BTL_UDAPL_CONTROL_NOOP, + MCA_BTL_UDAPL_CONTROL_RDMA_CONNECT, + MCA_BTL_UDAPL_CONTROL_RDMA_CREDIT, + MCA_BTL_UDAPL_CONTROL_SR_CREDIT +} mca_btl_udapl_control_t; + +/* Control message header */ +struct mca_btl_udapl_control_header_t { + mca_btl_udapl_control_t type; +}; +typedef struct mca_btl_udapl_control_header_t mca_btl_udapl_control_header_t; + /** * uDAPL btl footer. * This is put after the payload packet so the PML header can be aligned. + * Must be aligned on MCA_BTL_UDAPL_FRAG_ALIGN byte boundary. 
*/ struct mca_btl_udapl_footer_t { mca_btl_base_tag_t tag; }; typedef struct mca_btl_udapl_footer_t mca_btl_udapl_footer_t; +/** + * uDAPL BTL rdma footer. + * This is used in addition to the uDAPL BTL footer. The two are separate to + * allow for any padding that may be required between the two. + */ +struct mca_btl_udapl_rdma_footer_t { + uint32_t size; + uint8_t active; /* 0 = not in use; 1 = data is available to be received; + * this should always be the last entry in this structure + */ + char pad[3]; /* pad out to be aligned on MCA_BTL_UDAPL_FRAG_ALIGN byte boundary */ +}; +typedef struct mca_btl_udapl_rdma_footer_t mca_btl_udapl_rdma_footer_t; + /** * uDAPL fragment derived type. */ @@ -57,13 +88,15 @@ struct mca_btl_udapl_frag_t { mca_btl_base_segment_t segment; struct mca_btl_udapl_module_t* btl; - struct mca_btl_base_endpoint_t *endpoint; - struct mca_btl_udapl_reg_t* registration; + struct mca_btl_base_endpoint_t* endpoint; DAT_LMR_TRIPLET triplet; + struct mca_btl_udapl_reg_t* registration; - mca_btl_udapl_footer_t *ftr; + mca_btl_udapl_footer_t* ftr; + mca_btl_udapl_rdma_footer_t* rdma_ftr; size_t size; mca_btl_udapl_frag_type_t type; + uint32_t pad; /* Padding the structure to be evenly divisible by MCA_BTL_UDAPL_FRAG_ALIGN */ }; typedef struct mca_btl_udapl_frag_t mca_btl_udapl_frag_t; OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_t); @@ -78,7 +111,10 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_max_t); typedef struct mca_btl_udapl_frag_t mca_btl_udapl_frag_user_t; OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t); +typedef struct mca_btl_udapl_frag_t mca_btl_udapl_frag_eager_rdma_t; +OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_eager_rdma_t); + /* * Macros to allocate/return descriptors from module specific * free list(s). 
@@ -86,7 +122,6 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t); #define MCA_BTL_UDAPL_FRAG_ALLOC_EAGER(btl, frag, rc) \ { \ - \ ompi_free_list_item_t *item; \ OMPI_FREE_LIST_WAIT(&((mca_btl_udapl_module_t*)btl)->udapl_frag_eager, item, rc); \ frag = (mca_btl_udapl_frag_t*) item; \ @@ -100,7 +135,6 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t); #define MCA_BTL_UDAPL_FRAG_ALLOC_MAX(btl, frag, rc) \ { \ - \ ompi_free_list_item_t *item; \ OMPI_FREE_LIST_WAIT(&((mca_btl_udapl_module_t*)btl)->udapl_frag_max, item, rc); \ frag = (mca_btl_udapl_frag_t*) item; \ @@ -112,7 +146,6 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t); (ompi_free_list_item_t*)(frag)); \ } - #define MCA_BTL_UDAPL_FRAG_ALLOC_USER(btl, frag, rc) \ { \ ompi_free_list_item_t *item; \ @@ -126,6 +159,26 @@ OBJ_CLASS_DECLARATION(mca_btl_udapl_frag_user_t); (ompi_free_list_item_t*)(frag)); \ } +#define MCA_BTL_UDAPL_FRAG_ALLOC_CONTROL(btl, frag, rc) \ +{ \ + ompi_free_list_item_t *item; \ + OMPI_FREE_LIST_WAIT(&((mca_btl_udapl_module_t*)btl)->udapl_frag_control, item, rc); \ + frag = (mca_btl_udapl_frag_t*) item; \ +} + +#define MCA_BTL_UDAPL_FRAG_RETURN_CONTROL(btl, frag) \ +{ \ + OMPI_FREE_LIST_RETURN(&((mca_btl_udapl_module_t*)btl)->udapl_frag_control, \ + (ompi_free_list_item_t*)(frag)); \ +} + +/* + * Calculate the pad value P required to align the given size S: + * P is set to 0 when S is already a multiple of MCA_BTL_UDAPL_FRAG_ALIGN, + * otherwise to the number of bytes needed to reach the next multiple. + * S may be evaluated twice, so pass an expression without side effects. + * NOTE(review): the definition ends in "while (0);" -- with that trailing + * semicolon an invocation followed by its own ';' expands to two statements, + * which breaks use as the unbraced body of an if that has an else. + */ +#define MCA_BTL_UDAPL_FRAG_CALC_ALIGNMENT_PAD(P,S) do { \ + (P) = ((S) % MCA_BTL_UDAPL_FRAG_ALIGN) == 0 ? \ + 0 : (MCA_BTL_UDAPL_FRAG_ALIGN - ((S) % MCA_BTL_UDAPL_FRAG_ALIGN)); \ +} while (0); #if defined(c_plusplus) || defined(__cplusplus) } diff --git a/ompi/mca/btl/udapl/help-mpi-btl-udapl.txt b/ompi/mca/btl/udapl/help-mpi-btl-udapl.txt new file mode 100644 index 0000000000..3f7c58440d --- /dev/null +++ b/ompi/mca/btl/udapl/help-mpi-btl-udapl.txt @@ -0,0 +1,32 @@ +# -*- text -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation.
All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2006 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for Open MPI. +# +[invalid num rdma segments] + +WARNING: MCA parameter [btl_udapl_eager_rdma_num = %d] is not valid. +RDMA will not be used for short messages. Try setting to positive +value, e.g. 16. + +[use default endpoint params] + +WARNING: Using default uDAPL endpoint parameters not those that +would have been modified by MCA parameters. diff --git a/ompi/mca/mpool/udapl/mpool_udapl_module.c b/ompi/mca/mpool/udapl/mpool_udapl_module.c index 718b636eb5..4e58e3d5e6 100644 --- a/ompi/mca/mpool/udapl/mpool_udapl_module.c +++ b/ompi/mca/mpool/udapl/mpool_udapl_module.c @@ -11,6 +11,8 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. + * Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -71,6 +73,8 @@ void* mca_mpool_udapl_alloc( free(addr); return NULL; } + (*registration)->alloc_base = addr; + return addr; } @@ -139,8 +143,6 @@ int mca_mpool_udapl_register( int mca_mpool_udapl_deregister(mca_mpool_base_module_t* mpool, mca_mpool_base_registration_t* reg) { - int rc; - if(reg->flags & (MCA_MPOOL_FLAGS_CACHE | MCA_MPOOL_FLAGS_PERSIST)) { mpool->rcache->rcache_delete(mpool->rcache, reg, reg->flags); reg->flags = 0;