From 90fb58de4f59ce9c42e773464a153ff0634d13af Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 5 Mar 2007 14:17:50 +0000 Subject: [PATCH] When frags are allocated from mpool by free_list the frag structure is also allocated from mpool memory (which is registered memory for RDMA transports). This is not a problem for small jobs, but for a large number of ranks the amount of wasted memory is large. This commit was SVN r13921. --- ompi/class/ompi_free_list.c | 111 ++++++++++---------- ompi/class/ompi_free_list.h | 8 +- ompi/mca/btl/gm/btl_gm_frag.c | 2 +- ompi/mca/btl/mvapi/btl_mvapi_component.c | 5 +- ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h | 6 +- ompi/mca/btl/mvapi/btl_mvapi_endpoint.c | 24 +++-- ompi/mca/btl/mvapi/btl_mvapi_frag.c | 4 +- ompi/mca/btl/openib/btl_openib_component.c | 40 +++---- ompi/mca/btl/openib/btl_openib_eager_rdma.h | 8 +- ompi/mca/btl/openib/btl_openib_endpoint.c | 29 +++-- ompi/mca/btl/openib/btl_openib_frag.c | 4 +- ompi/mca/btl/openib/btl_openib_frag.h | 3 - ompi/mca/btl/openib/btl_openib_mca.c | 11 +- ompi/mca/btl/openib/help-mpi-btl-openib.txt | 3 + ompi/mca/btl/sm/btl_sm.c | 28 +++-- ompi/mca/btl/sm/btl_sm.h | 1 + ompi/mca/btl/sm/btl_sm_component.c | 74 +++++++------ ompi/mca/btl/sm/btl_sm_fifo.h | 4 +- ompi/mca/btl/sm/btl_sm_frag.c | 18 +++- ompi/mca/btl/sm/btl_sm_frag.h | 33 +++++- ompi/mca/btl/udapl/btl_udapl_endpoint.c | 3 +- ompi/mca/btl/udapl/btl_udapl_frag.c | 5 +- opal/include/opal/align.h | 1 + 23 files changed, 254 insertions(+), 171 deletions(-) diff --git a/ompi/class/ompi_free_list.c b/ompi/class/ompi_free_list.c index 9547a74766..6c5fd48d90 100644 --- a/ompi/class/ompi_free_list.c +++ b/ompi/class/ompi_free_list.c @@ -20,26 +20,11 @@ #include "ompi_config.h" #include "ompi/class/ompi_free_list.h" +#include "opal/include/opal/align.h" #include "opal/sys/cache.h" #include "opal/util/output.h" #include "ompi/mca/mpool/mpool.h" -static inline size_t align_to(size_t val, size_t alignment); -static inline size_t 
align_to(size_t val, size_t alignment) -{ - size_t mod; - - if(0 == alignment) - return val; - - mod = val % alignment; - - if(mod) - val += (alignment - mod); - - return val; -} - static void ompi_free_list_construct(ompi_free_list_t* fl); static void ompi_free_list_destruct(ompi_free_list_t* fl); @@ -49,6 +34,7 @@ OBJ_CLASS_INSTANCE(ompi_free_list_t, opal_atomic_lifo_t, struct ompi_free_list_memory_t { opal_list_item_t super; mca_mpool_base_registration_t *registration; + void *base_ptr; }; typedef struct ompi_free_list_memory_t ompi_free_list_memory_t; static OBJ_CLASS_INSTANCE(ompi_free_list_memory_t, @@ -69,7 +55,6 @@ static void ompi_free_list_construct(ompi_free_list_t* fl) fl->fl_num_waiting = 0; fl->fl_elem_size = sizeof(ompi_free_list_item_t); fl->fl_elem_class = OBJ_CLASS(ompi_free_list_item_t); - fl->fl_header_space = 0; fl->fl_alignment = 0; fl->fl_mpool = 0; OBJ_CONSTRUCT(&(fl->fl_allocations), opal_list_t); @@ -78,6 +63,7 @@ static void ompi_free_list_construct(ompi_free_list_t* fl) static void ompi_free_list_destruct(ompi_free_list_t* fl) { opal_list_item_t *item; + ompi_free_list_memory_t *fl_mem; #if 0 && OMPI_ENABLE_DEBUG if(opal_list_get_size(&fl->super) != fl->fl_num_allocated) { @@ -87,21 +73,15 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl) } #endif - if (NULL != fl->fl_mpool) { - ompi_free_list_memory_t *fl_mem; - - while (NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) { - /* destruct the item (we constructed it), then free the memory chunk */ - OBJ_DESTRUCT(item); - fl_mem = (ompi_free_list_memory_t*) item; - fl->fl_mpool->mpool_free(fl->fl_mpool, item, fl_mem->registration); - } - } else { - while (NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) { - /* destruct the item (we constructed it), then free the memory chunk */ - OBJ_DESTRUCT(item); - free(item); + while(NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) { + fl_mem = (ompi_free_list_memory_t*)item; + if(fl->fl_mpool != 
NULL) { + fl->fl_mpool->mpool_free(fl->fl_mpool, fl_mem->base_ptr, + fl_mem->registration); } + /* destruct the item (we constructed it), then free the memory chunk */ + OBJ_DESTRUCT(item); + free(item); } OBJ_DESTRUCT(&fl->fl_allocations); @@ -112,7 +92,6 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl) int ompi_free_list_init_ex( ompi_free_list_t *flist, size_t elem_size, - size_t header_space, size_t alignment, opal_class_t* elem_class, int num_elements_to_alloc, @@ -120,6 +99,10 @@ int ompi_free_list_init_ex( int num_elements_per_alloc, mca_mpool_base_module_t* mpool) { + /* alignment must be more than zero and power of two */ + if(alignment <= 1 || (alignment & (alignment - 1))) + return OMPI_ERROR; + if(elem_size > flist->fl_elem_size) flist->fl_elem_size = elem_size; if(elem_class) @@ -128,9 +111,7 @@ int ompi_free_list_init_ex( flist->fl_num_allocated = 0; flist->fl_num_per_alloc = num_elements_per_alloc; flist->fl_mpool = mpool; - flist->fl_header_space = header_space; flist->fl_alignment = alignment; - flist->fl_elem_size = align_to(flist->fl_elem_size, flist->fl_alignment); if(num_elements_to_alloc) return ompi_free_list_grow(flist, num_elements_to_alloc); return OMPI_SUCCESS; @@ -138,51 +119,67 @@ int ompi_free_list_init_ex( int ompi_free_list_grow(ompi_free_list_t* flist, size_t num_elements) { - unsigned char* ptr; + unsigned char *ptr, *mpool_alloc_ptr = NULL; ompi_free_list_memory_t *alloc_ptr; - size_t i, alloc_size; - mca_mpool_base_registration_t* user_out = NULL; + size_t i, alloc_size, head_size, elem_size = 0; + mca_mpool_base_registration_t *reg = NULL; - if (flist->fl_max_to_alloc > 0) - if (flist->fl_num_allocated + num_elements > flist->fl_max_to_alloc) + if(flist->fl_max_to_alloc > 0) + if(flist->fl_num_allocated + num_elements > flist->fl_max_to_alloc) num_elements = flist->fl_max_to_alloc - flist->fl_num_allocated; - if (num_elements == 0) + if(num_elements == 0) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - alloc_size = 
num_elements * flist->fl_elem_size + - sizeof(ompi_free_list_memory_t) + flist->fl_header_space + + head_size = (NULL == flist->fl_mpool) ? flist->fl_elem_size: + flist->fl_elem_class->cls_sizeof; + head_size = OPAL_ALIGN(head_size, flist->fl_alignment, size_t); + + /* calculate head allocation size */ + alloc_size = num_elements * head_size + sizeof(ompi_free_list_memory_t) + flist->fl_alignment; - if (NULL != flist->fl_mpool) - alloc_ptr = (ompi_free_list_memory_t*)flist->fl_mpool->mpool_alloc(flist->fl_mpool, - alloc_size, 0, MCA_MPOOL_FLAGS_CACHE_BYPASS, &user_out); - else - alloc_ptr = (ompi_free_list_memory_t*)malloc(alloc_size); + alloc_ptr = (ompi_free_list_memory_t*)malloc(alloc_size); if(NULL == alloc_ptr) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; - /* make the alloc_ptr a list item, save the chunk in the allocations list, and - have ptr point to memory right after the list item structure */ + /* allocate the rest from the mpool */ + if(flist->fl_mpool != NULL) { + elem_size = OPAL_ALIGN(flist->fl_elem_size - + flist->fl_elem_class->cls_sizeof, flist->fl_alignment, size_t); + if(elem_size != 0) { + mpool_alloc_ptr = flist->fl_mpool->mpool_alloc(flist->fl_mpool, + num_elements * elem_size, flist->fl_alignment, + MCA_MPOOL_FLAGS_CACHE_BYPASS, &reg); + if(NULL == mpool_alloc_ptr) { + free(alloc_ptr); + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } + } + } + + /* make the alloc_ptr a list item, save the chunk in the allocations list, + * and have ptr point to memory right after the list item structure */ OBJ_CONSTRUCT(alloc_ptr, ompi_free_list_memory_t); - opal_list_append(&(flist->fl_allocations), (opal_list_item_t*) alloc_ptr); + opal_list_append(&(flist->fl_allocations), (opal_list_item_t*)alloc_ptr); - alloc_ptr->registration = user_out; + alloc_ptr->registration = reg; + alloc_ptr->base_ptr = mpool_alloc_ptr; - ptr = (unsigned char*) alloc_ptr + sizeof(ompi_free_list_memory_t); - - ptr = (unsigned char*)(align_to((size_t)ptr + flist->fl_header_space, - 
flist->fl_alignment) - flist->fl_header_space); + ptr = (unsigned char*)alloc_ptr + sizeof(ompi_free_list_memory_t); + ptr = OPAL_ALIGN_PTR(ptr, flist->fl_alignment, unsigned char*); for(i=0; iuser_data = user_out; + item->registration = reg; + item->ptr = mpool_alloc_ptr; OBJ_CONSTRUCT_INTERNAL(item, flist->fl_elem_class); opal_atomic_lifo_push(&(flist->super), &(item->super)); - ptr += flist->fl_elem_size; + ptr += head_size; + mpool_alloc_ptr += elem_size; } flist->fl_num_allocated += num_elements; diff --git a/ompi/class/ompi_free_list.h b/ompi/class/ompi_free_list.h index fa15ede12e..f83c5cd2ec 100644 --- a/ompi/class/ompi_free_list.h +++ b/ompi/class/ompi_free_list.h @@ -40,7 +40,6 @@ struct ompi_free_list_t size_t fl_num_per_alloc; size_t fl_num_waiting; size_t fl_elem_size; - size_t fl_header_space; size_t fl_alignment; opal_class_t* fl_elem_class; struct mca_mpool_base_module_t* fl_mpool; @@ -51,10 +50,12 @@ struct ompi_free_list_t typedef struct ompi_free_list_t ompi_free_list_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_free_list_t); +struct mca_mpool_base_registration_t; struct ompi_free_list_item_t { opal_list_item_t super; - void* user_data; + struct mca_mpool_base_registration_t *registration; + void *ptr; }; typedef struct ompi_free_list_item_t ompi_free_list_item_t; @@ -75,7 +76,6 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_free_list_item_t); OMPI_DECLSPEC int ompi_free_list_init_ex( ompi_free_list_t *free_list, size_t element_size, - size_t header_size, size_t alignment, opal_class_t* element_class, int num_elements_to_alloc, @@ -92,7 +92,7 @@ static inline int ompi_free_list_init( int num_elements_per_alloc, struct mca_mpool_base_module_t* mpool) { - return ompi_free_list_init_ex(free_list, element_size, 0, CACHE_LINE_SIZE, + return ompi_free_list_init_ex(free_list, element_size, CACHE_LINE_SIZE, element_class, num_elements_to_alloc, max_elements_to_alloc, num_elements_per_alloc, mpool); } diff --git a/ompi/mca/btl/gm/btl_gm_frag.c 
b/ompi/mca/btl/gm/btl_gm_frag.c index c92d9dadbb..25c67aed06 100644 --- a/ompi/mca/btl/gm/btl_gm_frag.c +++ b/ompi/mca/btl/gm/btl_gm_frag.c @@ -29,7 +29,7 @@ do { \ static void mca_btl_gm_frag_eager_constructor(mca_btl_gm_frag_t* frag) { - frag->hdr = (mca_btl_base_header_t*)(frag + 1); + frag->hdr = (mca_btl_base_header_t*)frag->base.super.ptr; frag->segment.seg_addr.pval = (unsigned char*)(frag->hdr + 1); frag->segment.seg_len = mca_btl_gm_module.super.btl_eager_limit - sizeof(mca_btl_base_header_t); frag->size = mca_btl_gm_component.gm_eager_frag_size; diff --git a/ompi/mca/btl/mvapi/btl_mvapi_component.c b/ompi/mca/btl/mvapi/btl_mvapi_component.c index 2b5e3672d9..4d72f7114c 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_component.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_component.c @@ -583,7 +583,10 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules, 2*MCA_BTL_IB_FRAG_ALIGN; mvapi_btl->eager_rdma_frag_size = - length & ~(2 * MCA_BTL_IB_FRAG_ALIGN - 1); + (sizeof(mca_btl_mvapi_header_t) + + sizeof(mca_btl_mvapi_footer_t) + + mvapi_btl->super.btl_eager_limit + + 2*MCA_BTL_IB_FRAG_ALIGN) & ~(2 * MCA_BTL_IB_FRAG_ALIGN - 1); ompi_free_list_init(&mvapi_btl->send_free_eager, length, diff --git a/ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h b/ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h index dd33349a9f..25e5e2f225 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h +++ b/ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h @@ -18,9 +18,11 @@ extern "C" { #endif struct mca_btl_mvapi_reg_t; +struct mca_btl_mvapi_frag_t; struct mca_btl_mvapi_eager_rdma_local_t { ompi_ptr_t base; /**< buffer for RDMAing eager messages */ + struct mca_btl_mvapi_frag_t *frags; struct mca_btl_mvapi_reg_t *reg; uint16_t head; /**< RDMA buffer to poll */ uint16_t tail; /**< Needed for credit managment */ @@ -73,9 +75,7 @@ typedef struct mca_btl_mvapi_eager_rdma_remote_t mca_btl_mvapi_eager_rdma_remote }while (0) #define MCA_BTL_MVAPI_GET_LOCAL_RDMA_FRAG(E, I) \ - (mca_btl_mvapi_frag_t*) \ 
- ((char*)(E)->eager_rdma_local.base.pval + \ - (I) * (E)->endpoint_btl->eager_rdma_frag_size) + (&(E)->eager_rdma_local.frags[(I)]) #define MCA_BTL_MVAPI_RDMA_NEXT_INDEX(I) do { \ (I) = ((I) + 1) % \ diff --git a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c index 018a6069bc..48554be50e 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_endpoint.c @@ -165,10 +165,9 @@ static inline int mca_btl_mvapi_endpoint_post_send( #endif frag->desc.sr_desc.r_key = (VAPI_rkey_t)endpoint->eager_rdma_remote.rkey; frag->desc.sr_desc.remote_addr = (VAPI_virt_addr_t) - endpoint->eager_rdma_remote.base.lval + + (uintptr_t)endpoint->eager_rdma_remote.base.pval + endpoint->eager_rdma_remote.head * mvapi_btl->eager_rdma_frag_size + - sizeof(mca_btl_mvapi_frag_t) + sizeof(mca_btl_mvapi_header_t) + frag->size + sizeof(mca_btl_mvapi_footer_t); @@ -1223,12 +1222,20 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma( { mca_btl_mvapi_module_t* mvapi_btl = endpoint->endpoint_btl; char *buf; + mca_btl_mvapi_recv_frag_eager_t *headers_buf; unsigned int i; OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock); if (endpoint->eager_rdma_local.base.pval) goto unlock_rdma_local; + headers_buf = (mca_btl_mvapi_recv_frag_eager_t*) + malloc(sizeof(mca_btl_mvapi_recv_frag_eager_t) * + mca_btl_mvapi_component.eager_rdma_num); + + if(NULL == headers_buf) + goto unlock_rdma_local; + buf = mvapi_btl->super.btl_mpool->mpool_alloc(mvapi_btl->super.btl_mpool, mvapi_btl->eager_rdma_frag_size * mca_btl_mvapi_component.eager_rdma_num, 0, @@ -1236,12 +1243,13 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma( (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg); if(!buf) - goto unlock_rdma_local; + goto free_headers_buf; for(i = 0; i < mca_btl_mvapi_component.eager_rdma_num; i++) { - ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf + - i*mvapi_btl->eager_rdma_frag_size); - item->user_data = 
(void*)endpoint->eager_rdma_local.reg; + ompi_free_list_item_t *item; + item = (ompi_free_list_item_t *)&headers_buf[i]; + item->registration = (void*)endpoint->eager_rdma_local.reg; + item->ptr = buf + i * mvapi_btl->eager_rdma_frag_size; OBJ_CONSTRUCT(item, mca_btl_mvapi_recv_frag_eager_t); ((mca_btl_mvapi_frag_t*)item)->endpoint = endpoint; ((mca_btl_mvapi_frag_t*)item)->type = MCA_BTL_MVAPI_FRAG_EAGER_RDMA; @@ -1253,6 +1261,7 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma( goto cleanup; endpoint->eager_rdma_local.base.pval = buf; + endpoint->eager_rdma_local.frags = headers_buf; mvapi_btl->eager_rdma_buffers_count++; if (mca_btl_mvapi_endpoint_send_eager_rdma(endpoint) == 0) { OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock); @@ -1262,6 +1271,7 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma( mvapi_btl->eager_rdma_buffers_count--; endpoint->eager_rdma_local.base.pval = NULL; + endpoint->eager_rdma_local.frags = NULL; orte_pointer_array_set_item(mvapi_btl->eager_rdma_buffers, endpoint->eager_rdma_index, NULL); @@ -1269,6 +1279,8 @@ cleanup: OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock); mvapi_btl->super.btl_mpool->mpool_free(mvapi_btl->super.btl_mpool, buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg); +free_headers_buf: + free(headers_buf); unlock_rdma_local: OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock); } diff --git a/ompi/mca/btl/mvapi/btl_mvapi_frag.c b/ompi/mca/btl/mvapi/btl_mvapi_frag.c index 2ba288c95a..a0c38c32f9 100644 --- a/ompi/mca/btl/mvapi/btl_mvapi_frag.c +++ b/ompi/mca/btl/mvapi/btl_mvapi_frag.c @@ -23,8 +23,8 @@ static void mca_btl_mvapi_frag_common_constructor( mca_btl_mvapi_frag_t* frag) { mca_btl_mvapi_reg_t* mem_hndl = - (mca_btl_mvapi_reg_t*)frag->base.super.user_data; - frag->hdr = (mca_btl_mvapi_header_t*) (frag+1); /* initialize btl header to start at end of frag */ + (mca_btl_mvapi_reg_t*)frag->base.super.registration; + frag->hdr = (mca_btl_mvapi_header_t*)frag->base.super.ptr; 
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_mvapi_header_t); /* init the segment address to start after the btl header */ diff --git a/ompi/mca/btl/openib/btl_openib_component.c b/ompi/mca/btl/openib/btl_openib_component.c index c083d97f42..c354bcad57 100644 --- a/ompi/mca/btl/openib/btl_openib_component.c +++ b/ompi/mca/btl/openib/btl_openib_component.c @@ -234,23 +234,21 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl, case MCA_BTL_OPENIB_CONTROL_RDMA: rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)ctl_hdr; - BTL_VERBOSE(("prior to NTOH received rkey %lu, rdma_start.lval %llu, pval %p, ival %u, frag_t_len %llu\n", + BTL_VERBOSE(("prior to NTOH received rkey %lu, rdma_start.lval %llu, pval %p, ival %u\n", rdma_hdr->rkey, (unsigned long) rdma_hdr->rdma_start.lval, rdma_hdr->rdma_start.pval, - rdma_hdr->rdma_start.ival, - (unsigned long) rdma_hdr->frag_t_len + rdma_hdr->rdma_start.ival )); if(endpoint->nbo) { BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH((*rdma_hdr)); - BTL_VERBOSE(("received rkey %lu, rdma_start.lval %llu, pval %p, ival %u, frag_t_len %llu\n", + BTL_VERBOSE(("received rkey %lu, rdma_start.lval %llu, pval %p, ival %u\n", rdma_hdr->rkey, (unsigned long) rdma_hdr->rdma_start.lval, rdma_hdr->rdma_start.pval, - rdma_hdr->rdma_start.ival, - (unsigned long) rdma_hdr->frag_t_len + rdma_hdr->rdma_start.ival )); } @@ -261,7 +259,6 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl, } endpoint->eager_rdma_remote.rkey = rdma_hdr->rkey; endpoint->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval; - endpoint->eager_rdma_remote.frag_t_len = rdma_hdr->frag_t_len; endpoint->eager_rdma_remote.tokens = mca_btl_openib_component.eager_rdma_num - 1; break; @@ -699,17 +696,19 @@ btl_openib_component_init(int *num_btl_modules, openib_btl->super.btl_mpool = openib_btl->hca->mpool; /* Initialize pool of send fragments */ - length = sizeof(mca_btl_openib_frag_t) + + length = 
sizeof(mca_btl_openib_send_frag_eager_t) + sizeof(mca_btl_openib_header_t) + sizeof(mca_btl_openib_footer_t) + openib_btl->super.btl_eager_limit; - openib_btl->eager_rdma_frag_size = OPAL_ALIGN(length, - mca_btl_openib_component.buffer_alignment, int); + openib_btl->eager_rdma_frag_size = OPAL_ALIGN( + sizeof(mca_btl_openib_header_t) + + sizeof(mca_btl_openib_footer_t) + + openib_btl->super.btl_eager_limit, + mca_btl_openib_component.buffer_alignment, size_t); ompi_free_list_init_ex(&openib_btl->send_free_eager, length, - sizeof(mca_btl_openib_frag_t), mca_btl_openib_component.buffer_alignment, OBJ_CLASS(mca_btl_openib_send_frag_eager_t), mca_btl_openib_component.ib_free_list_num, @@ -717,9 +716,13 @@ btl_openib_component_init(int *num_btl_modules, mca_btl_openib_component.ib_free_list_inc, openib_btl->super.btl_mpool); + length = sizeof(mca_btl_openib_recv_frag_eager_t) + + sizeof(mca_btl_openib_header_t) + + sizeof(mca_btl_openib_footer_t) + + openib_btl->super.btl_eager_limit; + ompi_free_list_init_ex(&openib_btl->recv_free_eager, length, - sizeof(mca_btl_openib_frag_t), mca_btl_openib_component.buffer_alignment, OBJ_CLASS(mca_btl_openib_recv_frag_eager_t), mca_btl_openib_component.ib_free_list_num, @@ -727,13 +730,12 @@ btl_openib_component_init(int *num_btl_modules, mca_btl_openib_component.ib_free_list_inc, openib_btl->super.btl_mpool); - length = sizeof(mca_btl_openib_frag_t) + + length = sizeof(mca_btl_openib_send_frag_max_t) + sizeof(mca_btl_openib_header_t) + openib_btl->super.btl_max_send_size; ompi_free_list_init_ex(&openib_btl->send_free_max, length, - sizeof(mca_btl_openib_frag_t), mca_btl_openib_component.buffer_alignment, OBJ_CLASS(mca_btl_openib_send_frag_max_t), mca_btl_openib_component.ib_free_list_num, @@ -741,25 +743,27 @@ btl_openib_component_init(int *num_btl_modules, mca_btl_openib_component.ib_free_list_inc, openib_btl->super.btl_mpool); + length = sizeof(mca_btl_openib_recv_frag_max_t) + + sizeof(mca_btl_openib_header_t) + + 
openib_btl->super.btl_max_send_size; + /* Initialize pool of receive fragments */ ompi_free_list_init_ex(&openib_btl->recv_free_max, length, - sizeof(mca_btl_openib_frag_t), mca_btl_openib_component.buffer_alignment, - OBJ_CLASS (mca_btl_openib_recv_frag_max_t), + OBJ_CLASS(mca_btl_openib_recv_frag_max_t), mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, openib_btl->super.btl_mpool); - length = sizeof(mca_btl_openib_frag_t) + + length = sizeof(mca_btl_openib_send_frag_control_t) + sizeof(mca_btl_openib_header_t) + sizeof(mca_btl_openib_footer_t) + sizeof(mca_btl_openib_eager_rdma_header_t); ompi_free_list_init_ex(&openib_btl->send_free_control, length, - sizeof(mca_btl_openib_frag_t), mca_btl_openib_component.buffer_alignment, OBJ_CLASS(mca_btl_openib_send_frag_control_t), mca_btl_openib_component.ib_free_list_num, diff --git a/ompi/mca/btl/openib/btl_openib_eager_rdma.h b/ompi/mca/btl/openib/btl_openib_eager_rdma.h index 02a3f07af7..b39fda4611 100644 --- a/ompi/mca/btl/openib/btl_openib_eager_rdma.h +++ b/ompi/mca/btl/openib/btl_openib_eager_rdma.h @@ -19,6 +19,7 @@ extern "C" { struct mca_btl_openib_eager_rdma_local_t { ompi_ptr_t base; /**< buffer for RDMAing eager messages */ + mca_btl_openib_recv_frag_eager_t *frags; mca_btl_openib_reg_t *reg; uint16_t head; /**< RDMA buffer to poll */ uint16_t tail; /**< Needed for credit managment */ @@ -38,7 +39,6 @@ struct mca_btl_openib_eager_rdma_remote_t { #if OMPI_ENABLE_DEBUG uint32_t seq; #endif - uint64_t frag_t_len; /**< remote's sizeof(mca_btl_openib_frag_t) */ }; typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remote_t; @@ -72,10 +72,8 @@ typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remo ((volatile uint8_t*)(F)->u.buf)[3] = EAGER_RDMA_BUFFER_LOCAL; \ }while (0) -#define MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(E, I) \ - (mca_btl_openib_frag_t*) \ - 
((char*)(E)->eager_rdma_local.base.pval + \ - (I) * (E)->endpoint_btl->eager_rdma_frag_size) +#define MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(E, I) \ + (&(E)->eager_rdma_local.frags[(I)]) #define MCA_BTL_OPENIB_RDMA_NEXT_INDEX(I) do { \ (I) = ((I) + 1); \ diff --git a/ompi/mca/btl/openib/btl_openib_endpoint.c b/ompi/mca/btl/openib/btl_openib_endpoint.c index 74b3d4e955..71121a274a 100644 --- a/ompi/mca/btl/openib/btl_openib_endpoint.c +++ b/ompi/mca/btl/openib/btl_openib_endpoint.c @@ -173,7 +173,6 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope endpoint->eager_rdma_remote.base.lval + endpoint->eager_rdma_remote.head * openib_btl->eager_rdma_frag_size + - endpoint->eager_rdma_remote.frag_t_len + sizeof(mca_btl_openib_header_t) + mca_btl_openib_component.eager_limit + sizeof(mca_btl_openib_footer_t); @@ -1168,7 +1167,6 @@ static int mca_btl_openib_endpoint_send_eager_rdma( rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)frag->segment.seg_addr.pval; rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA; rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey; - rdma_hdr->frag_t_len = sizeof(mca_btl_openib_frag_t); rdma_hdr->rdma_start.lval = ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval); BTL_VERBOSE(("sending rkey %lu, rdma_start.lval %llu, pval %p, ival %u type %d and sizeof(rdma_hdr) %d\n", rdma_hdr->rkey, @@ -1204,6 +1202,7 @@ void mca_btl_openib_endpoint_connect_eager_rdma( { mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; char *buf; + mca_btl_openib_recv_frag_eager_t *headers_buf; unsigned int i; orte_std_cntr_t index; @@ -1213,6 +1212,13 @@ void mca_btl_openib_endpoint_connect_eager_rdma( (void*)1)) return; + headers_buf = (mca_btl_openib_recv_frag_eager_t*) + malloc(sizeof(mca_btl_openib_recv_frag_eager_t) * + mca_btl_openib_component.eager_rdma_num); + + if(NULL == headers_buf) + goto unlock_rdma_local; + buf = openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool, 
openib_btl->eager_rdma_frag_size * mca_btl_openib_component.eager_rdma_num, @@ -1221,22 +1227,24 @@ void mca_btl_openib_endpoint_connect_eager_rdma( (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg); if(!buf) - goto unlock_rdma_local; + goto free_headers_buf; buf = buf + openib_btl->eager_rdma_frag_size - sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit - - sizeof(mca_btl_openib_header_t) - - sizeof(mca_btl_openib_recv_frag_eager_t); + sizeof(mca_btl_openib_header_t); for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) { - ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf + - i*openib_btl->eager_rdma_frag_size); - item->user_data = (void*)endpoint->eager_rdma_local.reg; + ompi_free_list_item_t *item; + item = (ompi_free_list_item_t*)&headers_buf[i]; + item->registration = (void*)endpoint->eager_rdma_local.reg; + item->ptr = buf + i * openib_btl->eager_rdma_frag_size; OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_eager_t); ((mca_btl_openib_frag_t*)item)->endpoint = endpoint; ((mca_btl_openib_frag_t*)item)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA; } - + + endpoint->eager_rdma_local.frags = headers_buf; + /* set local rdma pointer to real value */ opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1, buf); @@ -1253,8 +1261,11 @@ void mca_btl_openib_endpoint_connect_eager_rdma( openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool, buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg); +free_headers_buf: + free(headers_buf); unlock_rdma_local: /* set local rdma pointer back to zero. 
Will retry later */ opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, endpoint->eager_rdma_local.base.pval, NULL); + endpoint->eager_rdma_local.frags = NULL; } diff --git a/ompi/mca/btl/openib/btl_openib_frag.c b/ompi/mca/btl/openib/btl_openib_frag.c index 5dad416599..39324d4ce2 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.c +++ b/ompi/mca/btl/openib/btl_openib_frag.c @@ -23,9 +23,9 @@ static void mca_btl_openib_frag_common_constructor( mca_btl_openib_frag_t* frag) { mca_btl_openib_reg_t* registration = - (mca_btl_openib_reg_t*)frag->base.super.user_data; + (mca_btl_openib_reg_t*)frag->base.super.registration; - frag->hdr = (mca_btl_openib_header_t*) (frag+1); /* initialize the btl header to start at end of frag */ + frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr; frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t); /* init the segment address to start after the btl header */ diff --git a/ompi/mca/btl/openib/btl_openib_frag.h b/ompi/mca/btl/openib/btl_openib_frag.h index 34a49565f0..d5f3152376 100644 --- a/ompi/mca/btl/openib/btl_openib_frag.h +++ b/ompi/mca/btl/openib/btl_openib_frag.h @@ -113,7 +113,6 @@ struct mca_btl_openib_eager_rdma_header_t { uint8_t padding[3]; uint32_t rkey; ompi_ptr_t rdma_start; - uint64_t frag_t_len; }; typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_header_t; @@ -121,14 +120,12 @@ typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_head do { \ h.rkey = htonl(h.rkey); \ h.rdma_start.lval = hton64(h.rdma_start.lval); \ - h.frag_t_len = hton64(h.frag_t_len); \ } while (0) #define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h) \ do { \ h.rkey = ntohl(h.rkey); \ h.rdma_start.lval = ntoh64(h.rdma_start.lval); \ - h.frag_t_len = ntoh64(h.frag_t_len); \ } while (0) diff --git a/ompi/mca/btl/openib/btl_openib_mca.c b/ompi/mca/btl/openib/btl_openib_mca.c index 03c6f7af0c..481a0e5825 100644 --- 
a/ompi/mca/btl/openib/btl_openib_mca.c +++ b/ompi/mca/btl/openib/btl_openib_mca.c @@ -23,6 +23,7 @@ #include #include "opal/util/output.h" +#include "opal/util/show_help.h" #include "opal/mca/base/mca_base_param.h" #include "btl_openib.h" #include "btl_openib_mca.h" @@ -337,9 +338,15 @@ int btl_openib_register_mca_params(void) CHECK(reg_int("buffer_alignment", "Prefered communication buffer alignment, in bytes " - "(must be >= 0)", + "(must be > 0 and power of two)", 64, &ival, REGINT_GE_ZERO)); - mca_btl_openib_component.buffer_alignment = (uint32_t) ival; + if(ival <= 1 || (ival & (ival - 1))) { + opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment", + true, ival, orte_system_info.nodename, 64); + mca_btl_openib_component.buffer_alignment = 64; + } else { + mca_btl_openib_component.buffer_alignment = (uint32_t) ival; + } CHECK(reg_int("eager_limit", "Eager send limit, in bytes " "(must be >= 1)", diff --git a/ompi/mca/btl/openib/help-mpi-btl-openib.txt b/ompi/mca/btl/openib/help-mpi-btl-openib.txt index 56d9244476..3b4e1f18a8 100644 --- a/ompi/mca/btl/openib/help-mpi-btl-openib.txt +++ b/ompi/mca/btl/openib/help-mpi-btl-openib.txt @@ -169,3 +169,6 @@ Please see this FAQ entry for more details: NOTE: You can turn off this warning by setting the MCA parameter btl_openib_warn_default_gid_prefix to 0. +[wrong buffer alignment] +Wrong buffer alignment %d configured on host '%s'. Should be bigger +than zero and power of two. Use default %d instead. 
diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index e503f84eb4..47cb94c97b 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -542,7 +542,8 @@ int mca_btl_sm_add_procs_same_base_addr( /* initialize fragment descriptor free lists */ /* allocation will be for the fragment descriptor and payload buffer */ - length=sizeof(mca_btl_sm_frag_t) + mca_btl_sm_component.eager_limit; + length = sizeof(mca_btl_sm_frag1_t) + sizeof(mca_btl_sm_hdr_t) + + mca_btl_sm_component.eager_limit; ompi_free_list_init(&mca_btl_sm_component.sm_frags1, length, OBJ_CLASS(mca_btl_sm_frag1_t), mca_btl_sm_component.sm_free_list_num, @@ -550,7 +551,8 @@ int mca_btl_sm_add_procs_same_base_addr( mca_btl_sm_component.sm_free_list_inc, mca_btl_sm_component.sm_mpool); /* use shared-memory pool */ - length=sizeof(mca_btl_sm_frag_t) + mca_btl_sm_component.max_frag_size; + length = sizeof(mca_btl_sm_frag2_t) + sizeof(mca_btl_sm_hdr_t) + + mca_btl_sm_component.max_frag_size; ompi_free_list_init(&mca_btl_sm_component.sm_frags2, length, OBJ_CLASS(mca_btl_sm_frag2_t), mca_btl_sm_component.sm_free_list_num, @@ -558,6 +560,14 @@ int mca_btl_sm_add_procs_same_base_addr( mca_btl_sm_component.sm_free_list_inc, mca_btl_sm_component.sm_mpool); /* use shared-memory pool */ + ompi_free_list_init(&mca_btl_sm_component.sm_frags, + sizeof(mca_btl_sm_frag_t), + OBJ_CLASS(mca_btl_sm_frag_t), + mca_btl_sm_component.sm_free_list_num, + -1, + mca_btl_sm_component.sm_free_list_inc, + NULL); + /* set up mca_btl_sm_component.list_smp_procs_same_base_addr */ mca_btl_sm_component.list_smp_procs_same_base_addr=(int *) malloc(mca_btl_sm_component.sm_max_procs*sizeof(int)); @@ -854,7 +864,9 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( max_data = frag->size - reserve; } iov.iov_len = max_data; - iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)(frag+1)) + reserve); + iov.iov_base = + (IOVBASE_TYPE*)(((unsigned char*)(frag->segment.seg_addr.pval)) + + reserve); rc = 
ompi_convertor_pack(convertor, &iov, &iov_count, &max_data ); if(rc < 0) { @@ -883,16 +895,14 @@ int mca_btl_sm_send( mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)descriptor; int rc; - frag->tag = tag; - frag->type = MCA_BTL_SM_FRAG_SEND; - frag->rc = OMPI_SUCCESS; + frag->hdr->u.s.len = frag->segment.seg_len; + frag->hdr->u.s.tag = tag; + frag->hdr->type = MCA_BTL_SM_FRAG_SEND; /* * post the descriptor in the queue - post with the relative * address */ - MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank, endpoint->peer_smp_rank, frag, rc); + MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank, endpoint->peer_smp_rank, frag->hdr, rc); return rc; } - - diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index d29ffcf535..badb4c1095 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -129,6 +129,7 @@ struct mca_btl_sm_component_t { * SMP specfic data structures. */ ompi_free_list_t sm_frags1; /**< free list of sm first */ ompi_free_list_t sm_frags2; /**< free list of sm second */ + ompi_free_list_t sm_frags; /**< free list of frags without data */ ompi_free_list_t sm_first_frags_to_progress; /**< list of first fragments that are awaiting resources */ diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index e73a7dad46..7bdf5845c7 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -165,6 +165,7 @@ int mca_btl_sm_component_open(void) /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_sm_component.sm_lock, opal_mutex_t); + OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags1, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags2, ompi_free_list_t); return OMPI_SUCCESS; @@ -342,6 +343,7 @@ int mca_btl_sm_component_progress(void) unsigned int peer_smp_rank ; mca_btl_sm_frag_t *frag; ompi_fifo_t *fifo = NULL; + mca_btl_sm_hdr_t *hdr; int my_smp_rank=mca_btl_sm_component.my_smp_rank; int proc; int 
rc = 0, btl = 0; @@ -377,7 +379,7 @@ int mca_btl_sm_component_progress(void) * that we have the same base address as the sender, so no * translation is necessary when accessing the fifo. Hence, * we use the _same_base_addr varient. */ - frag = (mca_btl_sm_frag_t *) + hdr = (mca_btl_sm_hdr_t *) ompi_fifo_read_from_tail_same_base_addr( fifo ); /* release thread lock */ @@ -385,26 +387,33 @@ int mca_btl_sm_component_progress(void) opal_atomic_unlock(&(fifo->tail_lock)); } - if( OMPI_CB_FREE == frag ) { + if( OMPI_CB_FREE == hdr ) { continue; } /* dispatch fragment by type */ - switch(frag->type) { + switch(hdr->type) { case MCA_BTL_SM_FRAG_ACK: { + frag = hdr->frag; /* completion callback */ - frag->base.des_cbfunc(&mca_btl_sm[0].super, frag->endpoint, &frag->base, frag->rc); + frag->base.des_cbfunc(&mca_btl_sm[0].super, frag->endpoint, &frag->base, hdr->u.rc); break; } case MCA_BTL_SM_FRAG_SEND: { /* recv upcall */ - mca_btl_sm_recv_reg_t* reg = mca_btl_sm[0].sm_reg + frag->tag; - reg->cbfunc(&mca_btl_sm[0].super,frag->tag,&frag->base,reg->cbdata); - frag->type = MCA_BTL_SM_FRAG_ACK; + mca_btl_sm_recv_reg_t* reg = mca_btl_sm[0].sm_reg + hdr->u.s.tag; + MCA_BTL_SM_FRAG_ALLOC(frag, rc); + frag->segment.seg_addr.pval = ((char*)hdr) + + sizeof(mca_btl_sm_hdr_t); + frag->segment.seg_len = hdr->u.s.len; + reg->cbfunc(&mca_btl_sm[0].super,hdr->u.s.tag,&frag->base,reg->cbdata); + MCA_BTL_SM_FRAG_RETURN(frag); + hdr->type = MCA_BTL_SM_FRAG_ACK; + hdr->u.rc = OMPI_SUCCESS; MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], - my_smp_rank, peer_smp_rank, frag, rc ); + my_smp_rank, peer_smp_rank, hdr, rc ); if(OMPI_SUCCESS != rc) goto err; break; @@ -412,10 +421,10 @@ int mca_btl_sm_component_progress(void) default: { /* unknown */ - frag->rc = OMPI_ERROR; - frag->type = MCA_BTL_SM_FRAG_ACK; + hdr->u.rc = OMPI_ERROR; + hdr->type = MCA_BTL_SM_FRAG_ACK; MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], - my_smp_rank, peer_smp_rank, frag, rc ); + 
my_smp_rank, peer_smp_rank, hdr, rc ); if(OMPI_SUCCESS != rc) goto err; break; @@ -450,9 +459,9 @@ int mca_btl_sm_component_progress(void) * translate every access into the fifo to be relevant to our * memory space. Hence, we do *not* use the _same_base_addr * variant. */ - frag=(mca_btl_sm_frag_t *)ompi_fifo_read_from_tail( fifo, + hdr=(mca_btl_sm_hdr_t *)ompi_fifo_read_from_tail( fifo, mca_btl_sm_component.sm_offset[peer_smp_rank]); - if( OMPI_CB_FREE == frag ) { + if( OMPI_CB_FREE == hdr ) { /* release thread lock */ if( opal_using_threads() ) { opal_atomic_unlock(&(fifo->tail_lock)); @@ -467,37 +476,32 @@ int mca_btl_sm_component_progress(void) /* change the address from address relative to the shared * memory address, to a true virtual address */ - frag = (mca_btl_sm_frag_t *)( (char *)frag + + hdr = (mca_btl_sm_hdr_t *)( (char *)hdr + mca_btl_sm_component.sm_offset[peer_smp_rank]); /* dispatch fragment by type */ - switch(frag->type) { + switch(hdr->type) { case MCA_BTL_SM_FRAG_ACK: { + frag = hdr->frag; /* completion callback */ - frag->base.des_src = - ( mca_btl_base_segment_t* )((ptrdiff_t)frag->base.des_dst + mca_btl_sm_component.sm_offset[peer_smp_rank]); - frag->base.des_src->seg_addr.pval = (void*) - ((ptrdiff_t)frag->base.des_src->seg_addr.pval + - mca_btl_sm_component.sm_offset[peer_smp_rank]); - frag->base.des_dst = frag->base.des_src; - frag->base.des_cbfunc(&mca_btl_sm[1].super, frag->endpoint, &frag->base, frag->rc); + frag->base.des_cbfunc(&mca_btl_sm[1].super, frag->endpoint, &frag->base, hdr->u.rc); break; } case MCA_BTL_SM_FRAG_SEND: { /* recv upcall */ - mca_btl_sm_recv_reg_t* reg = mca_btl_sm[1].sm_reg + frag->tag; - frag->base.des_dst = (mca_btl_base_segment_t*) - ((ptrdiff_t)frag->base.des_src + mca_btl_sm_component.sm_offset[peer_smp_rank]); - frag->base.des_dst->seg_addr.pval = (void*) - ((ptrdiff_t)frag->base.des_dst->seg_addr.pval + - mca_btl_sm_component.sm_offset[peer_smp_rank]); - frag->base.des_src = frag->base.des_dst; - 
reg->cbfunc(&mca_btl_sm[1].super,frag->tag,&frag->base,reg->cbdata); - frag->type = MCA_BTL_SM_FRAG_ACK; + mca_btl_sm_recv_reg_t* reg = mca_btl_sm[1].sm_reg + hdr->u.s.tag; + MCA_BTL_SM_FRAG_ALLOC(frag, rc); + frag->segment.seg_addr.pval = ((char*)hdr) + + sizeof(mca_btl_sm_hdr_t); + frag->segment.seg_len = hdr->u.s.len; + reg->cbfunc(&mca_btl_sm[1].super,hdr->u.s.tag,&frag->base,reg->cbdata); + MCA_BTL_SM_FRAG_RETURN(frag); + hdr->type = MCA_BTL_SM_FRAG_ACK; + hdr->u.rc = OMPI_SUCCESS; MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], - my_smp_rank, peer_smp_rank, frag, rc ); + my_smp_rank, peer_smp_rank, hdr, rc ); if(OMPI_SUCCESS != rc) goto err; break; @@ -505,10 +509,10 @@ int mca_btl_sm_component_progress(void) default: { /* unknown */ - frag->rc = OMPI_ERROR; - frag->type = MCA_BTL_SM_FRAG_ACK; + hdr->u.rc = OMPI_ERROR; + hdr->type = MCA_BTL_SM_FRAG_ACK; MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank], - my_smp_rank, peer_smp_rank, frag, rc ); + my_smp_rank, peer_smp_rank, hdr, rc ); if(OMPI_SUCCESS != rc) goto err; break; diff --git a/ompi/mca/btl/sm/btl_sm_fifo.h b/ompi/mca/btl/sm/btl_sm_fifo.h index b14df7a29c..c202759e2c 100644 --- a/ompi/mca/btl/sm/btl_sm_fifo.h +++ b/ompi/mca/btl/sm/btl_sm_fifo.h @@ -4,7 +4,7 @@ #include "btl_sm.h" #include "btl_sm_endpoint.h" -#define MCA_BTL_SM_FIFO_WRITE(endpoint_peer, my_smp_rank,peer_smp_rank,frag,rc) \ +#define MCA_BTL_SM_FIFO_WRITE(endpoint_peer, my_smp_rank,peer_smp_rank,hdr,rc) \ do { \ ompi_fifo_t* fifo; \ fifo=&(mca_btl_sm_component.fifo[my_smp_rank][peer_smp_rank]); \ @@ -29,7 +29,7 @@ do { \ } \ \ /* post fragment */ \ - while(ompi_fifo_write_to_head_same_base_addr(frag, fifo, \ + while(ompi_fifo_write_to_head_same_base_addr(hdr, fifo, \ mca_btl_sm_component.sm_mpool) != OMPI_SUCCESS) \ opal_progress(); \ MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \ diff --git a/ompi/mca/btl/sm/btl_sm_frag.c b/ompi/mca/btl/sm/btl_sm_frag.c index 1097213ed5..378f31b7b5 100644 --- 
a/ompi/mca/btl/sm/btl_sm_frag.c +++ b/ompi/mca/btl/sm/btl_sm_frag.c @@ -19,9 +19,12 @@ #include "btl_sm_frag.h" -static inline void mca_btl_sm_frag_constructor(mca_btl_sm_frag_t* frag) +static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag) { - frag->segment.seg_addr.pval = frag+1; + frag->hdr = frag->base.super.ptr; + if(frag->hdr != NULL) + frag->hdr->frag = frag; + frag->segment.seg_addr.pval = ((char*)frag->hdr) + sizeof(mca_btl_sm_hdr_t); frag->segment.seg_len = frag->size; frag->base.des_src = &frag->segment; frag->base.des_src_cnt = 1; @@ -30,18 +33,25 @@ static inline void mca_btl_sm_frag_constructor(mca_btl_sm_frag_t* frag) frag->base.des_flags = 0; } +static void mca_btl_sm_frag_constructor(mca_btl_sm_frag_t* frag) +{ + frag->size = 0; + frag->my_list = &mca_btl_sm_component.sm_frags; + mca_btl_sm_frag_common_constructor(frag); +} + static void mca_btl_sm_frag1_constructor(mca_btl_sm_frag_t* frag) { frag->size = mca_btl_sm_component.eager_limit; frag->my_list = &mca_btl_sm_component.sm_frags1; - mca_btl_sm_frag_constructor(frag); + mca_btl_sm_frag_common_constructor(frag); } static void mca_btl_sm_frag2_constructor(mca_btl_sm_frag_t* frag) { frag->size = mca_btl_sm_component.max_frag_size; frag->my_list = &mca_btl_sm_component.sm_frags2; - mca_btl_sm_frag_constructor(frag); + mca_btl_sm_frag_common_constructor(frag); } static void mca_btl_sm_frag_destructor(mca_btl_sm_frag_t* frag) diff --git a/ompi/mca/btl/sm/btl_sm_frag.h b/ompi/mca/btl/sm/btl_sm_frag.h index 6ba6b44843..18b6aeb83d 100644 --- a/ompi/mca/btl/sm/btl_sm_frag.h +++ b/ompi/mca/btl/sm/btl_sm_frag.h @@ -27,13 +27,31 @@ #include "btl_sm.h" -typedef enum { +/*typedef enum { MCA_BTL_SM_FRAG_SEND, MCA_BTL_SM_FRAG_PUT, MCA_BTL_SM_FRAG_GET, MCA_BTL_SM_FRAG_ACK -} mca_btl_sm_frag_type_t; +} mca_btl_sm_frag_type_t; */ +#define MCA_BTL_SM_FRAG_SEND 0 +#define MCA_BTL_SM_FRAG_ACK 1 + +typedef uint8_t mca_btl_sm_frag_type_t; +struct mca_btl_sm_frag_t; + +struct mca_btl_sm_hdr_t { 
+ struct mca_btl_sm_frag_t *frag; + union { + struct { + size_t len; + mca_btl_base_tag_t tag; + } s; + int rc; + } u; + mca_btl_sm_frag_type_t type; +}; +typedef struct mca_btl_sm_hdr_t mca_btl_sm_hdr_t; /** * shared memory send fragment derived type. @@ -42,10 +60,8 @@ struct mca_btl_sm_frag_t { mca_btl_base_descriptor_t base; mca_btl_base_segment_t segment; struct mca_btl_base_endpoint_t *endpoint; - mca_btl_sm_frag_type_t type; - mca_btl_base_tag_t tag; size_t size; - int rc; + mca_btl_sm_hdr_t *hdr; ompi_free_list_t* my_list; }; typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t; @@ -56,6 +72,13 @@ OBJ_CLASS_DECLARATION(mca_btl_sm_frag_t); OBJ_CLASS_DECLARATION(mca_btl_sm_frag1_t); OBJ_CLASS_DECLARATION(mca_btl_sm_frag2_t); +#define MCA_BTL_SM_FRAG_ALLOC(frag, rc) \ +{ \ + ompi_free_list_item_t* item; \ + OMPI_FREE_LIST_WAIT(&mca_btl_sm_component.sm_frags, item, rc); \ + frag = (mca_btl_sm_frag_t*)item; \ +} + #define MCA_BTL_SM_FRAG_ALLOC1(frag, rc) \ { \ ompi_free_list_item_t* item; \ diff --git a/ompi/mca/btl/udapl/btl_udapl_endpoint.c b/ompi/mca/btl/udapl/btl_udapl_endpoint.c index ecf0b42a51..6efec3d3dc 100644 --- a/ompi/mca/btl/udapl/btl_udapl_endpoint.c +++ b/ompi/mca/btl/udapl/btl_udapl_endpoint.c @@ -1127,7 +1127,8 @@ void mca_btl_udapl_endpoint_connect_eager_rdma( mca_btl_udapl_frag_eager_rdma_t* local_rdma_frag; ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf + i*mca_btl_udapl_component.udapl_eager_rdma_frag_size); - item->user_data = endpoint->endpoint_eager_rdma_local.reg; + item->registration = (void*)endpoint->endpoint_eager_rdma_local.reg; + item->ptr = buf + i * mca_btl_udapl_component.udapl_eager_rdma_frag_size; OBJ_CONSTRUCT(item, mca_btl_udapl_frag_eager_rdma_t); local_rdma_frag = ((mca_btl_udapl_frag_eager_rdma_t*)item); diff --git a/ompi/mca/btl/udapl/btl_udapl_frag.c b/ompi/mca/btl/udapl/btl_udapl_frag.c index 4c67dc139d..69c361285e 100644 --- a/ompi/mca/btl/udapl/btl_udapl_frag.c +++ b/ompi/mca/btl/udapl/btl_udapl_frag.c @@ 
-26,7 +26,8 @@ static void mca_btl_udapl_frag_common_constructor(mca_btl_udapl_frag_t* frag) { - mca_btl_udapl_reg_t* reg = (mca_btl_udapl_reg_t*)frag->base.super.user_data; + mca_btl_udapl_reg_t* reg = + (mca_btl_udapl_reg_t*)frag->base.super.registration; #if OMPI_ENABLE_DEBUG frag->base.des_src = NULL; @@ -37,7 +38,7 @@ static void mca_btl_udapl_frag_common_constructor(mca_btl_udapl_frag_t* frag) #endif frag->registration = reg; - frag->segment.seg_addr.pval = (unsigned char*)(frag + 1); + frag->segment.seg_addr.pval = (unsigned char*)frag->base.super.ptr; frag->ftr = NULL; /* Don't understand why yet, but there are cases where reg is NULL - diff --git a/opal/include/opal/align.h b/opal/include/opal/align.h index 00c11f4c27..c5db49033e 100644 --- a/opal/include/opal/align.h +++ b/opal/include/opal/align.h @@ -22,5 +22,6 @@ #define OPAL_ALIGN_H #define OPAL_ALIGN(x,a,t) (((x)+((t)(a)-1)) & ~(((t)(a)-1))) +#define OPAL_ALIGN_PTR(x,a,t) ((t)OPAL_ALIGN((uintptr_t)x, a, uintptr_t)) #endif /* OPAL_ALIGN_H */