When frags are allocated from an mpool by free_list, the frag structure is also
allocated from mpool memory (which is registered memory for RDMA transports). This is not a problem for small jobs, but for a large number of ranks the amount of wasted memory is significant. This commit was SVN r13921.
Этот коммит содержится в:
родитель
e932d9a695
Коммит
90fb58de4f
@ -20,26 +20,11 @@
|
||||
#include "ompi_config.h"
|
||||
|
||||
#include "ompi/class/ompi_free_list.h"
|
||||
#include "opal/include/opal/align.h"
|
||||
#include "opal/sys/cache.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "ompi/mca/mpool/mpool.h"
|
||||
|
||||
static inline size_t align_to(size_t val, size_t alignment);
|
||||
static inline size_t align_to(size_t val, size_t alignment)
|
||||
{
|
||||
size_t mod;
|
||||
|
||||
if(0 == alignment)
|
||||
return val;
|
||||
|
||||
mod = val % alignment;
|
||||
|
||||
if(mod)
|
||||
val += (alignment - mod);
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
static void ompi_free_list_construct(ompi_free_list_t* fl);
|
||||
static void ompi_free_list_destruct(ompi_free_list_t* fl);
|
||||
|
||||
@ -49,6 +34,7 @@ OBJ_CLASS_INSTANCE(ompi_free_list_t, opal_atomic_lifo_t,
|
||||
struct ompi_free_list_memory_t {
|
||||
opal_list_item_t super;
|
||||
mca_mpool_base_registration_t *registration;
|
||||
void *base_ptr;
|
||||
};
|
||||
typedef struct ompi_free_list_memory_t ompi_free_list_memory_t;
|
||||
static OBJ_CLASS_INSTANCE(ompi_free_list_memory_t,
|
||||
@ -69,7 +55,6 @@ static void ompi_free_list_construct(ompi_free_list_t* fl)
|
||||
fl->fl_num_waiting = 0;
|
||||
fl->fl_elem_size = sizeof(ompi_free_list_item_t);
|
||||
fl->fl_elem_class = OBJ_CLASS(ompi_free_list_item_t);
|
||||
fl->fl_header_space = 0;
|
||||
fl->fl_alignment = 0;
|
||||
fl->fl_mpool = 0;
|
||||
OBJ_CONSTRUCT(&(fl->fl_allocations), opal_list_t);
|
||||
@ -78,6 +63,7 @@ static void ompi_free_list_construct(ompi_free_list_t* fl)
|
||||
static void ompi_free_list_destruct(ompi_free_list_t* fl)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
ompi_free_list_memory_t *fl_mem;
|
||||
|
||||
#if 0 && OMPI_ENABLE_DEBUG
|
||||
if(opal_list_get_size(&fl->super) != fl->fl_num_allocated) {
|
||||
@ -87,21 +73,15 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl)
|
||||
}
|
||||
#endif
|
||||
|
||||
if (NULL != fl->fl_mpool) {
|
||||
ompi_free_list_memory_t *fl_mem;
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) {
|
||||
/* destruct the item (we constructed it), then free the memory chunk */
|
||||
OBJ_DESTRUCT(item);
|
||||
fl_mem = (ompi_free_list_memory_t*) item;
|
||||
fl->fl_mpool->mpool_free(fl->fl_mpool, item, fl_mem->registration);
|
||||
}
|
||||
} else {
|
||||
while (NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) {
|
||||
/* destruct the item (we constructed it), then free the memory chunk */
|
||||
OBJ_DESTRUCT(item);
|
||||
free(item);
|
||||
while(NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) {
|
||||
fl_mem = (ompi_free_list_memory_t*)item;
|
||||
if(fl->fl_mpool != NULL) {
|
||||
fl->fl_mpool->mpool_free(fl->fl_mpool, fl_mem->base_ptr,
|
||||
fl_mem->registration);
|
||||
}
|
||||
/* destruct the item (we constructed it), then free the memory chunk */
|
||||
OBJ_DESTRUCT(item);
|
||||
free(item);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&fl->fl_allocations);
|
||||
@ -112,7 +92,6 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl)
|
||||
int ompi_free_list_init_ex(
|
||||
ompi_free_list_t *flist,
|
||||
size_t elem_size,
|
||||
size_t header_space,
|
||||
size_t alignment,
|
||||
opal_class_t* elem_class,
|
||||
int num_elements_to_alloc,
|
||||
@ -120,6 +99,10 @@ int ompi_free_list_init_ex(
|
||||
int num_elements_per_alloc,
|
||||
mca_mpool_base_module_t* mpool)
|
||||
{
|
||||
/* alignment must be more than zero and power of two */
|
||||
if(alignment <= 1 || (alignment & (alignment - 1)))
|
||||
return OMPI_ERROR;
|
||||
|
||||
if(elem_size > flist->fl_elem_size)
|
||||
flist->fl_elem_size = elem_size;
|
||||
if(elem_class)
|
||||
@ -128,9 +111,7 @@ int ompi_free_list_init_ex(
|
||||
flist->fl_num_allocated = 0;
|
||||
flist->fl_num_per_alloc = num_elements_per_alloc;
|
||||
flist->fl_mpool = mpool;
|
||||
flist->fl_header_space = header_space;
|
||||
flist->fl_alignment = alignment;
|
||||
flist->fl_elem_size = align_to(flist->fl_elem_size, flist->fl_alignment);
|
||||
if(num_elements_to_alloc)
|
||||
return ompi_free_list_grow(flist, num_elements_to_alloc);
|
||||
return OMPI_SUCCESS;
|
||||
@ -138,51 +119,67 @@ int ompi_free_list_init_ex(
|
||||
|
||||
int ompi_free_list_grow(ompi_free_list_t* flist, size_t num_elements)
|
||||
{
|
||||
unsigned char* ptr;
|
||||
unsigned char *ptr, *mpool_alloc_ptr = NULL;
|
||||
ompi_free_list_memory_t *alloc_ptr;
|
||||
size_t i, alloc_size;
|
||||
mca_mpool_base_registration_t* user_out = NULL;
|
||||
size_t i, alloc_size, head_size, elem_size = 0;
|
||||
mca_mpool_base_registration_t *reg = NULL;
|
||||
|
||||
if (flist->fl_max_to_alloc > 0)
|
||||
if (flist->fl_num_allocated + num_elements > flist->fl_max_to_alloc)
|
||||
if(flist->fl_max_to_alloc > 0)
|
||||
if(flist->fl_num_allocated + num_elements > flist->fl_max_to_alloc)
|
||||
num_elements = flist->fl_max_to_alloc - flist->fl_num_allocated;
|
||||
|
||||
if (num_elements == 0)
|
||||
if(num_elements == 0)
|
||||
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
|
||||
alloc_size = num_elements * flist->fl_elem_size +
|
||||
sizeof(ompi_free_list_memory_t) + flist->fl_header_space +
|
||||
head_size = (NULL == flist->fl_mpool) ? flist->fl_elem_size:
|
||||
flist->fl_elem_class->cls_sizeof;
|
||||
head_size = OPAL_ALIGN(head_size, flist->fl_alignment, size_t);
|
||||
|
||||
/* calculate head allocation size */
|
||||
alloc_size = num_elements * head_size + sizeof(ompi_free_list_memory_t) +
|
||||
flist->fl_alignment;
|
||||
|
||||
if (NULL != flist->fl_mpool)
|
||||
alloc_ptr = (ompi_free_list_memory_t*)flist->fl_mpool->mpool_alloc(flist->fl_mpool,
|
||||
alloc_size, 0, MCA_MPOOL_FLAGS_CACHE_BYPASS, &user_out);
|
||||
else
|
||||
alloc_ptr = (ompi_free_list_memory_t*)malloc(alloc_size);
|
||||
alloc_ptr = (ompi_free_list_memory_t*)malloc(alloc_size);
|
||||
|
||||
if(NULL == alloc_ptr)
|
||||
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
|
||||
/* make the alloc_ptr a list item, save the chunk in the allocations list, and
|
||||
have ptr point to memory right after the list item structure */
|
||||
/* allocate the rest from the mpool */
|
||||
if(flist->fl_mpool != NULL) {
|
||||
elem_size = OPAL_ALIGN(flist->fl_elem_size -
|
||||
flist->fl_elem_class->cls_sizeof, flist->fl_alignment, size_t);
|
||||
if(elem_size != 0) {
|
||||
mpool_alloc_ptr = flist->fl_mpool->mpool_alloc(flist->fl_mpool,
|
||||
num_elements * elem_size, flist->fl_alignment,
|
||||
MCA_MPOOL_FLAGS_CACHE_BYPASS, ®);
|
||||
if(NULL == mpool_alloc_ptr) {
|
||||
free(alloc_ptr);
|
||||
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* make the alloc_ptr a list item, save the chunk in the allocations list,
|
||||
* and have ptr point to memory right after the list item structure */
|
||||
OBJ_CONSTRUCT(alloc_ptr, ompi_free_list_memory_t);
|
||||
opal_list_append(&(flist->fl_allocations), (opal_list_item_t*) alloc_ptr);
|
||||
opal_list_append(&(flist->fl_allocations), (opal_list_item_t*)alloc_ptr);
|
||||
|
||||
alloc_ptr->registration = user_out;
|
||||
alloc_ptr->registration = reg;
|
||||
alloc_ptr->base_ptr = mpool_alloc_ptr;
|
||||
|
||||
ptr = (unsigned char*) alloc_ptr + sizeof(ompi_free_list_memory_t);
|
||||
|
||||
ptr = (unsigned char*)(align_to((size_t)ptr + flist->fl_header_space,
|
||||
flist->fl_alignment) - flist->fl_header_space);
|
||||
ptr = (unsigned char*)alloc_ptr + sizeof(ompi_free_list_memory_t);
|
||||
ptr = OPAL_ALIGN_PTR(ptr, flist->fl_alignment, unsigned char*);
|
||||
|
||||
for(i=0; i<num_elements; i++) {
|
||||
ompi_free_list_item_t* item = (ompi_free_list_item_t*)ptr;
|
||||
item->user_data = user_out;
|
||||
item->registration = reg;
|
||||
item->ptr = mpool_alloc_ptr;
|
||||
|
||||
OBJ_CONSTRUCT_INTERNAL(item, flist->fl_elem_class);
|
||||
|
||||
opal_atomic_lifo_push(&(flist->super), &(item->super));
|
||||
ptr += flist->fl_elem_size;
|
||||
ptr += head_size;
|
||||
mpool_alloc_ptr += elem_size;
|
||||
}
|
||||
|
||||
flist->fl_num_allocated += num_elements;
|
||||
|
@ -40,7 +40,6 @@ struct ompi_free_list_t
|
||||
size_t fl_num_per_alloc;
|
||||
size_t fl_num_waiting;
|
||||
size_t fl_elem_size;
|
||||
size_t fl_header_space;
|
||||
size_t fl_alignment;
|
||||
opal_class_t* fl_elem_class;
|
||||
struct mca_mpool_base_module_t* fl_mpool;
|
||||
@ -51,10 +50,12 @@ struct ompi_free_list_t
|
||||
typedef struct ompi_free_list_t ompi_free_list_t;
|
||||
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_free_list_t);
|
||||
|
||||
struct mca_mpool_base_registration_t;
|
||||
struct ompi_free_list_item_t
|
||||
{
|
||||
opal_list_item_t super;
|
||||
void* user_data;
|
||||
struct mca_mpool_base_registration_t *registration;
|
||||
void *ptr;
|
||||
};
|
||||
typedef struct ompi_free_list_item_t ompi_free_list_item_t;
|
||||
|
||||
@ -75,7 +76,6 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_free_list_item_t);
|
||||
OMPI_DECLSPEC int ompi_free_list_init_ex(
|
||||
ompi_free_list_t *free_list,
|
||||
size_t element_size,
|
||||
size_t header_size,
|
||||
size_t alignment,
|
||||
opal_class_t* element_class,
|
||||
int num_elements_to_alloc,
|
||||
@ -92,7 +92,7 @@ static inline int ompi_free_list_init(
|
||||
int num_elements_per_alloc,
|
||||
struct mca_mpool_base_module_t* mpool)
|
||||
{
|
||||
return ompi_free_list_init_ex(free_list, element_size, 0, CACHE_LINE_SIZE,
|
||||
return ompi_free_list_init_ex(free_list, element_size, CACHE_LINE_SIZE,
|
||||
element_class, num_elements_to_alloc, max_elements_to_alloc,
|
||||
num_elements_per_alloc, mpool);
|
||||
}
|
||||
|
@ -29,7 +29,7 @@ do { \
|
||||
|
||||
static void mca_btl_gm_frag_eager_constructor(mca_btl_gm_frag_t* frag)
|
||||
{
|
||||
frag->hdr = (mca_btl_base_header_t*)(frag + 1);
|
||||
frag->hdr = (mca_btl_base_header_t*)frag->base.super.ptr;
|
||||
frag->segment.seg_addr.pval = (unsigned char*)(frag->hdr + 1);
|
||||
frag->segment.seg_len = mca_btl_gm_module.super.btl_eager_limit - sizeof(mca_btl_base_header_t);
|
||||
frag->size = mca_btl_gm_component.gm_eager_frag_size;
|
||||
|
@ -583,7 +583,10 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
|
||||
2*MCA_BTL_IB_FRAG_ALIGN;
|
||||
|
||||
mvapi_btl->eager_rdma_frag_size =
|
||||
length & ~(2 * MCA_BTL_IB_FRAG_ALIGN - 1);
|
||||
(sizeof(mca_btl_mvapi_header_t) +
|
||||
sizeof(mca_btl_mvapi_footer_t) +
|
||||
mvapi_btl->super.btl_eager_limit +
|
||||
2*MCA_BTL_IB_FRAG_ALIGN) & ~(2 * MCA_BTL_IB_FRAG_ALIGN - 1);
|
||||
|
||||
ompi_free_list_init(&mvapi_btl->send_free_eager,
|
||||
length,
|
||||
|
@ -18,9 +18,11 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
struct mca_btl_mvapi_reg_t;
|
||||
struct mca_btl_mvapi_frag_t;
|
||||
|
||||
struct mca_btl_mvapi_eager_rdma_local_t {
|
||||
ompi_ptr_t base; /**< buffer for RDMAing eager messages */
|
||||
struct mca_btl_mvapi_frag_t *frags;
|
||||
struct mca_btl_mvapi_reg_t *reg;
|
||||
uint16_t head; /**< RDMA buffer to poll */
|
||||
uint16_t tail; /**< Needed for credit managment */
|
||||
@ -73,9 +75,7 @@ typedef struct mca_btl_mvapi_eager_rdma_remote_t mca_btl_mvapi_eager_rdma_remote
|
||||
}while (0)
|
||||
|
||||
#define MCA_BTL_MVAPI_GET_LOCAL_RDMA_FRAG(E, I) \
|
||||
(mca_btl_mvapi_frag_t*) \
|
||||
((char*)(E)->eager_rdma_local.base.pval + \
|
||||
(I) * (E)->endpoint_btl->eager_rdma_frag_size)
|
||||
(&(E)->eager_rdma_local.frags[(I)])
|
||||
|
||||
#define MCA_BTL_MVAPI_RDMA_NEXT_INDEX(I) do { \
|
||||
(I) = ((I) + 1) % \
|
||||
|
@ -165,10 +165,9 @@ static inline int mca_btl_mvapi_endpoint_post_send(
|
||||
#endif
|
||||
frag->desc.sr_desc.r_key = (VAPI_rkey_t)endpoint->eager_rdma_remote.rkey;
|
||||
frag->desc.sr_desc.remote_addr = (VAPI_virt_addr_t)
|
||||
endpoint->eager_rdma_remote.base.lval +
|
||||
(uintptr_t)endpoint->eager_rdma_remote.base.pval +
|
||||
endpoint->eager_rdma_remote.head *
|
||||
mvapi_btl->eager_rdma_frag_size +
|
||||
sizeof(mca_btl_mvapi_frag_t) +
|
||||
sizeof(mca_btl_mvapi_header_t) +
|
||||
frag->size +
|
||||
sizeof(mca_btl_mvapi_footer_t);
|
||||
@ -1223,12 +1222,20 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma(
|
||||
{
|
||||
mca_btl_mvapi_module_t* mvapi_btl = endpoint->endpoint_btl;
|
||||
char *buf;
|
||||
mca_btl_mvapi_recv_frag_eager_t *headers_buf;
|
||||
unsigned int i;
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
|
||||
if (endpoint->eager_rdma_local.base.pval)
|
||||
goto unlock_rdma_local;
|
||||
|
||||
headers_buf = (mca_btl_mvapi_recv_frag_eager_t*)
|
||||
malloc(sizeof(mca_btl_mvapi_recv_frag_eager_t) *
|
||||
mca_btl_mvapi_component.eager_rdma_num);
|
||||
|
||||
if(NULL == headers_buf)
|
||||
goto unlock_rdma_local;
|
||||
|
||||
buf = mvapi_btl->super.btl_mpool->mpool_alloc(mvapi_btl->super.btl_mpool,
|
||||
mvapi_btl->eager_rdma_frag_size *
|
||||
mca_btl_mvapi_component.eager_rdma_num, 0,
|
||||
@ -1236,12 +1243,13 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma(
|
||||
(mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);
|
||||
|
||||
if(!buf)
|
||||
goto unlock_rdma_local;
|
||||
goto free_headers_buf;
|
||||
|
||||
for(i = 0; i < mca_btl_mvapi_component.eager_rdma_num; i++) {
|
||||
ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf +
|
||||
i*mvapi_btl->eager_rdma_frag_size);
|
||||
item->user_data = (void*)endpoint->eager_rdma_local.reg;
|
||||
ompi_free_list_item_t *item;
|
||||
item = (ompi_free_list_item_t *)&headers_buf[i];
|
||||
item->registration = (void*)endpoint->eager_rdma_local.reg;
|
||||
item->ptr = buf + i * mvapi_btl->eager_rdma_frag_size;
|
||||
OBJ_CONSTRUCT(item, mca_btl_mvapi_recv_frag_eager_t);
|
||||
((mca_btl_mvapi_frag_t*)item)->endpoint = endpoint;
|
||||
((mca_btl_mvapi_frag_t*)item)->type = MCA_BTL_MVAPI_FRAG_EAGER_RDMA;
|
||||
@ -1253,6 +1261,7 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma(
|
||||
goto cleanup;
|
||||
|
||||
endpoint->eager_rdma_local.base.pval = buf;
|
||||
endpoint->eager_rdma_local.frags = headers_buf;
|
||||
mvapi_btl->eager_rdma_buffers_count++;
|
||||
if (mca_btl_mvapi_endpoint_send_eager_rdma(endpoint) == 0) {
|
||||
OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock);
|
||||
@ -1262,6 +1271,7 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma(
|
||||
|
||||
mvapi_btl->eager_rdma_buffers_count--;
|
||||
endpoint->eager_rdma_local.base.pval = NULL;
|
||||
endpoint->eager_rdma_local.frags = NULL;
|
||||
orte_pointer_array_set_item(mvapi_btl->eager_rdma_buffers,
|
||||
endpoint->eager_rdma_index, NULL);
|
||||
|
||||
@ -1269,6 +1279,8 @@ cleanup:
|
||||
OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock);
|
||||
mvapi_btl->super.btl_mpool->mpool_free(mvapi_btl->super.btl_mpool,
|
||||
buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
|
||||
free_headers_buf:
|
||||
free(headers_buf);
|
||||
unlock_rdma_local:
|
||||
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
|
||||
}
|
||||
|
@ -23,8 +23,8 @@
|
||||
static void mca_btl_mvapi_frag_common_constructor( mca_btl_mvapi_frag_t* frag)
|
||||
{
|
||||
mca_btl_mvapi_reg_t* mem_hndl =
|
||||
(mca_btl_mvapi_reg_t*)frag->base.super.user_data;
|
||||
frag->hdr = (mca_btl_mvapi_header_t*) (frag+1); /* initialize btl header to start at end of frag */
|
||||
(mca_btl_mvapi_reg_t*)frag->base.super.registration;
|
||||
frag->hdr = (mca_btl_mvapi_header_t*)frag->base.super.ptr;
|
||||
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_mvapi_header_t);
|
||||
/* init the segment address to start after the btl header */
|
||||
|
||||
|
@ -234,23 +234,21 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl,
|
||||
case MCA_BTL_OPENIB_CONTROL_RDMA:
|
||||
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)ctl_hdr;
|
||||
|
||||
BTL_VERBOSE(("prior to NTOH received rkey %lu, rdma_start.lval %llu, pval %p, ival %u, frag_t_len %llu\n",
|
||||
BTL_VERBOSE(("prior to NTOH received rkey %lu, rdma_start.lval %llu, pval %p, ival %u\n",
|
||||
rdma_hdr->rkey,
|
||||
(unsigned long) rdma_hdr->rdma_start.lval,
|
||||
rdma_hdr->rdma_start.pval,
|
||||
rdma_hdr->rdma_start.ival,
|
||||
(unsigned long) rdma_hdr->frag_t_len
|
||||
rdma_hdr->rdma_start.ival
|
||||
));
|
||||
|
||||
if(endpoint->nbo) {
|
||||
BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH((*rdma_hdr));
|
||||
|
||||
BTL_VERBOSE(("received rkey %lu, rdma_start.lval %llu, pval %p, ival %u, frag_t_len %llu\n",
|
||||
BTL_VERBOSE(("received rkey %lu, rdma_start.lval %llu, pval %p, ival %u\n",
|
||||
rdma_hdr->rkey,
|
||||
(unsigned long) rdma_hdr->rdma_start.lval,
|
||||
rdma_hdr->rdma_start.pval,
|
||||
rdma_hdr->rdma_start.ival,
|
||||
(unsigned long) rdma_hdr->frag_t_len
|
||||
rdma_hdr->rdma_start.ival
|
||||
));
|
||||
|
||||
}
|
||||
@ -261,7 +259,6 @@ static void btl_openib_control(struct mca_btl_base_module_t* btl,
|
||||
}
|
||||
endpoint->eager_rdma_remote.rkey = rdma_hdr->rkey;
|
||||
endpoint->eager_rdma_remote.base.lval = rdma_hdr->rdma_start.lval;
|
||||
endpoint->eager_rdma_remote.frag_t_len = rdma_hdr->frag_t_len;
|
||||
endpoint->eager_rdma_remote.tokens =
|
||||
mca_btl_openib_component.eager_rdma_num - 1;
|
||||
break;
|
||||
@ -699,17 +696,19 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
openib_btl->super.btl_mpool = openib_btl->hca->mpool;
|
||||
|
||||
/* Initialize pool of send fragments */
|
||||
length = sizeof(mca_btl_openib_frag_t) +
|
||||
length = sizeof(mca_btl_openib_send_frag_eager_t) +
|
||||
sizeof(mca_btl_openib_header_t) +
|
||||
sizeof(mca_btl_openib_footer_t) +
|
||||
openib_btl->super.btl_eager_limit;
|
||||
|
||||
openib_btl->eager_rdma_frag_size = OPAL_ALIGN(length,
|
||||
mca_btl_openib_component.buffer_alignment, int);
|
||||
openib_btl->eager_rdma_frag_size = OPAL_ALIGN(
|
||||
sizeof(mca_btl_openib_header_t) +
|
||||
sizeof(mca_btl_openib_footer_t) +
|
||||
openib_btl->super.btl_eager_limit,
|
||||
mca_btl_openib_component.buffer_alignment, size_t);
|
||||
|
||||
ompi_free_list_init_ex(&openib_btl->send_free_eager,
|
||||
length,
|
||||
sizeof(mca_btl_openib_frag_t),
|
||||
mca_btl_openib_component.buffer_alignment,
|
||||
OBJ_CLASS(mca_btl_openib_send_frag_eager_t),
|
||||
mca_btl_openib_component.ib_free_list_num,
|
||||
@ -717,9 +716,13 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
mca_btl_openib_component.ib_free_list_inc,
|
||||
openib_btl->super.btl_mpool);
|
||||
|
||||
length = sizeof(mca_btl_openib_recv_frag_eager_t) +
|
||||
sizeof(mca_btl_openib_header_t) +
|
||||
sizeof(mca_btl_openib_footer_t) +
|
||||
openib_btl->super.btl_eager_limit;
|
||||
|
||||
ompi_free_list_init_ex(&openib_btl->recv_free_eager,
|
||||
length,
|
||||
sizeof(mca_btl_openib_frag_t),
|
||||
mca_btl_openib_component.buffer_alignment,
|
||||
OBJ_CLASS(mca_btl_openib_recv_frag_eager_t),
|
||||
mca_btl_openib_component.ib_free_list_num,
|
||||
@ -727,13 +730,12 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
mca_btl_openib_component.ib_free_list_inc,
|
||||
openib_btl->super.btl_mpool);
|
||||
|
||||
length = sizeof(mca_btl_openib_frag_t) +
|
||||
length = sizeof(mca_btl_openib_send_frag_max_t) +
|
||||
sizeof(mca_btl_openib_header_t) +
|
||||
openib_btl->super.btl_max_send_size;
|
||||
|
||||
ompi_free_list_init_ex(&openib_btl->send_free_max,
|
||||
length,
|
||||
sizeof(mca_btl_openib_frag_t),
|
||||
mca_btl_openib_component.buffer_alignment,
|
||||
OBJ_CLASS(mca_btl_openib_send_frag_max_t),
|
||||
mca_btl_openib_component.ib_free_list_num,
|
||||
@ -741,25 +743,27 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
mca_btl_openib_component.ib_free_list_inc,
|
||||
openib_btl->super.btl_mpool);
|
||||
|
||||
length = sizeof(mca_btl_openib_recv_frag_max_t) +
|
||||
sizeof(mca_btl_openib_header_t) +
|
||||
openib_btl->super.btl_max_send_size;
|
||||
|
||||
/* Initialize pool of receive fragments */
|
||||
ompi_free_list_init_ex(&openib_btl->recv_free_max,
|
||||
length,
|
||||
sizeof(mca_btl_openib_frag_t),
|
||||
mca_btl_openib_component.buffer_alignment,
|
||||
OBJ_CLASS (mca_btl_openib_recv_frag_max_t),
|
||||
OBJ_CLASS(mca_btl_openib_recv_frag_max_t),
|
||||
mca_btl_openib_component.ib_free_list_num,
|
||||
mca_btl_openib_component.ib_free_list_max,
|
||||
mca_btl_openib_component.ib_free_list_inc,
|
||||
openib_btl->super.btl_mpool);
|
||||
|
||||
length = sizeof(mca_btl_openib_frag_t) +
|
||||
length = sizeof(mca_btl_openib_send_frag_control_t) +
|
||||
sizeof(mca_btl_openib_header_t) +
|
||||
sizeof(mca_btl_openib_footer_t) +
|
||||
sizeof(mca_btl_openib_eager_rdma_header_t);
|
||||
|
||||
ompi_free_list_init_ex(&openib_btl->send_free_control,
|
||||
length,
|
||||
sizeof(mca_btl_openib_frag_t),
|
||||
mca_btl_openib_component.buffer_alignment,
|
||||
OBJ_CLASS(mca_btl_openib_send_frag_control_t),
|
||||
mca_btl_openib_component.ib_free_list_num,
|
||||
|
@ -19,6 +19,7 @@ extern "C" {
|
||||
|
||||
struct mca_btl_openib_eager_rdma_local_t {
|
||||
ompi_ptr_t base; /**< buffer for RDMAing eager messages */
|
||||
mca_btl_openib_recv_frag_eager_t *frags;
|
||||
mca_btl_openib_reg_t *reg;
|
||||
uint16_t head; /**< RDMA buffer to poll */
|
||||
uint16_t tail; /**< Needed for credit managment */
|
||||
@ -38,7 +39,6 @@ struct mca_btl_openib_eager_rdma_remote_t {
|
||||
#if OMPI_ENABLE_DEBUG
|
||||
uint32_t seq;
|
||||
#endif
|
||||
uint64_t frag_t_len; /**< remote's sizeof(mca_btl_openib_frag_t) */
|
||||
};
|
||||
typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remote_t;
|
||||
|
||||
@ -72,10 +72,8 @@ typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remo
|
||||
((volatile uint8_t*)(F)->u.buf)[3] = EAGER_RDMA_BUFFER_LOCAL; \
|
||||
}while (0)
|
||||
|
||||
#define MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(E, I) \
|
||||
(mca_btl_openib_frag_t*) \
|
||||
((char*)(E)->eager_rdma_local.base.pval + \
|
||||
(I) * (E)->endpoint_btl->eager_rdma_frag_size)
|
||||
#define MCA_BTL_OPENIB_GET_LOCAL_RDMA_FRAG(E, I) \
|
||||
(&(E)->eager_rdma_local.frags[(I)])
|
||||
|
||||
#define MCA_BTL_OPENIB_RDMA_NEXT_INDEX(I) do { \
|
||||
(I) = ((I) + 1); \
|
||||
|
@ -173,7 +173,6 @@ static inline int mca_btl_openib_endpoint_post_send(mca_btl_openib_module_t* ope
|
||||
endpoint->eager_rdma_remote.base.lval +
|
||||
endpoint->eager_rdma_remote.head *
|
||||
openib_btl->eager_rdma_frag_size +
|
||||
endpoint->eager_rdma_remote.frag_t_len +
|
||||
sizeof(mca_btl_openib_header_t) +
|
||||
mca_btl_openib_component.eager_limit +
|
||||
sizeof(mca_btl_openib_footer_t);
|
||||
@ -1168,7 +1167,6 @@ static int mca_btl_openib_endpoint_send_eager_rdma(
|
||||
rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)frag->segment.seg_addr.pval;
|
||||
rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
|
||||
rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
|
||||
rdma_hdr->frag_t_len = sizeof(mca_btl_openib_frag_t);
|
||||
rdma_hdr->rdma_start.lval = ompi_ptr_ptol(endpoint->eager_rdma_local.base.pval);
|
||||
BTL_VERBOSE(("sending rkey %lu, rdma_start.lval %llu, pval %p, ival %u type %d and sizeof(rdma_hdr) %d\n",
|
||||
rdma_hdr->rkey,
|
||||
@ -1204,6 +1202,7 @@ void mca_btl_openib_endpoint_connect_eager_rdma(
|
||||
{
|
||||
mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
|
||||
char *buf;
|
||||
mca_btl_openib_recv_frag_eager_t *headers_buf;
|
||||
unsigned int i;
|
||||
orte_std_cntr_t index;
|
||||
|
||||
@ -1213,6 +1212,13 @@ void mca_btl_openib_endpoint_connect_eager_rdma(
|
||||
(void*)1))
|
||||
return;
|
||||
|
||||
headers_buf = (mca_btl_openib_recv_frag_eager_t*)
|
||||
malloc(sizeof(mca_btl_openib_recv_frag_eager_t) *
|
||||
mca_btl_openib_component.eager_rdma_num);
|
||||
|
||||
if(NULL == headers_buf)
|
||||
goto unlock_rdma_local;
|
||||
|
||||
buf = openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
|
||||
openib_btl->eager_rdma_frag_size *
|
||||
mca_btl_openib_component.eager_rdma_num,
|
||||
@ -1221,22 +1227,24 @@ void mca_btl_openib_endpoint_connect_eager_rdma(
|
||||
(mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);
|
||||
|
||||
if(!buf)
|
||||
goto unlock_rdma_local;
|
||||
goto free_headers_buf;
|
||||
|
||||
buf = buf + openib_btl->eager_rdma_frag_size -
|
||||
sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit -
|
||||
sizeof(mca_btl_openib_header_t) -
|
||||
sizeof(mca_btl_openib_recv_frag_eager_t);
|
||||
sizeof(mca_btl_openib_header_t);
|
||||
|
||||
for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
|
||||
ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf +
|
||||
i*openib_btl->eager_rdma_frag_size);
|
||||
item->user_data = (void*)endpoint->eager_rdma_local.reg;
|
||||
ompi_free_list_item_t *item;
|
||||
item = (ompi_free_list_item_t*)&headers_buf[i];
|
||||
item->registration = (void*)endpoint->eager_rdma_local.reg;
|
||||
item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
|
||||
OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_eager_t);
|
||||
((mca_btl_openib_frag_t*)item)->endpoint = endpoint;
|
||||
((mca_btl_openib_frag_t*)item)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
|
||||
}
|
||||
|
||||
|
||||
endpoint->eager_rdma_local.frags = headers_buf;
|
||||
|
||||
/* set local rdma pointer to real value */
|
||||
opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1,
|
||||
buf);
|
||||
@ -1253,8 +1261,11 @@ void mca_btl_openib_endpoint_connect_eager_rdma(
|
||||
|
||||
openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
|
||||
buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
|
||||
free_headers_buf:
|
||||
free(headers_buf);
|
||||
unlock_rdma_local:
|
||||
/* set local rdma pointer back to zero. Will retry later */
|
||||
opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
|
||||
endpoint->eager_rdma_local.base.pval, NULL);
|
||||
endpoint->eager_rdma_local.frags = NULL;
|
||||
}
|
||||
|
@ -23,9 +23,9 @@
|
||||
static void mca_btl_openib_frag_common_constructor( mca_btl_openib_frag_t* frag)
|
||||
{
|
||||
mca_btl_openib_reg_t* registration =
|
||||
(mca_btl_openib_reg_t*)frag->base.super.user_data;
|
||||
(mca_btl_openib_reg_t*)frag->base.super.registration;
|
||||
|
||||
frag->hdr = (mca_btl_openib_header_t*) (frag+1); /* initialize the btl header to start at end of frag */
|
||||
frag->hdr = (mca_btl_openib_header_t*)frag->base.super.ptr;
|
||||
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);
|
||||
/* init the segment address to start after the btl header */
|
||||
|
||||
|
@ -113,7 +113,6 @@ struct mca_btl_openib_eager_rdma_header_t {
|
||||
uint8_t padding[3];
|
||||
uint32_t rkey;
|
||||
ompi_ptr_t rdma_start;
|
||||
uint64_t frag_t_len;
|
||||
};
|
||||
typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_header_t;
|
||||
|
||||
@ -121,14 +120,12 @@ typedef struct mca_btl_openib_eager_rdma_header_t mca_btl_openib_eager_rdma_head
|
||||
do { \
|
||||
h.rkey = htonl(h.rkey); \
|
||||
h.rdma_start.lval = hton64(h.rdma_start.lval); \
|
||||
h.frag_t_len = hton64(h.frag_t_len); \
|
||||
} while (0)
|
||||
|
||||
#define BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_NTOH(h) \
|
||||
do { \
|
||||
h.rkey = ntohl(h.rkey); \
|
||||
h.rdma_start.lval = ntoh64(h.rdma_start.lval); \
|
||||
h.frag_t_len = ntoh64(h.frag_t_len); \
|
||||
} while (0)
|
||||
|
||||
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_mca.h"
|
||||
@ -337,9 +338,15 @@ int btl_openib_register_mca_params(void)
|
||||
|
||||
CHECK(reg_int("buffer_alignment",
|
||||
"Prefered communication buffer alignment, in bytes "
|
||||
"(must be >= 0)",
|
||||
"(must be > 0 and power of two)",
|
||||
64, &ival, REGINT_GE_ZERO));
|
||||
mca_btl_openib_component.buffer_alignment = (uint32_t) ival;
|
||||
if(ival <= 1 || (ival & (ival - 1))) {
|
||||
opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment",
|
||||
true, ival, orte_system_info.nodename, 64);
|
||||
mca_btl_openib_component.buffer_alignment = 64;
|
||||
} else {
|
||||
mca_btl_openib_component.buffer_alignment = (uint32_t) ival;
|
||||
}
|
||||
|
||||
CHECK(reg_int("eager_limit", "Eager send limit, in bytes "
|
||||
"(must be >= 1)",
|
||||
|
@ -169,3 +169,6 @@ Please see this FAQ entry for more details:
|
||||
|
||||
NOTE: You can turn off this warning by setting the MCA parameter
|
||||
btl_openib_warn_default_gid_prefix to 0.
|
||||
[wrong buffer alignment]
|
||||
Wrong buffer alignment %d configured on host '%s'. Should be bigger
|
||||
than zero and power of two. Use default %d instead.
|
||||
|
@ -542,7 +542,8 @@ int mca_btl_sm_add_procs_same_base_addr(
|
||||
/* initialize fragment descriptor free lists */
|
||||
|
||||
/* allocation will be for the fragment descriptor and payload buffer */
|
||||
length=sizeof(mca_btl_sm_frag_t) + mca_btl_sm_component.eager_limit;
|
||||
length = sizeof(mca_btl_sm_frag1_t) + sizeof(mca_btl_sm_hdr_t) +
|
||||
mca_btl_sm_component.eager_limit;
|
||||
ompi_free_list_init(&mca_btl_sm_component.sm_frags1, length,
|
||||
OBJ_CLASS(mca_btl_sm_frag1_t),
|
||||
mca_btl_sm_component.sm_free_list_num,
|
||||
@ -550,7 +551,8 @@ int mca_btl_sm_add_procs_same_base_addr(
|
||||
mca_btl_sm_component.sm_free_list_inc,
|
||||
mca_btl_sm_component.sm_mpool); /* use shared-memory pool */
|
||||
|
||||
length=sizeof(mca_btl_sm_frag_t) + mca_btl_sm_component.max_frag_size;
|
||||
length = sizeof(mca_btl_sm_frag2_t) + sizeof(mca_btl_sm_hdr_t) +
|
||||
mca_btl_sm_component.max_frag_size;
|
||||
ompi_free_list_init(&mca_btl_sm_component.sm_frags2, length,
|
||||
OBJ_CLASS(mca_btl_sm_frag2_t),
|
||||
mca_btl_sm_component.sm_free_list_num,
|
||||
@ -558,6 +560,14 @@ int mca_btl_sm_add_procs_same_base_addr(
|
||||
mca_btl_sm_component.sm_free_list_inc,
|
||||
mca_btl_sm_component.sm_mpool); /* use shared-memory pool */
|
||||
|
||||
ompi_free_list_init(&mca_btl_sm_component.sm_frags,
|
||||
sizeof(mca_btl_sm_frag_t),
|
||||
OBJ_CLASS(mca_btl_sm_frag_t),
|
||||
mca_btl_sm_component.sm_free_list_num,
|
||||
-1,
|
||||
mca_btl_sm_component.sm_free_list_inc,
|
||||
NULL);
|
||||
|
||||
/* set up mca_btl_sm_component.list_smp_procs_same_base_addr */
|
||||
mca_btl_sm_component.list_smp_procs_same_base_addr=(int *)
|
||||
malloc(mca_btl_sm_component.sm_max_procs*sizeof(int));
|
||||
@ -854,7 +864,9 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src(
|
||||
max_data = frag->size - reserve;
|
||||
}
|
||||
iov.iov_len = max_data;
|
||||
iov.iov_base = (IOVBASE_TYPE*)(((unsigned char*)(frag+1)) + reserve);
|
||||
iov.iov_base =
|
||||
(IOVBASE_TYPE*)(((unsigned char*)(frag->segment.seg_addr.pval)) +
|
||||
reserve);
|
||||
|
||||
rc = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data );
|
||||
if(rc < 0) {
|
||||
@ -883,16 +895,14 @@ int mca_btl_sm_send(
|
||||
mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)descriptor;
|
||||
int rc;
|
||||
|
||||
frag->tag = tag;
|
||||
frag->type = MCA_BTL_SM_FRAG_SEND;
|
||||
frag->rc = OMPI_SUCCESS;
|
||||
frag->hdr->u.s.len = frag->segment.seg_len;
|
||||
frag->hdr->u.s.tag = tag;
|
||||
frag->hdr->type = MCA_BTL_SM_FRAG_SEND;
|
||||
|
||||
/*
|
||||
* post the descriptor in the queue - post with the relative
|
||||
* address
|
||||
*/
|
||||
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank, endpoint->peer_smp_rank, frag, rc);
|
||||
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank, endpoint->peer_smp_rank, frag->hdr, rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
|
@ -129,6 +129,7 @@ struct mca_btl_sm_component_t {
|
||||
* SMP specific data structures. */
|
||||
ompi_free_list_t sm_frags1; /**< free list of sm first */
|
||||
ompi_free_list_t sm_frags2; /**< free list of sm second */
|
||||
ompi_free_list_t sm_frags; /**< free list of frags without data */
|
||||
ompi_free_list_t sm_first_frags_to_progress; /**< list of first
|
||||
fragments that are
|
||||
awaiting resources */
|
||||
|
@ -165,6 +165,7 @@ int mca_btl_sm_component_open(void)
|
||||
|
||||
/* initialize objects */
|
||||
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_lock, opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags1, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags2, ompi_free_list_t);
|
||||
return OMPI_SUCCESS;
|
||||
@ -342,6 +343,7 @@ int mca_btl_sm_component_progress(void)
|
||||
unsigned int peer_smp_rank ;
|
||||
mca_btl_sm_frag_t *frag;
|
||||
ompi_fifo_t *fifo = NULL;
|
||||
mca_btl_sm_hdr_t *hdr;
|
||||
int my_smp_rank=mca_btl_sm_component.my_smp_rank;
|
||||
int proc;
|
||||
int rc = 0, btl = 0;
|
||||
@ -377,7 +379,7 @@ int mca_btl_sm_component_progress(void)
|
||||
* that we have the same base address as the sender, so no
|
||||
* translation is necessary when accessing the fifo. Hence,
|
||||
* we use the _same_base_addr variant. */
|
||||
frag = (mca_btl_sm_frag_t *)
|
||||
hdr = (mca_btl_sm_hdr_t *)
|
||||
ompi_fifo_read_from_tail_same_base_addr( fifo );
|
||||
|
||||
/* release thread lock */
|
||||
@ -385,26 +387,33 @@ int mca_btl_sm_component_progress(void)
|
||||
opal_atomic_unlock(&(fifo->tail_lock));
|
||||
}
|
||||
|
||||
if( OMPI_CB_FREE == frag ) {
|
||||
if( OMPI_CB_FREE == hdr ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* dispatch fragment by type */
|
||||
switch(frag->type) {
|
||||
switch(hdr->type) {
|
||||
case MCA_BTL_SM_FRAG_ACK:
|
||||
{
|
||||
frag = hdr->frag;
|
||||
/* completion callback */
|
||||
frag->base.des_cbfunc(&mca_btl_sm[0].super, frag->endpoint, &frag->base, frag->rc);
|
||||
frag->base.des_cbfunc(&mca_btl_sm[0].super, frag->endpoint, &frag->base, hdr->u.rc);
|
||||
break;
|
||||
}
|
||||
case MCA_BTL_SM_FRAG_SEND:
|
||||
{
|
||||
/* recv upcall */
|
||||
mca_btl_sm_recv_reg_t* reg = mca_btl_sm[0].sm_reg + frag->tag;
|
||||
reg->cbfunc(&mca_btl_sm[0].super,frag->tag,&frag->base,reg->cbdata);
|
||||
frag->type = MCA_BTL_SM_FRAG_ACK;
|
||||
mca_btl_sm_recv_reg_t* reg = mca_btl_sm[0].sm_reg + hdr->u.s.tag;
|
||||
MCA_BTL_SM_FRAG_ALLOC(frag, rc);
|
||||
frag->segment.seg_addr.pval = ((char*)hdr) +
|
||||
sizeof(mca_btl_sm_hdr_t);
|
||||
frag->segment.seg_len = hdr->u.s.len;
|
||||
reg->cbfunc(&mca_btl_sm[0].super,hdr->u.s.tag,&frag->base,reg->cbdata);
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
hdr->type = MCA_BTL_SM_FRAG_ACK;
|
||||
hdr->u.rc = OMPI_SUCCESS;
|
||||
MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank],
|
||||
my_smp_rank, peer_smp_rank, frag, rc );
|
||||
my_smp_rank, peer_smp_rank, hdr, rc );
|
||||
if(OMPI_SUCCESS != rc)
|
||||
goto err;
|
||||
break;
|
||||
@ -412,10 +421,10 @@ int mca_btl_sm_component_progress(void)
|
||||
default:
|
||||
{
|
||||
/* unknown */
|
||||
frag->rc = OMPI_ERROR;
|
||||
frag->type = MCA_BTL_SM_FRAG_ACK;
|
||||
hdr->u.rc = OMPI_ERROR;
|
||||
hdr->type = MCA_BTL_SM_FRAG_ACK;
|
||||
MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank],
|
||||
my_smp_rank, peer_smp_rank, frag, rc );
|
||||
my_smp_rank, peer_smp_rank, hdr, rc );
|
||||
if(OMPI_SUCCESS != rc)
|
||||
goto err;
|
||||
break;
|
||||
@ -450,9 +459,9 @@ int mca_btl_sm_component_progress(void)
|
||||
* translate every access into the fifo to be relevant to our
|
||||
* memory space. Hence, we do *not* use the _same_base_addr
|
||||
* variant. */
|
||||
frag=(mca_btl_sm_frag_t *)ompi_fifo_read_from_tail( fifo,
|
||||
hdr=(mca_btl_sm_hdr_t *)ompi_fifo_read_from_tail( fifo,
|
||||
mca_btl_sm_component.sm_offset[peer_smp_rank]);
|
||||
if( OMPI_CB_FREE == frag ) {
|
||||
if( OMPI_CB_FREE == hdr ) {
|
||||
/* release thread lock */
|
||||
if( opal_using_threads() ) {
|
||||
opal_atomic_unlock(&(fifo->tail_lock));
|
||||
@ -467,37 +476,32 @@ int mca_btl_sm_component_progress(void)
|
||||
|
||||
/* change the address from address relative to the shared
|
||||
* memory address, to a true virtual address */
|
||||
frag = (mca_btl_sm_frag_t *)( (char *)frag +
|
||||
hdr = (mca_btl_sm_hdr_t *)( (char *)hdr +
|
||||
mca_btl_sm_component.sm_offset[peer_smp_rank]);
|
||||
|
||||
/* dispatch fragment by type */
|
||||
switch(frag->type) {
|
||||
switch(hdr->type) {
|
||||
case MCA_BTL_SM_FRAG_ACK:
|
||||
{
|
||||
frag = hdr->frag;
|
||||
/* completion callback */
|
||||
frag->base.des_src =
|
||||
( mca_btl_base_segment_t* )((ptrdiff_t)frag->base.des_dst + mca_btl_sm_component.sm_offset[peer_smp_rank]);
|
||||
frag->base.des_src->seg_addr.pval = (void*)
|
||||
((ptrdiff_t)frag->base.des_src->seg_addr.pval +
|
||||
mca_btl_sm_component.sm_offset[peer_smp_rank]);
|
||||
frag->base.des_dst = frag->base.des_src;
|
||||
frag->base.des_cbfunc(&mca_btl_sm[1].super, frag->endpoint, &frag->base, frag->rc);
|
||||
frag->base.des_cbfunc(&mca_btl_sm[1].super, frag->endpoint, &frag->base, hdr->u.rc);
|
||||
break;
|
||||
}
|
||||
case MCA_BTL_SM_FRAG_SEND:
|
||||
{
|
||||
/* recv upcall */
|
||||
mca_btl_sm_recv_reg_t* reg = mca_btl_sm[1].sm_reg + frag->tag;
|
||||
frag->base.des_dst = (mca_btl_base_segment_t*)
|
||||
((ptrdiff_t)frag->base.des_src + mca_btl_sm_component.sm_offset[peer_smp_rank]);
|
||||
frag->base.des_dst->seg_addr.pval = (void*)
|
||||
((ptrdiff_t)frag->base.des_dst->seg_addr.pval +
|
||||
mca_btl_sm_component.sm_offset[peer_smp_rank]);
|
||||
frag->base.des_src = frag->base.des_dst;
|
||||
reg->cbfunc(&mca_btl_sm[1].super,frag->tag,&frag->base,reg->cbdata);
|
||||
frag->type = MCA_BTL_SM_FRAG_ACK;
|
||||
mca_btl_sm_recv_reg_t* reg = mca_btl_sm[1].sm_reg + hdr->u.s.tag;
|
||||
MCA_BTL_SM_FRAG_ALLOC(frag, rc);
|
||||
frag->segment.seg_addr.pval = ((char*)hdr) +
|
||||
sizeof(mca_btl_sm_hdr_t);
|
||||
frag->segment.seg_len = hdr->u.s.len;
|
||||
reg->cbfunc(&mca_btl_sm[1].super,hdr->u.s.tag,&frag->base,reg->cbdata);
|
||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||
hdr->type = MCA_BTL_SM_FRAG_ACK;
|
||||
hdr->u.rc = OMPI_SUCCESS;
|
||||
MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank],
|
||||
my_smp_rank, peer_smp_rank, frag, rc );
|
||||
my_smp_rank, peer_smp_rank, hdr, rc );
|
||||
if(OMPI_SUCCESS != rc)
|
||||
goto err;
|
||||
break;
|
||||
@ -505,10 +509,10 @@ int mca_btl_sm_component_progress(void)
|
||||
default:
|
||||
{
|
||||
/* unknown */
|
||||
frag->rc = OMPI_ERROR;
|
||||
frag->type = MCA_BTL_SM_FRAG_ACK;
|
||||
hdr->u.rc = OMPI_ERROR;
|
||||
hdr->type = MCA_BTL_SM_FRAG_ACK;
|
||||
MCA_BTL_SM_FIFO_WRITE( mca_btl_sm_component.sm_peers[peer_smp_rank],
|
||||
my_smp_rank, peer_smp_rank, frag, rc );
|
||||
my_smp_rank, peer_smp_rank, hdr, rc );
|
||||
if(OMPI_SUCCESS != rc)
|
||||
goto err;
|
||||
break;
|
||||
|
@ -4,7 +4,7 @@
|
||||
#include "btl_sm.h"
|
||||
#include "btl_sm_endpoint.h"
|
||||
|
||||
#define MCA_BTL_SM_FIFO_WRITE(endpoint_peer, my_smp_rank,peer_smp_rank,frag,rc) \
|
||||
#define MCA_BTL_SM_FIFO_WRITE(endpoint_peer, my_smp_rank,peer_smp_rank,hdr,rc) \
|
||||
do { \
|
||||
ompi_fifo_t* fifo; \
|
||||
fifo=&(mca_btl_sm_component.fifo[my_smp_rank][peer_smp_rank]); \
|
||||
@ -29,7 +29,7 @@ do { \
|
||||
} \
|
||||
\
|
||||
/* post fragment */ \
|
||||
while(ompi_fifo_write_to_head_same_base_addr(frag, fifo, \
|
||||
while(ompi_fifo_write_to_head_same_base_addr(hdr, fifo, \
|
||||
mca_btl_sm_component.sm_mpool) != OMPI_SUCCESS) \
|
||||
opal_progress(); \
|
||||
MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \
|
||||
|
@ -19,9 +19,12 @@
|
||||
#include "btl_sm_frag.h"
|
||||
|
||||
|
||||
static inline void mca_btl_sm_frag_constructor(mca_btl_sm_frag_t* frag)
|
||||
static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag)
|
||||
{
|
||||
frag->segment.seg_addr.pval = frag+1;
|
||||
frag->hdr = frag->base.super.ptr;
|
||||
if(frag->hdr != NULL)
|
||||
frag->hdr->frag = frag;
|
||||
frag->segment.seg_addr.pval = ((char*)frag->hdr) + sizeof(mca_btl_sm_hdr_t);
|
||||
frag->segment.seg_len = frag->size;
|
||||
frag->base.des_src = &frag->segment;
|
||||
frag->base.des_src_cnt = 1;
|
||||
@ -30,18 +33,25 @@ static inline void mca_btl_sm_frag_constructor(mca_btl_sm_frag_t* frag)
|
||||
frag->base.des_flags = 0;
|
||||
}
|
||||
|
||||
/**
 * Constructor for fragments that belong to the sm_frags free list.
 * Records a payload size of zero and the owning free list, then runs
 * the initialization shared by all fragment classes.
 *
 * @param frag  fragment being constructed
 */
static void mca_btl_sm_frag_constructor(mca_btl_sm_frag_t* frag)
|
||||
{
|
||||
frag->size = 0; /* this fragment class reserves no payload space */
|
||||
frag->my_list = &mca_btl_sm_component.sm_frags;
|
||||
mca_btl_sm_frag_common_constructor(frag);
|
||||
}
|
||||
|
||||
static void mca_btl_sm_frag1_constructor(mca_btl_sm_frag_t* frag)
|
||||
{
|
||||
frag->size = mca_btl_sm_component.eager_limit;
|
||||
frag->my_list = &mca_btl_sm_component.sm_frags1;
|
||||
mca_btl_sm_frag_constructor(frag);
|
||||
mca_btl_sm_frag_common_constructor(frag);
|
||||
}
|
||||
|
||||
static void mca_btl_sm_frag2_constructor(mca_btl_sm_frag_t* frag)
|
||||
{
|
||||
frag->size = mca_btl_sm_component.max_frag_size;
|
||||
frag->my_list = &mca_btl_sm_component.sm_frags2;
|
||||
mca_btl_sm_frag_constructor(frag);
|
||||
mca_btl_sm_frag_common_constructor(frag);
|
||||
}
|
||||
|
||||
static void mca_btl_sm_frag_destructor(mca_btl_sm_frag_t* frag)
|
||||
|
@ -27,13 +27,31 @@
|
||||
#include "btl_sm.h"
|
||||
|
||||
|
||||
typedef enum {
|
||||
/*typedef enum {
|
||||
MCA_BTL_SM_FRAG_SEND,
|
||||
MCA_BTL_SM_FRAG_PUT,
|
||||
MCA_BTL_SM_FRAG_GET,
|
||||
MCA_BTL_SM_FRAG_ACK
|
||||
} mca_btl_sm_frag_type_t;
|
||||
} mca_btl_sm_frag_type_t; */
|
||||
|
||||
#define MCA_BTL_SM_FRAG_SEND 0
|
||||
#define MCA_BTL_SM_FRAG_ACK 1
|
||||
|
||||
typedef uint8_t mca_btl_sm_frag_type_t;
|
||||
struct mca_btl_sm_frag_t;
|
||||
|
||||
/**
 * Header that is written to the FIFO in place of the fragment itself.
 * The fragment constructor points the segment address at the bytes
 * immediately following this header, so the payload sits right after it.
 * The union member in use depends on the type field: u.s is filled in
 * when the header is posted as MCA_BTL_SM_FRAG_SEND, u.rc when it is
 * posted back as MCA_BTL_SM_FRAG_ACK.
 */
struct mca_btl_sm_hdr_t {
|
||||
struct mca_btl_sm_frag_t *frag; /* back pointer to the fragment that owns this header */
|
||||
union {
|
||||
struct {
|
||||
size_t len; /* payload length, copied from the fragment's seg_len on send */
|
||||
mca_btl_base_tag_t tag; /* tag used by the receiver to look up the registered callback */
|
||||
} s;
|
||||
int rc; /* status handed to the completion callback when the ACK is processed */
|
||||
} u;
|
||||
mca_btl_sm_frag_type_t type; /* MCA_BTL_SM_FRAG_SEND or MCA_BTL_SM_FRAG_ACK */
|
||||
};
|
||||
typedef struct mca_btl_sm_hdr_t mca_btl_sm_hdr_t;
|
||||
|
||||
/**
|
||||
* shared memory send fragment derived type.
|
||||
@ -42,10 +60,8 @@ struct mca_btl_sm_frag_t {
|
||||
mca_btl_base_descriptor_t base;
|
||||
mca_btl_base_segment_t segment;
|
||||
struct mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_sm_frag_type_t type;
|
||||
mca_btl_base_tag_t tag;
|
||||
size_t size;
|
||||
int rc;
|
||||
mca_btl_sm_hdr_t *hdr;
|
||||
ompi_free_list_t* my_list;
|
||||
};
|
||||
typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t;
|
||||
@ -56,6 +72,13 @@ OBJ_CLASS_DECLARATION(mca_btl_sm_frag_t);
|
||||
OBJ_CLASS_DECLARATION(mca_btl_sm_frag1_t);
|
||||
OBJ_CLASS_DECLARATION(mca_btl_sm_frag2_t);
|
||||
|
||||
/**
 * Take an item from the sm_frags free list with OMPI_FREE_LIST_WAIT and
 * store it in frag, cast to mca_btl_sm_frag_t*.
 *
 * @param frag  lvalue that receives the fragment pointer
 * @param rc    passed through to OMPI_FREE_LIST_WAIT; presumably receives
 *              its status code -- TODO confirm against ompi_free_list.h
 */
#define MCA_BTL_SM_FRAG_ALLOC(frag, rc) \
|
||||
{ \
|
||||
ompi_free_list_item_t* item; \
|
||||
OMPI_FREE_LIST_WAIT(&mca_btl_sm_component.sm_frags, item, rc); \
|
||||
frag = (mca_btl_sm_frag_t*)item; \
|
||||
}
|
||||
|
||||
#define MCA_BTL_SM_FRAG_ALLOC1(frag, rc) \
|
||||
{ \
|
||||
ompi_free_list_item_t* item; \
|
||||
|
@ -1127,7 +1127,8 @@ void mca_btl_udapl_endpoint_connect_eager_rdma(
|
||||
mca_btl_udapl_frag_eager_rdma_t* local_rdma_frag;
|
||||
ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf +
|
||||
i*mca_btl_udapl_component.udapl_eager_rdma_frag_size);
|
||||
item->user_data = endpoint->endpoint_eager_rdma_local.reg;
|
||||
item->registration = (void*)endpoint->endpoint_eager_rdma_local.reg;
|
||||
item->ptr = buf + i * mca_btl_udapl_component.udapl_eager_rdma_frag_size;
|
||||
OBJ_CONSTRUCT(item, mca_btl_udapl_frag_eager_rdma_t);
|
||||
|
||||
local_rdma_frag = ((mca_btl_udapl_frag_eager_rdma_t*)item);
|
||||
|
@ -26,7 +26,8 @@
|
||||
|
||||
static void mca_btl_udapl_frag_common_constructor(mca_btl_udapl_frag_t* frag)
|
||||
{
|
||||
mca_btl_udapl_reg_t* reg = (mca_btl_udapl_reg_t*)frag->base.super.user_data;
|
||||
mca_btl_udapl_reg_t* reg =
|
||||
(mca_btl_udapl_reg_t*)frag->base.super.registration;
|
||||
|
||||
#if OMPI_ENABLE_DEBUG
|
||||
frag->base.des_src = NULL;
|
||||
@ -37,7 +38,7 @@ static void mca_btl_udapl_frag_common_constructor(mca_btl_udapl_frag_t* frag)
|
||||
#endif
|
||||
|
||||
frag->registration = reg;
|
||||
frag->segment.seg_addr.pval = (unsigned char*)(frag + 1);
|
||||
frag->segment.seg_addr.pval = (unsigned char*)frag->base.super.ptr;
|
||||
frag->ftr = NULL;
|
||||
|
||||
/* Don't understand why yet, but there are cases where reg is NULL -
|
||||
|
@ -22,5 +22,6 @@
|
||||
#define OPAL_ALIGN_H
|
||||
|
||||
/* Round x up to a multiple of a, doing the arithmetic in integer type t.
 * The result is computed with a bit mask of (a - 1), so it is a multiple
 * of a only when a is a power of two. */
#define OPAL_ALIGN(x,a,t) (((x)+((t)(a)-1)) & ~(((t)(a)-1)))
|
||||
/* Round pointer x up to the next a-byte boundary and yield it as pointer
 * type t.  The argument is parenthesized before the cast so that an
 * expression such as `p + 1` is evaluated with pointer arithmetic first;
 * without the parentheses the cast would bind to `p` alone and the offset
 * would be added in bytes. */
#define OPAL_ALIGN_PTR(x,a,t) ((t)OPAL_ALIGN((uintptr_t)(x), a, uintptr_t))
|
||||
|
||||
#endif /* OPAL_ALIGN_H */
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user