diff --git a/src/mca/bmi/ib/bmi_ib.c b/src/mca/bmi/ib/bmi_ib.c index a959a8ff96..8d11a05576 100644 --- a/src/mca/bmi/ib/bmi_ib.c +++ b/src/mca/bmi/ib/bmi_ib.c @@ -338,11 +338,6 @@ mca_bmi_base_descriptor_t* mca_bmi_ib_prepare_src( 0, registration); - /* frag->ret = VAPI_deregister_mr( */ -/* ib_bmi->nic, */ -/* registration->hndl */ -/* ); */ - mca_mpool_base_remove((void*) registration->base); ib_bmi->ib_pool->mpool_register(ib_bmi->ib_pool, @@ -359,7 +354,7 @@ mca_bmi_base_descriptor_t* mca_bmi_ib_prepare_src( max_data, &registration); - if(frag->base.des_flags && MCA_BMI_DES_FLAGS_LEAVE_PINNED) { + if(mca_bmi_ib_component.leave_pinned) { rc = mca_mpool_base_insert(iov.iov_base, iov.iov_len, ib_bmi->ib_pool, @@ -421,6 +416,7 @@ mca_bmi_base_descriptor_t* mca_bmi_ib_prepare_dst( frag->segment.seg_len = *size; frag->segment.seg_addr.pval = convertor->pBaseBuf + convertor->bConverted; if(NULL!= registration){ + reg_len = (unsigned char*)registration->bound - (unsigned char*)frag->segment.seg_addr.pval + 1; if(frag->segment.seg_len > reg_len) { ib_bmi->ib_pool->mpool_deregister( ib_bmi->ib_pool, @@ -428,11 +424,6 @@ mca_bmi_base_descriptor_t* mca_bmi_ib_prepare_dst( 0, registration); -/* frag->ret = VAPI_deregister_mr( */ -/* ib_bmi->nic, */ -/* registration->hndl */ -/* ); */ - mca_mpool_base_remove((void*) registration->base); ib_bmi->ib_pool->mpool_register(ib_bmi->ib_pool, @@ -449,7 +440,7 @@ mca_bmi_base_descriptor_t* mca_bmi_ib_prepare_dst( *size, &registration); - if(frag->base.des_flags && MCA_BMI_DES_FLAGS_LEAVE_PINNED) { + if(mca_bmi_ib_component.leave_pinned) { rc = mca_mpool_base_insert(frag->segment.seg_addr.pval, *size, ib_bmi->ib_pool, diff --git a/src/mca/bmi/ib/bmi_ib_component.c b/src/mca/bmi/ib/bmi_ib_component.c index aa513d17c9..c75439f99c 100644 --- a/src/mca/bmi/ib/bmi_ib_component.c +++ b/src/mca/bmi/ib/bmi_ib_component.c @@ -101,6 +101,7 @@ static inline int mca_bmi_ib_param_register_int( int mca_bmi_ib_component_open(void) { + int param, value; /* 
initialize state */ mca_bmi_ib_component.ib_num_bmis=0; @@ -129,15 +130,13 @@ int mca_bmi_ib_component_open(void) mca_bmi_ib_param_register_int ("exclusivity", 0); mca_bmi_ib_module.super.bmi_eager_limit = mca_bmi_ib_param_register_int ("eager_limit", - (64*1024 - - sizeof(mca_bmi_ib_header_t))); + (64*1024)) - sizeof(mca_bmi_ib_header_t); mca_bmi_ib_module.super.bmi_min_send_size = mca_bmi_ib_param_register_int ("min_send_size", - (64*1024 - - sizeof(mca_bmi_ib_header_t))); + (64*1024))- sizeof(mca_bmi_ib_header_t); mca_bmi_ib_module.super.bmi_max_send_size = - mca_bmi_ib_param_register_int ("max_send_size", 128*1024 - - sizeof(mca_bmi_ib_header_t)); + mca_bmi_ib_param_register_int ("max_send_size", (128*1024)) - sizeof(mca_bmi_ib_header_t); + mca_bmi_ib_module.ib_pin_min = mca_bmi_ib_param_register_int("ib_pin_min", 128*1024); mca_bmi_ib_module.ib_cq_size = @@ -195,10 +194,10 @@ int mca_bmi_ib_component_open(void) mca_bmi_ib_module.super.bmi_flags = mca_bmi_ib_param_register_int("flags", MCA_BMI_FLAGS_RDMA); - - mca_bmi_ib_component.leave_pinned = - mca_bmi_ib_param_register_int("leave_pinned", - 0); + + param = mca_base_param_find("mpi", NULL, "leave_pinned"); + mca_base_param_lookup_int(param, &value); + mca_bmi_ib_component.leave_pinned = value; diff --git a/src/mca/mpool/base/mpool_base_alloc.c b/src/mca/mpool/base/mpool_base_alloc.c index 97b97f983c..40427f5383 100644 --- a/src/mca/mpool/base/mpool_base_alloc.c +++ b/src/mca/mpool/base/mpool_base_alloc.c @@ -378,41 +378,6 @@ int mca_mpool_base_free(void * base) struct mca_mpool_base_chunk_t * mca_mpool_base_find(void * base) { return (mca_mpool_base_chunk_t *) - ompi_rb_tree_find(&mca_mpool_base_tree, &base); + ompi_rb_tree_find(&mca_mpool_base_tree, &base); } - -/* int mca_bmi_ib_tree_node_compare_range(void * key1, void * key2) */ -/* { */ -/* if(((mca_mpool_base_key_t *) key1)->bottom < */ -/* ((mca_mpool_base_key_t *) key2)->bottom) */ -/* { */ -/* return -1; */ -/* } */ -/* else 
if((((mca_mpool_base_key_t *) key1)->bottom + ((mca_mpool_base_key_t *) key1)->length) > */ -/* ((mca_mpool_base_key_t *) key2)->top) */ -/* { */ -/* return 1; */ -/* } */ -/* else */ -/* { */ -/* return 0; */ -/* } */ -/* } */ - - -/* /\** */ -/* * Searches the mpool to see if it has allocated the memory that is passed in. */ -/* * If so it returns an array of mpools the memory is registered with. */ -/* * */ -/* * @param base pointer to the memory to lookup */ -/* * */ -/* * @retval NULL if the memory is not in any mpool */ -/* * @retval pointer to an array of type mca_mpool_base_reg_mpool_t */ -/* *\/ */ -/* struct mca_mpool_base_chunk_t * mca_mpool_base_find_range(void * base) */ -/* { */ -/* return (mca_mpool_base_chunk_t *) */ -/* ompi_rb_tree_find(&mca_mpool_base_tree, &base); */ -/* } */ - diff --git a/src/mca/pml/ob1/pml_ob1.h b/src/mca/pml/ob1/pml_ob1.h index bbe322077e..2b41631e31 100644 --- a/src/mca/pml/ob1/pml_ob1.h +++ b/src/mca/pml/ob1/pml_ob1.h @@ -60,6 +60,7 @@ struct mca_pml_ob1_t { size_t rdma_offset; /* offset at which we attempt to initiate rdma */ size_t send_pipeline_depth; size_t recv_pipeline_depth; + bool leave_pinned; /* lock queue access */ ompi_mutex_t lock; diff --git a/src/mca/pml/ob1/pml_ob1_component.c b/src/mca/pml/ob1/pml_ob1_component.c index 25ec4184d0..a2786c81dd 100644 --- a/src/mca/pml/ob1/pml_ob1_component.c +++ b/src/mca/pml/ob1/pml_ob1_component.c @@ -77,6 +77,7 @@ static inline int mca_pml_ob1_param_register_int( int mca_pml_ob1_component_open(void) { + int param, value; OBJ_CONSTRUCT(&mca_pml_ob1.lock, ompi_mutex_t); /* requests */ @@ -116,6 +117,11 @@ int mca_pml_ob1_component_open(void) mca_pml_ob1_param_register_int("recv_pipeline_depth", 3); mca_pml_ob1.rdma_offset = mca_pml_ob1_param_register_int("rdma_offset", 1024*1024); + + mca_base_param_register_int("mpi", NULL, "leave_pinned", "leave_pinned", 0); + param = mca_base_param_find("mpi", NULL, "leave_pinned"); + mca_base_param_lookup_int(param, &value); + 
mca_pml_ob1.leave_pinned = value; return mca_bmi_base_open(); } diff --git a/src/mca/pml/ob1/pml_ob1_recvreq.c b/src/mca/pml/ob1/pml_ob1_recvreq.c index 46ce036e9d..28078b196c 100644 --- a/src/mca/pml/ob1/pml_ob1_recvreq.c +++ b/src/mca/pml/ob1/pml_ob1_recvreq.c @@ -164,7 +164,7 @@ static void mca_pml_ob1_recv_request_ack( * - size is larger than the rdma threshold * - rdma devices are available */ - if(recvreq->req_mpool == NULL) { + if(NULL == recvreq->req_mpool && !mca_pml_ob1.leave_pinned) { if(recvreq->req_recv.req_bytes_packed > mca_pml_ob1.rdma_offset && mca_pml_ob1_ep_array_get_size(&proc->bmi_rdma) && ompi_convertor_need_buffers(&recvreq->req_recv.req_convertor) == 0) { @@ -180,7 +180,11 @@ static void mca_pml_ob1_recv_request_ack( ack->hdr_rdma_offset = recvreq->req_recv.req_bytes_packed; } } - + else{ + recvreq->req_rdma_offset = hdr->hdr_frag_length; + ack->hdr_rdma_offset = hdr->hdr_frag_length; + } + /* initialize descriptor */ des->des_flags |= MCA_BMI_DES_FLAGS_PRIORITY; des->des_cbfunc = mca_pml_ob1_send_ctl_complete; @@ -345,7 +349,7 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) * registed with. Otherwise, schedule round-robin across the * available RDMA nics. 
*/ - if(recvreq->req_mpool == NULL) { + if(recvreq->req_mpool == NULL && !mca_pml_ob1.leave_pinned) { ep = mca_pml_ob1_ep_array_get_next(&proc->bmi_rdma); /* if there is only one bmi available or the size is less than @@ -384,23 +388,34 @@ void mca_pml_ob1_recv_request_schedule(mca_pml_ob1_recv_request_t* recvreq) recvreq->pin2[recvreq->pin_index] = get_profiler_timestamp(); #endif } else { - - /* find the endpoint corresponding to this bmi and schedule the entire message */ - ep = mca_pml_ob1_ep_array_find(&proc->bmi_rdma, (mca_bmi_base_module_t*) recvreq->req_mpool->user_data); + struct mca_mpool_base_registration_t * reg; size = bytes_remaining; /* prepare a descriptor for RDMA */ ompi_convertor_set_position(&recvreq->req_recv.req_convertor, &recvreq->req_rdma_offset); + + if(NULL != recvreq->req_mpool){ + /* find the endpoint corresponding to this bmi and schedule the entire message */ + ep = mca_pml_ob1_ep_array_find(&proc->bmi_rdma, (mca_bmi_base_module_t*) recvreq->req_mpool->user_data); + reg = recvreq->req_mpool->mpool_registration; + + + } + else{ + ep = mca_pml_ob1_ep_array_get_next(&proc->bmi_rdma); + reg = NULL; + } + #if MCA_PML_OB1_TIMESTAMPS recvreq->pin1[recvreq->pin_index] = get_profiler_timestamp(); #endif dst = ep->bmi_prepare_dst( - ep->bmi, - ep->bmi_endpoint, - recvreq->req_mpool->mpool_registration, - &recvreq->req_recv.req_convertor, - 0, - &size); + ep->bmi, + ep->bmi_endpoint, + reg, + &recvreq->req_recv.req_convertor, + 0, + &size); #if MCA_PML_OB1_TIMESTAMPS recvreq->pin2[recvreq->pin_index] = get_profiler_timestamp(); #endif diff --git a/src/mca/pml/ob1/pml_ob1_sendreq.c b/src/mca/pml/ob1/pml_ob1_sendreq.c index d384a3f9b1..e8a9465ce5 100644 --- a/src/mca/pml/ob1/pml_ob1_sendreq.c +++ b/src/mca/pml/ob1/pml_ob1_sendreq.c @@ -297,22 +297,31 @@ int mca_pml_ob1_send_request_start( /* check to see if memory is registered */ sendreq->req_chunk = mca_mpool_base_find(sendreq->req_send.req_addr); - /* pack the data into the supplied buffer */ - 
iov.iov_base = (void*)((unsigned char*)segment->seg_addr.pval + - sizeof(mca_pml_ob1_rendezvous_hdr_t)); - iov.iov_len = size; - iov_count = 1; - max_data = size; - if((rc = ompi_convertor_pack( - &sendreq->req_send.req_convertor, - &iov, - &iov_count, - &max_data, - &free_after)) < 0) { - endpoint->bmi_free(endpoint->bmi, descriptor); - return rc; - } + /* if the buffer is not pinned and leave pinned is false we eagerly send + data to cover the cost of pinning the recv buffers on the peer */ + if(NULL == sendreq->req_chunk && !mca_pml_ob1.leave_pinned){ + /* pack the data into the supplied buffer */ + iov.iov_base = (void*)((unsigned char*)segment->seg_addr.pval + + sizeof(mca_pml_ob1_rendezvous_hdr_t)); + iov.iov_len = size; + iov_count = 1; + max_data = size; + if((rc = ompi_convertor_pack( + &sendreq->req_send.req_convertor, + &iov, + &iov_count, + &max_data, + &free_after)) < 0) { + endpoint->bmi_free(endpoint->bmi, descriptor); + return rc; + } + } + /* if the buffer is pinned or leave pinned is true we do not eagerly send + any data */ + else { + max_data = 0; + } /* build hdr */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; hdr->hdr_common.hdr_flags = (sendreq->req_chunk != NULL ? MCA_PML_OB1_HDR_FLAGS_PIN : 0);