From e03956e0999071504c5b8d7941d53e79b8a3649f Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 30 Oct 2014 16:43:41 -0600 Subject: [PATCH] Update the scif and openib btls for the new btl interface Other changes: - Remove the registration argument from prepare_src since it no longer is meant for RDMA buffers. - Additional cleanup and bugfixes. --- opal/mca/btl/btl.h | 23 +- opal/mca/btl/openib/btl_openib.c | 471 +++++++++------------ opal/mca/btl/openib/btl_openib.h | 122 ++++-- opal/mca/btl/openib/btl_openib_component.c | 58 ++- opal/mca/btl/openib/btl_openib_frag.h | 16 +- opal/mca/btl/scif/btl_scif.h | 44 +- opal/mca/btl/scif/btl_scif_add_procs.c | 25 +- opal/mca/btl/scif/btl_scif_component.c | 10 +- opal/mca/btl/scif/btl_scif_frag.c | 4 +- opal/mca/btl/scif/btl_scif_frag.h | 18 +- opal/mca/btl/scif/btl_scif_get.c | 42 +- opal/mca/btl/scif/btl_scif_module.c | 133 ++---- opal/mca/btl/scif/btl_scif_put.c | 52 +-- opal/mca/btl/scif/btl_scif_send.c | 16 +- opal/mca/btl/self/btl_self.c | 37 +- opal/mca/btl/self/btl_self.h | 18 - opal/mca/btl/sm/btl_sm.c | 170 ++++---- opal/mca/btl/sm/btl_sm.h | 58 ++- opal/mca/btl/sm/btl_sm_component.c | 29 +- opal/mca/btl/sm/btl_sm_endpoint.h | 2 + opal/mca/btl/sm/btl_sm_frag.h | 13 + opal/mca/btl/tcp/btl_tcp.c | 156 +++---- opal/mca/btl/tcp/btl_tcp.h | 37 +- opal/mca/btl/tcp/btl_tcp_component.c | 2 +- opal/mca/btl/tcp/btl_tcp_frag.h | 8 +- opal/mca/btl/ugni/btl_ugni.h | 40 +- opal/mca/btl/ugni/btl_ugni_get.c | 15 +- opal/mca/btl/ugni/btl_ugni_module.c | 2 - opal/mca/btl/ugni/btl_ugni_prepare.h | 6 +- opal/mca/btl/ugni/btl_ugni_put.c | 13 +- opal/mca/btl/ugni/btl_ugni_rdma.h | 22 +- opal/mca/btl/ugni/btl_ugni_send.c | 1 - opal/mca/btl/vader/btl_vader.h | 47 +- opal/mca/btl/vader/btl_vader_component.c | 14 +- opal/mca/btl/vader/btl_vader_frag.c | 8 +- opal/mca/btl/vader/btl_vader_frag.h | 44 +- opal/mca/btl/vader/btl_vader_get.c | 46 +- opal/mca/btl/vader/btl_vader_knem.c | 70 +++ opal/mca/btl/vader/btl_vader_knem.h | 6 + 
opal/mca/btl/vader/btl_vader_module.c | 197 ++++----- opal/mca/btl/vader/btl_vader_put.c | 46 +- opal/mca/btl/vader/btl_vader_send.c | 2 +- opal/mca/btl/vader/btl_vader_sendi.c | 4 +- 43 files changed, 1054 insertions(+), 1093 deletions(-) diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index fbbcc21974..f0e82287f1 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -241,23 +241,27 @@ enum { /** Allow local write on the registered region. If a region is registered * with this flag the registration can be used as the local handle for a * btl_get operation. */ - MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x1, + MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x00000001, /** Allow remote read on the registered region. If a region is registered * with this flag the registration can be used as the remote handle for a * btl_get operation. */ - MCA_BTL_REG_FLAG_REMOTE_READ = 0x2, + MCA_BTL_REG_FLAG_REMOTE_READ = 0x00000002, /** Allow remote write on the registered region. If a region is registered * with this flag the registration can be used as the remote handle for a * btl_put operation. */ - MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x4, + MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x00000004, /** Allow remote atomic operations on the registered region. If a region is * registered with this flag the registration can be used as the remote * handle for a btl_atomic_op or btl_atomic_fop operation. */ - MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x8, + MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x00000008, /** Allow any btl operation on the registered region. If a region is registered * with this flag the registration can be used as the local or remote handle for * any btl operation. 
*/ - MCA_BTL_REG_FLAG_ACCESS_ANY = 0xf, + MCA_BTL_REG_FLAG_ACCESS_ANY = 0x0000000f, +#if OPAL_CUDA_GDR_SUPPORT + /** Region is in GPU memory */ + MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000, +#endif }; /** @@ -718,7 +722,6 @@ typedef int (*mca_btl_base_module_free_fn_t)( typedef struct mca_btl_base_descriptor_t* (*mca_btl_base_module_prepare_fn_t)( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -853,10 +856,11 @@ typedef int (*mca_btl_base_module_sendi_fn_t)( * (remote_address, remote_address + size) * @param size (IN) Number of bytes to put * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback * @param cbdata (IN) Data for callback - * + * * @retval OPAL_SUCCESS The descriptor was successfully queued for a put * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put @@ -868,7 +872,7 @@ typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Initiate an asynchronous get. 
@@ -916,6 +920,7 @@ typedef int (*mca_btl_base_module_put_fn_t) (struct mca_btl_base_module_t *btl, * (remote_address, remote_address + size) * @param size (IN) Number of bytes to put * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering * @param cbfunc (IN) Function to call on completion (if queued) * @param cbcontext (IN) Context for the callback * @param cbdata (IN) Data for callback @@ -931,7 +936,7 @@ typedef int (*mca_btl_base_module_get_fn_t) (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Diagnostic dump of btl state. diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index a939aeaa66..a26f960bda 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -91,6 +91,11 @@ #define MIN(a,b) ((a)<(b)?(a):(b)) #endif +static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags); +static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle); + mca_btl_openib_module_t mca_btl_openib_module = { .super = { .btl_component = &mca_btl_openib_component.super, @@ -101,14 +106,15 @@ mca_btl_openib_module_t mca_btl_openib_module = { .btl_alloc = mca_btl_openib_alloc, .btl_free = mca_btl_openib_free, .btl_prepare_src = mca_btl_openib_prepare_src, - .btl_prepare_dst = mca_btl_openib_prepare_dst, .btl_send = mca_btl_openib_send, .btl_sendi = mca_btl_openib_sendi, /* send immediate */ .btl_put = mca_btl_openib_put, .btl_get = mca_btl_openib_get, 
.btl_dump = mca_btl_base_dump, .btl_register_error = mca_btl_openib_register_error_cb, /* error call back registration */ - .btl_ft_event = mca_btl_openib_ft_event + .btl_ft_event = mca_btl_openib_ft_event, + .btl_register_mem = mca_btl_openib_register_mem, + .btl_deregister_mem = mca_btl_openib_deregister_mem, } }; @@ -1226,7 +1232,7 @@ ib_frag_alloc(mca_btl_openib_module_t *btl, size_t size, uint8_t order, /* check if pending fragment has enough space for coalescing */ static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list, - opal_mutex_t *lock, mca_btl_base_endpoint_t *ep, size_t size) + opal_mutex_t *lock, struct mca_btl_base_endpoint_t *ep, size_t size) { mca_btl_openib_send_frag_t *frag = NULL; @@ -1390,12 +1396,6 @@ int mca_btl_openib_free( to_send_frag(des)->hdr + 1; assert(!opal_list_get_size(&to_send_frag(des)->coalesced_frags)); /* fall through */ - case MCA_BTL_OPENIB_FRAG_RECV: - case MCA_BTL_OPENIB_FRAG_RECV_USER: - case MCA_BTL_OPENIB_FRAG_SEND_USER: - to_base_frag(des)->base.des_remote = NULL; - to_base_frag(des)->base.des_remote_count = 0; - break; default: break; } @@ -1430,7 +1430,6 @@ int mca_btl_openib_free( mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -1438,7 +1437,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( uint32_t flags) { mca_btl_openib_module_t *openib_btl; - mca_btl_openib_reg_t *openib_reg; mca_btl_openib_com_frag_t *frag = NULL; struct iovec iov; uint32_t iov_count = 1; @@ -1448,83 +1446,20 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( openib_btl = (mca_btl_openib_module_t*)btl; -#if OPAL_CUDA_GDR_SUPPORT - if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) { -#else - if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) { -#endif /* 
OPAL_CUDA_GDR_SUPPORT */ - /* GMS bloody HACK! */ - if(registration != NULL || max_data > btl->btl_max_send_size) { - frag = alloc_send_user_frag(); - if(NULL == frag) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = NULL; - - opal_convertor_pack(convertor, &iov, &iov_count, &max_data); - - *size = max_data; - - if(NULL == registration) { - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, - iov.iov_base, max_data, 0, ®istration); - if(OPAL_SUCCESS != rc || NULL == registration) { - MCA_BTL_IB_FRAG_RETURN(frag); - return NULL; - } - /* keep track of the registration we did */ - to_com_frag(frag)->registration = - (mca_btl_openib_reg_t*)registration; - } - openib_reg = (mca_btl_openib_reg_t*)registration; - - frag->sg_entry.length = max_data; - frag->sg_entry.lkey = openib_reg->mr->lkey; - frag->sg_entry.addr = (uint64_t)(uintptr_t)iov.iov_base; - - to_base_frag(frag)->base.order = order; - to_base_frag(frag)->base.des_flags = flags; - to_base_frag(frag)->segment.base.seg_len = max_data; - to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base; - to_base_frag(frag)->segment.key = frag->sg_entry.lkey; - - assert(MCA_BTL_NO_ORDER == order); - - BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64, - frag->sg_entry.lkey, frag->sg_entry.addr)); - - return &to_base_frag(frag)->base; - } - } - assert(MCA_BTL_NO_ORDER == order); - if(max_data + reserve > btl->btl_max_send_size) { + if (max_data + reserve > btl->btl_max_send_size) { max_data = btl->btl_max_send_size - reserve; } - if (OPAL_UNLIKELY(0 == reserve)) { - frag = (mca_btl_openib_com_frag_t *) ib_frag_alloc(openib_btl, max_data, order, flags); - if(NULL == frag) - return NULL; - - /* NTH: this frag will be ue used for either a get or put so we need to set the lval to be - consistent with the usage in get and put. 
the pval will be restored in mca_btl_openib_free */ - ptr = to_base_frag(frag)->segment.base.seg_addr.pval; - to_base_frag(frag)->segment.base.seg_addr.lval = - (uint64_t)(uintptr_t) ptr; - } else { - frag = - (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc(btl, endpoint, order, + frag = (mca_btl_openib_com_frag_t *) mca_btl_openib_alloc (btl, endpoint, order, max_data + reserve, flags); - if(NULL == frag) - return NULL; - - ptr = to_base_frag(frag)->segment.base.seg_addr.pval; + if (NULL == frag) { + return NULL; } + ptr = to_base_frag(frag)->segment.base.seg_addr.pval; + iov.iov_len = max_data; iov.iov_base = (IOVBASE_TYPE *) ( (unsigned char*) ptr + reserve ); rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data); @@ -1547,103 +1482,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( return &to_base_frag(frag)->base; } -/** - * Prepare the dst buffer - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - * prepare dest's behavior depends on the following: - * Has a valid memory registration been passed to prepare_src? - * if so we attempt to use the pre-registered user-buffer, if the memory registration - * is to small (only a portion of the user buffer) then we must reregister the user buffer - * Has the user requested the memory to be left pinned? - * if so we insert the memory registration into a memory tree for later lookup, we - * may also remove a previous registration if a MRU (most recently used) list of - * registrations is full, this prevents resources from being exhausted. 
- */ -mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - mca_btl_openib_module_t *openib_btl; - mca_btl_openib_component_t *openib_component; - mca_btl_openib_com_frag_t *frag; - mca_btl_openib_reg_t *openib_reg; - uint32_t max_msg_sz; - int rc; - void *buffer; - - openib_btl = (mca_btl_openib_module_t*)btl; - openib_component = (mca_btl_openib_component_t*)btl->btl_component; - - frag = alloc_recv_user_frag(); - if(NULL == frag) { - return NULL; - } - - /* max_msg_sz is the maximum message size of the HCA (hw limitation) - set the minimum between local max_msg_sz and the remote */ - max_msg_sz = MIN(openib_btl->ib_port_attr.max_msg_sz, - endpoint->endpoint_btl->ib_port_attr.max_msg_sz); - - /* check if user has explicitly limited the max message size */ - if (openib_component->max_hw_msg_size > 0 && - max_msg_sz > (size_t)openib_component->max_hw_msg_size) { - max_msg_sz = openib_component->max_hw_msg_size; - } - - /* limit the message so to max_msg_sz */ - if (*size > (size_t)max_msg_sz) { - *size = (size_t)max_msg_sz; - BTL_VERBOSE(("message size limited to %" PRIsize_t "\n", *size)); - } - - opal_convertor_get_current_pointer(convertor, &buffer); - - if(NULL == registration){ - /* we didn't get a memory registration passed in, so we have to - * register the region ourselves - */ - uint32_t mflags = 0; -#if OPAL_CUDA_GDR_SUPPORT - if (convertor->flags & CONVERTOR_CUDA) { - mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM; - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags, - ®istration); - if(OPAL_SUCCESS != rc || NULL == registration) { - MCA_BTL_IB_FRAG_RETURN(frag); - return NULL; - } - /* keep track of the registration we did */ - frag->registration = 
(mca_btl_openib_reg_t*)registration; - } - openib_reg = (mca_btl_openib_reg_t*)registration; - - frag->sg_entry.length = *size; - frag->sg_entry.lkey = openib_reg->mr->lkey; - frag->sg_entry.addr = (uint64_t)(uintptr_t)buffer; - - to_base_frag(frag)->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) buffer; - to_base_frag(frag)->segment.base.seg_len = *size; - to_base_frag(frag)->segment.key = openib_reg->mr->rkey; - to_base_frag(frag)->base.order = order; - to_base_frag(frag)->base.des_flags = flags; - - BTL_VERBOSE(("frag->sg_entry.lkey = %" PRIu32 " .addr = %" PRIx64 " " - "rkey = %" PRIu32, frag->sg_entry.lkey, frag->sg_entry.addr, - openib_reg->mr->rkey)); - - return &to_base_frag(frag)->base; -} - static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) { mca_btl_openib_module_t* openib_btl; mca_btl_openib_endpoint_t* endpoint; @@ -1997,40 +1835,127 @@ int mca_btl_openib_send( return mca_btl_openib_endpoint_send(ep, frag); } +static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags) +{ + mca_btl_openib_reg_t *reg; + uint32_t mflags = 0; + int rc; + +#if OPAL_CUDA_GDR_SUPPORT + if (flags & MCA_BTL_REG_FLAG_CUDA_GPU_MEM) { + mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM; + } +#endif /* OPAL_CUDA_GDR_SUPPORT */ + + rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags, + (mca_mpool_base_registration_t **) ®); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) { + return NULL; + } + + return ®->btl_handle; +} + +static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) +{ + mca_btl_openib_reg_t *reg = (mca_btl_openib_reg_t *)((intptr_t) handle - offsetof (mca_btl_openib_reg_t, btl_handle)); + + btl->btl_mpool->mpool_deregister (btl->btl_mpool, (mca_mpool_base_registration_t *) reg); + + return OPAL_SUCCESS; +} + /* * RDMA WRITE local buffer to 
remote buffer address. */ -int mca_btl_openib_put( mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* ep, - mca_btl_base_descriptor_t* descriptor) +int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_segments; - mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_remote; - struct ibv_send_wr* bad_wr; - mca_btl_openib_out_frag_t* frag = to_out_frag(descriptor); - int qp = descriptor->order; - uint64_t rem_addr = dst_seg->base.seg_addr.lval; - uint32_t rkey = dst_seg->key; + mca_btl_openib_put_frag_t *frag = NULL; + int rc, qp = order; - assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND_USER || - openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_SEND); - - descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - int rc; - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, descriptor, &ep->pending_put_frags); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OPAL_ERR_RESOURCE_BUSY == rc) - return OPAL_SUCCESS; - if(OPAL_SUCCESS != rc) - return rc; + if (OPAL_UNLIKELY(size > btl->btl_put_limit)) { + return OPAL_ERR_BAD_PARAM; } - if(MCA_BTL_NO_ORDER == qp) + frag = to_put_frag(alloc_send_user_frag ()); + if (OPAL_UNLIKELY(NULL == frag)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + if (MCA_BTL_NO_ORDER == qp) { qp = mca_btl_openib_component.rdma_qp; + } + + /* set base descriptor flags */ + to_base_frag(frag)->base.order = order; + /* free this descriptor when the operation is complete */ + to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + + /* set up scatter-gather 
entry */ + to_com_frag(frag)->sg_entry.length = size; + to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; + to_com_frag(frag)->sg_entry.addr = (uint64_t)(intptr_t) local_address; + to_com_frag(frag)->endpoint = ep; + + /* set up rdma callback */ + frag->cb.func = cbfunc; + frag->cb.context = cbcontext; + frag->cb.data = cbdata; + frag->cb.local_handle = local_handle; + + /* post descriptor */ + to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE; + to_out_frag(frag)->sr_desc.send_flags = ib_send_flags(size, &(ep->qps[qp]), 1); + to_out_frag(frag)->sr_desc.wr.rdma.remote_addr = remote_address; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) + != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + to_out_frag(frag)->sr_desc.wr.rdma.rkey = opal_swap_bytes4(remote_handle->rkey); + } else +#endif + { + to_out_frag(frag)->sr_desc.wr.rdma.rkey = remote_handle->rkey; + } + +#if HAVE_XRC + if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) + to_out_frag(frag)->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num; +#endif + + if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { + OPAL_THREAD_LOCK(&ep->endpoint_lock); + rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_put_frags); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); + if (OPAL_ERR_RESOURCE_BUSY == rc) { + /* descriptor was queued pending connection */ + return OPAL_SUCCESS; + } + + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + MCA_BTL_IB_FRAG_RETURN (frag); + return rc; + } + } + + rc = mca_btl_openib_put_internal (btl, ep, qp, frag); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + MCA_BTL_IB_FRAG_RETURN (frag); + } + + return rc; +} + +int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, + int qp, mca_btl_openib_put_frag_t *frag) +{ + struct ibv_send_wr *bad_wr; /* check for a send wqe */ if (qp_get_wqe(ep, qp) < 0) { @@ -2040,35 +1965,11 @@ int mca_btl_openib_put( 
mca_btl_base_module_t* btl, OPAL_THREAD_UNLOCK(&ep->endpoint_lock); return OPAL_SUCCESS; } - /* post descriptor */ -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - rem_addr = opal_swap_bytes8(rem_addr); - rkey = opal_swap_bytes4(rkey); - } -#endif - frag->sr_desc.wr.rdma.remote_addr = rem_addr; - frag->sr_desc.wr.rdma.rkey = rkey; - - to_com_frag(frag)->sg_entry.addr = src_seg->base.seg_addr.lval; - to_com_frag(frag)->sg_entry.length = src_seg->base.seg_len; - to_com_frag(frag)->endpoint = ep; -#if HAVE_XRC - if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) - frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num; -#endif - - descriptor->order = qp; - /* Setting opcode on a frag constructor isn't enough since prepare_src - * may return send_frag instead of put_frag */ - frag->sr_desc.opcode = IBV_WR_RDMA_WRITE; - frag->sr_desc.send_flags = ib_send_flags(src_seg->base.seg_len, &(ep->qps[qp]), 1); qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); qp_reset_signal_count(ep, qp); - if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) + if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &to_out_frag(frag)->sr_desc, &bad_wr)) return OPAL_ERROR; return OPAL_SUCCESS; @@ -2078,35 +1979,84 @@ int mca_btl_openib_put( mca_btl_base_module_t* btl, * RDMA READ remote buffer to local buffer address. 
*/ -int mca_btl_openib_get(mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* ep, - mca_btl_base_descriptor_t* descriptor) +int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_openib_segment_t *src_seg = (mca_btl_openib_segment_t *) descriptor->des_remote; - mca_btl_openib_segment_t *dst_seg = (mca_btl_openib_segment_t *) descriptor->des_segments; - struct ibv_send_wr* bad_wr; - mca_btl_openib_get_frag_t* frag = to_get_frag(descriptor); - int qp = descriptor->order; - uint64_t rem_addr = src_seg->base.seg_addr.lval; - uint32_t rkey = src_seg->key; + mca_btl_openib_get_frag_t* frag = NULL; + int qp = order; + int rc; - assert(openib_frag_type(frag) == MCA_BTL_OPENIB_FRAG_RECV_USER); - - descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - if(ep->endpoint_state != MCA_BTL_IB_CONNECTED) { - int rc; - OPAL_THREAD_LOCK(&ep->endpoint_lock); - rc = check_endpoint_state(ep, descriptor, &ep->pending_get_frags); - OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(OPAL_ERR_RESOURCE_BUSY == rc) - return OPAL_SUCCESS; - if(OPAL_SUCCESS != rc) - return rc; + if (OPAL_UNLIKELY(size > btl->btl_get_limit)) { + return OPAL_ERR_BAD_PARAM; } - if(MCA_BTL_NO_ORDER == qp) + frag = to_get_frag(alloc_recv_user_frag()); + if (OPAL_UNLIKELY(NULL == frag)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + if (MCA_BTL_NO_ORDER == qp) { qp = mca_btl_openib_component.rdma_qp; + } + + /* set base descriptor flags */ + to_base_frag(frag)->base.order = order; + /* free this descriptor when the operation is complete */ + to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + + /* set up scatter-gather entry */ + to_com_frag(frag)->sg_entry.length = size; + 
to_com_frag(frag)->sg_entry.lkey = local_handle->lkey; + to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address; + to_com_frag(frag)->endpoint = ep; + + /* set up rdma callback */ + frag->cb.func = cbfunc; + frag->cb.context = cbcontext; + frag->cb.data = cbdata; + frag->cb.local_handle = local_handle; + + /* set up descriptor */ + frag->sr_desc.wr.rdma.remote_addr = remote_address; + +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) + != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { + frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey); + } else +#endif + { + frag->sr_desc.wr.rdma.rkey = remote_handle->rkey; + } + + if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) { + OPAL_THREAD_LOCK(&ep->endpoint_lock); + rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags); + OPAL_THREAD_UNLOCK(&ep->endpoint_lock); + if (OPAL_ERR_RESOURCE_BUSY == rc) { + return OPAL_SUCCESS; + } + + if (OPAL_SUCCESS != rc) { + MCA_BTL_IB_FRAG_RETURN (frag); + return rc; + } + } + + rc = mca_btl_openib_get_internal (btl, ep, qp, frag); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + MCA_BTL_IB_FRAG_RETURN (frag); + } + + return rc; +} + +int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, + int qp, mca_btl_openib_get_frag_t *frag) +{ + struct ibv_send_wr* bad_wr; /* check for a send wqe */ if (qp_get_wqe(ep, qp) < 0) { @@ -2118,7 +2068,7 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, } /* check for a get token */ - if(OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) { + if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) { qp_put_wqe(ep, qp); OPAL_THREAD_ADD32(&ep->get_tokens,1); OPAL_THREAD_LOCK(&ep->endpoint_lock); @@ -2127,30 +2077,15 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, return OPAL_SUCCESS; } -#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT - if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) - != 
(opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) { - rem_addr = opal_swap_bytes8(rem_addr); - rkey = opal_swap_bytes4(rkey); - } -#endif - frag->sr_desc.wr.rdma.remote_addr = rem_addr; - frag->sr_desc.wr.rdma.rkey = rkey; - - to_com_frag(frag)->sg_entry.addr = dst_seg->base.seg_addr.lval; - to_com_frag(frag)->sg_entry.length = dst_seg->base.seg_len; - to_com_frag(frag)->endpoint = ep; - #if HAVE_XRC if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) frag->sr_desc.xrc_remote_srq_num=ep->rem_info.rem_srqs[qp].rem_srq_num; #endif - descriptor->order = qp; qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag)); qp_reset_signal_count(ep, qp); - if(ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) + if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) return OPAL_ERROR; return OPAL_SUCCESS; diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index cfcf367461..1972301967 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -497,9 +497,15 @@ typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; extern mca_btl_openib_module_t mca_btl_openib_module; +struct mca_btl_base_registration_handle_t { + uint32_t rkey; + uint32_t lkey; +}; + struct mca_btl_openib_reg_t { mca_mpool_base_registration_t base; struct ibv_mr *mr; + mca_btl_base_registration_handle_t btl_handle; }; typedef struct mca_btl_openib_reg_t mca_btl_openib_reg_t; @@ -612,32 +618,91 @@ extern int mca_btl_openib_sendi( struct mca_btl_base_module_t* btl, mca_btl_base_descriptor_t** descriptor ); -/** - * PML->BTL Initiate a put of the specified size. - * - * @param btl (IN) BTL instance - * @param btl_peer (IN) BTL peer addressing - * @param descriptor (IN) Descriptor of data to be transmitted. 
- */ -extern int mca_btl_openib_put( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* descriptor - ); +/* forward declaration for internal put/get */ +struct mca_btl_openib_put_frag_t; +struct mca_btl_openib_get_frag_t; /** - * PML->BTL Initiate a get of the specified size. + * @brief Schedule a put fragment with the HCA (internal) * * @param btl (IN) BTL instance - * @param btl_base_peer (IN) BTL peer addressing - * @param descriptor (IN) Descriptor of data to be transmitted. + * @param ep (IN) BTL endpoint + * @param qp (IN) ID of queue pair to schedule the put on + * @param frag (IN) Fragment prepared by mca_btl_openib_put + * + * If the fragment can not be scheduled due to resource limitations then + * the fragment will be put on the pending put fragment list and retried + * when another get/put fragment has completed. */ -extern int mca_btl_openib_get( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* descriptor - ); +int mca_btl_openib_put_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, + int qp, struct mca_btl_openib_put_frag_t *frag); +/** + * @brief Schedule an RDMA write with the HCA + * + * @param btl (IN) BTL instance + * @param ep (IN) BTL endpoint + * @param local_address (IN) Source address + * @param remote_address (IN) Destination address + * @param local_handle (IN) Registration handle for region containing the region {local_address, size} + * @param remote_handle (IN) Registration handle for region containing the region {remote_address, size} + * @param size (IN) Number of bytes to write + * @param flags (IN) Transfer flags + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion + * @param cbcontext (IN) Context for completion callback + * @param cbdata (IN) Data for completion callback + * + * @return OPAL_ERR_BAD_PARAM if a bad parameter was passed + *
@return OPAL_SUCCCESS if the operation was successfully scheduled + * + * This function will attempt to schedule a put operation with the HCA. + */ +int mca_btl_openib_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +/** + * @brief Schedule a get fragment with the HCA (internal) + * + * @param btl (IN) BTL instance + * @param ep (IN) BTL endpoint + * @param qp (IN) ID of queue pair to schedule the get on + * @param frag (IN) Fragment prepared by mca_btl_openib_get + * + * If the fragment can not be scheduled due to resource limitations then + * the fragment will be put on the pending get fragment list and retried + * when another get/put fragment has completed. + */ +int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, + int qp, struct mca_btl_openib_get_frag_t *frag); + +/** + * @brief Schedule an RDMA read with the HCA + * + * @param btl (IN) BTL instance + * @param ep (IN) BTL endpoint + * @param local_address (IN) Destination address + * @param remote_address (IN) Source address + * @param local_handle (IN) Registration handle for region containing the region {local_address, size} + * @param remote_handle (IN) Registration handle for region containing the region {remote_address, size} + * @param size (IN) Number of bytes to read + * @param flags (IN) Transfer flags + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion + * @param cbcontext (IN) Context for completion callback + * @param cbdata (IN) Data for completion callback + * + * @return OPAL_ERR_BAD_PARAM if a bad parameter was passed + * @return OPAL_SUCCCESS if the operation was successfully scheduled + * + * This function will attempt to 
schedule a get operation with the HCA. + */ +int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Allocate a descriptor. @@ -674,7 +739,6 @@ extern int mca_btl_openib_free( mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -682,22 +746,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src( uint32_t flags ); -/** - * Allocate a descriptor initialized for RDMA write. - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - */ -extern mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); - extern void mca_btl_openib_frag_progress_pending_put_get( struct mca_btl_base_endpoint_t*, const int); diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index d195146b4b..73b8227c49 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -605,6 +605,9 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size, return OPAL_ERR_OUT_OF_RESOURCE; } + openib_reg->btl_handle.lkey = openib_reg->mr->lkey; + openib_reg->btl_handle.rkey = openib_reg->mr->rkey; + OPAL_OUTPUT_VERBOSE((30, mca_btl_openib_component.memory_registration_verbose, "openib_reg_mr: base=%p, bound=%p, size=%d, flags=0x%x", reg->base, reg->bound, (int) 
(reg->bound - reg->base + 1), reg->flags)); @@ -804,7 +807,19 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbfunc = btl_openib_control; mca_btl_base_active_message_trigger[MCA_BTL_TAG_IB].cbdata = NULL; - openib_btl->super.btl_seg_size = sizeof (mca_btl_openib_segment_t); + if (openib_btl->super.btl_get_limit > openib_btl->ib_port_attr.max_msg_sz) { + openib_btl->super.btl_get_limit = openib_btl->ib_port_attr.max_msg_sz; + } + + openib_btl->super.btl_get_alignment = 0; + + if (openib_btl->super.btl_put_limit > openib_btl->ib_port_attr.max_msg_sz) { + openib_btl->super.btl_put_limit = openib_btl->ib_port_attr.max_msg_sz; + } + + openib_btl->super.btl_put_alignment = 0; + + openib_btl->super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); /* Check bandwidth configured for this device */ sprintf(param, "bandwidth_%s", ibv_get_device_name(device->ib_dev)); @@ -2881,16 +2896,15 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, size_t i, len = opal_list_get_size(&ep->pending_get_frags); int rc; - for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) - { + for(i = 0; i < len && ep->qps[qp].qp->sd_wqe > 0 && ep->get_tokens > 0; i++) { OPAL_THREAD_LOCK(&ep->endpoint_lock); frag = opal_list_remove_first(&(ep->pending_get_frags)); OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(NULL == frag) + if (NULL == frag) break; - rc = mca_btl_openib_get((mca_btl_base_module_t *)openib_btl, ep, - &to_base_frag(frag)->base); - if(OPAL_ERR_OUT_OF_RESOURCE == rc) + rc = mca_btl_openib_get_internal ((mca_btl_base_module_t *)openib_btl, ep, + qp, to_get_frag(frag)); + if (OPAL_ERR_OUT_OF_RESOURCE == rc) break; } @@ -2899,11 +2913,11 @@ void mca_btl_openib_frag_progress_pending_put_get(mca_btl_base_endpoint_t *ep, OPAL_THREAD_LOCK(&ep->endpoint_lock); frag = opal_list_remove_first(&(ep->pending_put_frags)); 
OPAL_THREAD_UNLOCK(&ep->endpoint_lock); - if(NULL == frag) + if (NULL == frag) break; - rc = mca_btl_openib_put((mca_btl_base_module_t*)openib_btl, ep, - &to_base_frag(frag)->base); - if(OPAL_ERR_OUT_OF_RESOURCE == rc) + rc = mca_btl_openib_put_internal ((mca_btl_base_module_t*)openib_btl, ep, + qp, to_put_frag(frag)); + if (OPAL_ERR_OUT_OF_RESOURCE == rc) break; } } @@ -3266,11 +3280,25 @@ static void handle_wc(mca_btl_openib_device_t* device, const uint32_t cq, /* Handle work completions */ switch(wc->opcode) { case IBV_WC_RDMA_READ: - OPAL_OUTPUT((-1, "Got WC: RDMA_READ")); - OPAL_THREAD_ADD32(&endpoint->get_tokens, 1); - /* fall through */ - case IBV_WC_RDMA_WRITE: + OPAL_OUTPUT((-1, "Got WC: RDMA_READ or RDMA_WRITE")); + + if (IBV_WC_RDMA_READ == wc->opcode) { + OPAL_THREAD_ADD32(&endpoint->get_tokens, 1); + + mca_btl_openib_get_frag_t *get_frag = to_get_frag(des); + + get_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, + get_frag->cb.local_handle, get_frag->cb.context, get_frag->cb.data, + OPAL_SUCCESS); + } else { + mca_btl_openib_put_frag_t *put_frag = to_put_frag(des); + + put_frag->cb.func (&openib_btl->super, endpoint, (void *)(intptr_t) frag->sg_entry.addr, + put_frag->cb.local_handle, put_frag->cb.context, put_frag->cb.data, + OPAL_SUCCESS); + } + /* fall through */ case IBV_WC_SEND: OPAL_OUTPUT((-1, "Got WC: RDMA_WRITE or SEND")); if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { diff --git a/opal/mca/btl/openib/btl_openib_frag.h b/opal/mca/btl/openib/btl_openib_frag.h index 24bc3a93ad..b2e2d2dfc3 100644 --- a/opal/mca/btl/openib/btl_openib_frag.h +++ b/opal/mca/btl/openib/btl_openib_frag.h @@ -349,7 +349,15 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_recv_frag_t); #define to_recv_frag(f) ((mca_btl_openib_recv_frag_t*)(f)) -typedef struct mca_btl_openib_out_frag_t mca_btl_openib_put_frag_t; +typedef struct mca_btl_openib_put_frag_t { + mca_btl_openib_out_frag_t super; + struct { + 
mca_btl_base_rdma_completion_fn_t func; + mca_btl_base_registration_handle_t *local_handle; + void *context; + void *data; + } cb; +} mca_btl_openib_put_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t); #define to_put_frag(f) ((mca_btl_openib_put_frag_t*)(f)) @@ -357,6 +365,12 @@ OBJ_CLASS_DECLARATION(mca_btl_openib_put_frag_t); typedef struct mca_btl_openib_get_frag_t { mca_btl_openib_in_frag_t super; struct ibv_send_wr sr_desc; + struct { + mca_btl_base_rdma_completion_fn_t func; + mca_btl_base_registration_handle_t *local_handle; + void *context; + void *data; + } cb; } mca_btl_openib_get_frag_t; OBJ_CLASS_DECLARATION(mca_btl_openib_get_frag_t); diff --git a/opal/mca/btl/scif/btl_scif.h b/opal/mca/btl/scif/btl_scif.h index b8d9aabaf5..83ee601517 100644 --- a/opal/mca/btl/scif/btl_scif.h +++ b/opal/mca/btl/scif/btl_scif.h @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -197,29 +197,21 @@ int mca_btl_scif_sendi (struct mca_btl_base_module_t *btl, * Initiate a get operation. 
* * location: btl_scif_get.c - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int -mca_btl_scif_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Initiate a put operation. * * location: btl_scif_put.c - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int -mca_btl_scif_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); mca_btl_base_descriptor_t * mca_btl_scif_alloc(struct mca_btl_base_module_t *btl, @@ -228,9 +220,25 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl, int mca_btl_scif_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint); +struct mca_btl_scif_reg_t; + +struct mca_btl_base_registration_handle_t { + /** scif offset */ + off_t scif_offset; + /** base address of this scif region */ + uintptr_t scif_base; +}; + +struct mca_btl_scif_registration_handle_t { + mca_btl_base_registration_handle_t btl_handle; + struct mca_btl_scif_reg_t *reg; +}; +typedef struct 
mca_btl_scif_registration_handle_t mca_btl_scif_registration_handle_t; + typedef struct mca_btl_scif_reg_t { mca_mpool_base_registration_t base; - off_t *registrations; + /** per-endpoint btl handles for this registration */ + mca_btl_scif_registration_handle_t *handles; } mca_btl_scif_reg_t; /* Global structures */ diff --git a/opal/mca/btl/scif/btl_scif_add_procs.c b/opal/mca/btl/scif/btl_scif_add_procs.c index 4a6d838102..f801ee5c7a 100644 --- a/opal/mca/btl/scif/btl_scif_add_procs.c +++ b/opal/mca/btl/scif/btl_scif_add_procs.c @@ -165,14 +165,14 @@ static int scif_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg) /* register the fragment with all connected endpoints */ for (i = 0 ; i < (int) mca_btl_scif_module.endpoint_count ; ++i) { - if ((off_t)-1 != scif_reg->registrations[i] && + if ((off_t)-1 != scif_reg->handles[i].btl_handle.scif_offset && MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) { (void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd, - scif_reg->registrations[i], size); + scif_reg->handles[i].btl_handle.scif_offset, size); } } - free (scif_reg->registrations); + free (scif_reg->handles); return OPAL_SUCCESS; } @@ -184,17 +184,22 @@ static int scif_reg_mem (void *reg_data, void *base, size_t size, int rc = OPAL_SUCCESS; unsigned int i; - scif_reg->registrations = calloc (mca_btl_scif_module.endpoint_count, - sizeof (off_t)); - memset (scif_reg->registrations, -1, mca_btl_scif_module.endpoint_count * sizeof (off_t)); + scif_reg->handles = calloc (mca_btl_scif_module.endpoint_count, sizeof (scif_reg->handles[0])); + + /* intialize all scif offsets to -1 and initialize the pointer back to the mpool registration */ + for (i = 0 ; i < mca_btl_scif_module.endpoint_count ; ++i) { + scif_reg->handles[i].btl_handle.scif_offset = -1; + scif_reg->handles[i].btl_handle.scif_base = (intptr_t) base; + scif_reg->handles[i].reg = scif_reg; + } /* register the pointer with all connected endpoints */ for (i = 0 ; 
i < mca_btl_scif_module.endpoint_count ; ++i) { if (MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) { - scif_reg->registrations[i] = scif_register(mca_btl_scif_module.endpoints[i].scif_epd, - base, size, 0, SCIF_PROT_READ | - SCIF_PROT_WRITE, 0); - if (SCIF_REGISTER_FAILED == scif_reg->registrations[i]) { + scif_reg->handles[i].btl_handle.scif_offset = scif_register (mca_btl_scif_module.endpoints[i].scif_epd, + base, size, 0, SCIF_PROT_READ | + SCIF_PROT_WRITE, 0); + if (SCIF_REGISTER_FAILED == scif_reg->handles[i].btl_handle.scif_offset) { /* cleanup */ scif_dereg_mem (reg_data, reg); rc = OPAL_ERR_OUT_OF_RESOURCE; diff --git a/opal/mca/btl/scif/btl_scif_component.c b/opal/mca/btl/scif/btl_scif_component.c index 33d500f5d3..36db880445 100644 --- a/opal/mca/btl/scif/btl_scif_component.c +++ b/opal/mca/btl/scif/btl_scif_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -171,7 +171,7 @@ static int btl_scif_component_register(void) mca_btl_scif_module.super.btl_flags = MCA_BTL_FLAGS_SEND | MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; - mca_btl_scif_module.super.btl_seg_size = sizeof (mca_btl_scif_segment_t); + mca_btl_scif_module.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); mca_btl_scif_module.super.btl_bandwidth = 50000; /* Mbs */ mca_btl_scif_module.super.btl_latency = 2; /* Microsecs */ @@ -330,10 +330,10 @@ static int mca_btl_scif_progress_recvs (mca_btl_base_endpoint_t *ep) * limitation has not appeared to cause any performance * problems. 
*/ frag.base.des_segment_count = 1; - frag.segments[0].base.seg_len = hdr->size; - frag.segments[0].base.seg_addr.pval = (void *) (hdr + 1); + frag.segments[0].seg_len = hdr->size; + frag.segments[0].seg_addr.pval = (void *) (hdr + 1); - frag.base.des_segments = &frag.segments[0].base; + frag.base.des_segments = frag.segments; /* call the registered callback function */ reg->cbfunc(&mca_btl_scif_module.super, hdr->tag, &frag.base, reg->cbdata); diff --git a/opal/mca/btl/scif/btl_scif_frag.c b/opal/mca/btl/scif/btl_scif_frag.c index 651e88cf51..6a684defb6 100644 --- a/opal/mca/btl/scif/btl_scif_frag.c +++ b/opal/mca/btl/scif/btl_scif_frag.c @@ -15,13 +15,13 @@ static inline void mca_btl_scif_base_frag_constructor (mca_btl_scif_base_frag_t *frag) { memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); - frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; + frag->segments[0].seg_addr.pval = frag->base.super.ptr; } static inline void mca_btl_scif_eager_frag_constructor (mca_btl_scif_base_frag_t *frag) { memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); - frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; + frag->segments[0].seg_addr.pval = frag->base.super.ptr; } OBJ_CLASS_INSTANCE(mca_btl_scif_eager_frag_t, mca_btl_base_descriptor_t, diff --git a/opal/mca/btl/scif/btl_scif_frag.h b/opal/mca/btl/scif/btl_scif_frag.h index a4ade0c9f4..2f6736a6dc 100644 --- a/opal/mca/btl/scif/btl_scif_frag.h +++ b/opal/mca/btl/scif/btl_scif_frag.h @@ -15,16 +15,6 @@ #include "btl_scif.h" #include "btl_scif_endpoint.h" -typedef struct mca_btl_scif_segment_t { - mca_btl_base_segment_t base; - - /* scif offset */ - off_t scif_offset; - - /* original pointer */ - uint64_t orig_ptr; -} mca_btl_scif_segment_t; - typedef struct mca_btl_scif_frag_hdr_t { #if defined(SCIF_USE_SEQ) uint32_t seq; @@ -41,7 +31,7 @@ typedef void (*frag_cb_t) (struct mca_btl_scif_base_frag_t *, int); typedef struct 
mca_btl_scif_base_frag_t { mca_btl_base_descriptor_t base; mca_btl_scif_frag_hdr_t hdr; - mca_btl_scif_segment_t segments[2]; + mca_btl_base_segment_t segments[2]; mca_btl_base_endpoint_t *endpoint; mca_btl_scif_reg_t *registration; ompi_free_list_t *my_list; @@ -78,9 +68,9 @@ static inline int mca_btl_scif_frag_return (mca_btl_scif_base_frag_t *frag) frag->registration = NULL; } - frag->segments[0].base.seg_addr.pval = frag->base.super.ptr; - frag->segments[0].base.seg_len = 0; - frag->segments[1].base.seg_len = 0; + frag->segments[0].seg_addr.pval = frag->base.super.ptr; + frag->segments[0].seg_len = 0; + frag->segments[1].seg_len = 0; OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *) frag); diff --git a/opal/mca/btl/scif/btl_scif_get.c b/opal/mca/btl/scif/btl_scif_get.c index a15f218c4f..131352b327 100644 --- a/opal/mca/btl/scif/btl_scif_get.c +++ b/opal/mca/btl/scif/btl_scif_get.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -20,18 +20,13 @@ /** * Initiate a get operation. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int mca_btl_scif_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) { - mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_remote; - mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_segments; - size_t len = lmin (src->base.seg_len, dst->base.seg_len); - int rc, mark, flags = 0; +int mca_btl_scif_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + int rc, mark, scif_flags = 0; off_t roffset, loffset; #if defined(SCIF_TIMING) struct timespec ts; @@ -41,30 +36,27 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl, mca_btl_scif_component.get_count++; #endif - BTL_VERBOSE(("Using DMA Get for frag %p from offset %lu", (void *) des, - (unsigned long) src->scif_offset)); + BTL_VERBOSE(("Using DMA Get from remote address %" PRIx64 " to local address %p", + remote_address, local_address)); - roffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval); - loffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval); + roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base); + loffset = local_handle->scif_offset + (off_t)((intptr_t)local_address - local_handle->scif_base); if (mca_btl_scif_component.rma_use_cpu) { - flags = SCIF_RMA_USECPU; + scif_flags = SCIF_RMA_USECPU; } if (mca_btl_scif_component.rma_sync) { - flags |= SCIF_RMA_SYNC; + scif_flags |= SCIF_RMA_SYNC; } /* start the read */ - rc = scif_readfrom (endpoint->scif_epd, loffset, len, roffset, 
flags); + rc = scif_readfrom (endpoint->scif_epd, loffset, size, roffset, scif_flags); if (OPAL_UNLIKELY(-1 == rc)) { return OPAL_ERROR; } - /* always call the callback function */ - des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - if (!(flags & SCIF_RMA_SYNC)) { + if (!(scif_flags & SCIF_RMA_SYNC)) { /* according to the scif documentation is is better to use a fence rather * than using the SCIF_RMA_SYNC flag with scif_readfrom */ scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark); @@ -76,8 +68,8 @@ int mca_btl_scif_get (struct mca_btl_base_module_t *btl, mca_btl_scif_component.get_time_max, ts); #endif - /* since we completed the fence the RMA operation is complete */ - mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS); + /* always call the callback function */ + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/scif/btl_scif_module.c b/opal/mca/btl/scif/btl_scif_module.c index 3b6ccfde04..1926efa86c 100644 --- a/opal/mca/btl/scif/btl_scif_module.c +++ b/opal/mca/btl/scif/btl_scif_module.c @@ -24,17 +24,14 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl, static int mca_btl_scif_module_finalize (struct mca_btl_base_module_t* btl); -static mca_btl_base_descriptor_t * -mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - opal_convertor_t *convertor, uint8_t order, - size_t reserve, size_t *size, uint32_t flags); +static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags); +static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle); static struct mca_btl_base_descriptor_t * mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl, struct 
mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags); @@ -48,11 +45,12 @@ mca_btl_scif_module_t mca_btl_scif_module = { .btl_alloc = mca_btl_scif_alloc, .btl_free = mca_btl_scif_free, .btl_prepare_src = mca_btl_scif_prepare_src, - .btl_prepare_dst = mca_btl_scif_prepare_dst, .btl_send = mca_btl_scif_send, .btl_sendi = mca_btl_scif_sendi, .btl_put = mca_btl_scif_put, .btl_get = mca_btl_scif_get, + .btl_register_mem = mca_btl_scif_register_mem, + .btl_deregister_mem = mca_btl_scif_deregister_mem, } }; @@ -163,10 +161,10 @@ mca_btl_scif_alloc(struct mca_btl_base_module_t *btl, frag->base.des_flags = flags; frag->base.order = order; - frag->base.des_segments = &frag->segments[0].base; + frag->base.des_segments = frag->segments; frag->base.des_segment_count = 1; - frag->segments[0].base.seg_len = size; + frag->segments[0].seg_len = size; return &frag->base; } @@ -178,16 +176,19 @@ mca_btl_scif_free (struct mca_btl_base_module_t *btl, return mca_btl_scif_frag_return ((mca_btl_scif_base_frag_t *) des); } -static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - void *data_ptr, size_t size, - mca_mpool_base_registration_t *registration, - uint8_t order, uint32_t flags) +static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags) { - mca_btl_scif_base_frag_t *frag; mca_btl_scif_reg_t *scif_reg; int rc; + if (MCA_BTL_ENDPOINT_ANY == endpoint) { + /* it probably isn't possible to support registering memory to use with any endpoint so + * return NULL */ + return NULL; + } + if (OPAL_LIKELY(MCA_BTL_SCIF_EP_STATE_CONNECTED != endpoint->state)) { /* the endpoint needs to be connected before the fragment can be * registered. 
*/ @@ -198,67 +199,36 @@ static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma (struct mca_bt } } - (void) MCA_BTL_SCIF_FRAG_ALLOC_DMA(endpoint, frag); - if (OPAL_UNLIKELY(NULL == frag)) { + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, + (mca_mpool_base_registration_t **) &scif_reg); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return NULL; } - if (NULL == registration) { - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, size, 0, - (mca_mpool_base_registration_t **) ®istration); - if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { - mca_btl_scif_frag_return (frag); - return NULL; - } - - frag->registration = (mca_btl_scif_reg_t *) registration; - } - - scif_reg = (mca_btl_scif_reg_t *) registration; - /* register the memory location with this peer if it isn't already */ - if ((off_t) -1 == scif_reg->registrations[endpoint->id]) { - size_t seg_size = (size_t)((uintptr_t) registration->bound - (uintptr_t) registration->base) + 1; - scif_reg->registrations[endpoint->id] = scif_register (endpoint->scif_epd, registration->base, - seg_size, 0, SCIF_PROT_READ | - SCIF_PROT_WRITE, 0); + if ((off_t) -1 == scif_reg->handles[endpoint->id].btl_handle.scif_offset) { + size_t seg_size = (size_t)((uintptr_t) scif_reg->base.bound - (uintptr_t) scif_reg->base.base) + 1; + + /* NTH: until we determine a way to pass permissions to the mpool just make all segments + * read/write */ + scif_reg->handles[endpoint->id].btl_handle.scif_offset = + scif_register (endpoint->scif_epd, scif_reg->base.base, seg_size, 0, SCIF_PROT_READ | + SCIF_PROT_WRITE, 0); BTL_VERBOSE(("registered fragment for scif DMA transaction. 
offset = %lu", - (unsigned long) scif_reg->registrations[endpoint->id])); + (unsigned long) scif_reg->handles[endpoint->id].btl_handle.scif_offset)); } - if (OPAL_UNLIKELY((off_t) -1 == scif_reg->registrations[endpoint->id])) { - mca_btl_scif_frag_return (frag); - return NULL; - } - - frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr; - frag->segments[0].base.seg_len = size; - frag->segments[0].scif_offset = scif_reg->registrations[endpoint->id] + - (off_t) ((ptrdiff_t) data_ptr - (ptrdiff_t) registration->base); - /* save the original pointer so the offset can be adjusted if needed (this is - * required for osc/rdma) */ - frag->segments[0].orig_ptr = (uint64_t)(uintptr_t) data_ptr; - frag->base.order = order; - frag->base.des_flags = flags; - - frag->base.des_segments = &frag->segments->base; - frag->base.des_segment_count = 1; - - return &frag->base; + return &scif_reg->handles[endpoint->id].btl_handle; } -static inline mca_btl_base_descriptor_t *mca_btl_scif_prepare_dma_conv (struct mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - struct opal_convertor_t *convertor, - uint8_t order, size_t *size, - uint32_t flags) +static int mca_btl_scif_deregister_mem (struct mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) { - void *data_ptr; + mca_btl_scif_registration_handle_t *scif_handle = (mca_btl_scif_registration_handle_t *) handle; + mca_btl_scif_reg_t *scif_reg = scif_handle->reg; - opal_convertor_get_current_pointer (convertor, &data_ptr); + btl->btl_mpool->mpool_deregister (btl->btl_mpool, &scif_reg->base); - return mca_btl_scif_prepare_dma (btl, endpoint, data_ptr, *size, registration, order, flags); + return OPAL_SUCCESS; } static inline struct mca_btl_base_descriptor_t * @@ -286,9 +256,9 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl, return NULL; } - frag->segments[0].base.seg_len = reserve; - frag->segments[1].base.seg_addr.pval = 
data_ptr; - frag->segments[1].base.seg_len = *size; + frag->segments[0].seg_len = reserve; + frag->segments[1].seg_addr.pval = data_ptr; + frag->segments[1].seg_len = *size; frag->base.des_segment_count = 2; } else { /* buffered send */ @@ -299,7 +269,7 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl, if (*size) { iov.iov_len = *size; - iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].base.seg_addr.pval + reserve); + iov.iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->segments[0].seg_addr.pval + reserve); rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_size); if (OPAL_UNLIKELY(rc < 0)) { @@ -309,37 +279,22 @@ mca_btl_scif_prepare_src_send (struct mca_btl_base_module_t *btl, *size = max_size; } - frag->segments[0].base.seg_len = reserve + *size; + frag->segments[0].seg_len = reserve + *size; frag->base.des_segment_count = 1; } - frag->base.des_segments = &frag->segments->base; - frag->base.order = order; - frag->base.des_flags = flags; + frag->base.des_segments = frag->segments; + frag->base.order = order; + frag->base.des_flags = flags; return &frag->base; } static mca_btl_base_descriptor_t *mca_btl_scif_prepare_src (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags) { - if (OPAL_LIKELY(reserve)) { - return mca_btl_scif_prepare_src_send (btl, endpoint, convertor, - order, reserve, size, flags); - } else { - return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, order, size, flags); - } -} - -static mca_btl_base_descriptor_t *mca_btl_scif_prepare_dst (mca_btl_base_module_t *btl, - mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, - opal_convertor_t *convertor, uint8_t order, - size_t reserve, size_t *size, uint32_t flags) -{ - return mca_btl_scif_prepare_dma_conv (btl, endpoint, registration, convertor, 
order, size, flags); + return mca_btl_scif_prepare_src_send (btl, endpoint, convertor, order, reserve, size, flags); } diff --git a/opal/mca/btl/scif/btl_scif_put.c b/opal/mca/btl/scif/btl_scif_put.c index 17c2733d96..27355a3e5c 100644 --- a/opal/mca/btl/scif/btl_scif_put.c +++ b/opal/mca/btl/scif/btl_scif_put.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2013-2014 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -16,63 +16,57 @@ /** * Initiate a put operation. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int mca_btl_scif_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) { - mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_segments; - mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_remote; - size_t len = lmin (src->base.seg_len, dst->base.seg_len); - int rc, mark, flags = 0; +int mca_btl_scif_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + int rc, mark, scif_flags = 0; off_t roffset, loffset; #if defined(SCIF_TIMING) struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); - mca_btl_scif_component.put_count++; + mca_btl_scif_component.put_count++; #endif - BTL_VERBOSE(("Using DMA Put for frag %p", (void *) des)); + BTL_VERBOSE(("Using DMA Put from local address %p to remote address %" PRIx64, + local_address, remote_address)); - roffset = dst->scif_offset + (off_t)(dst->orig_ptr -
dst->base.seg_addr.lval); - loffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval); + roffset = remote_handle->scif_offset + (off_t)(remote_address - remote_handle->scif_base); + loffset = local_handle->scif_offset + (off_t)((intptr_t) local_address - local_handle->scif_base); if (mca_btl_scif_component.rma_use_cpu) { - flags = SCIF_RMA_USECPU; + scif_flags = SCIF_RMA_USECPU; } if (mca_btl_scif_component.rma_sync) { - flags |= SCIF_RMA_SYNC; + scif_flags |= SCIF_RMA_SYNC; } /* start the write */ - rc = scif_writeto (endpoint->scif_epd, loffset, len, roffset, flags); + rc = scif_writeto (endpoint->scif_epd, loffset, size, roffset, scif_flags); if (OPAL_UNLIKELY(-1 == rc)) { return OPAL_ERROR; } - /* always call the callback function */ - des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - /* according to the scif documentation is is better to use a fence rather - * than using the SCIF_RMA_SYNC flag with scif_writeto */ - if (!(flags & SCIF_RMA_SYNC)) { + if (!(scif_flags & SCIF_RMA_SYNC)) { + /* according to the scif documentation is is better to use a fence rather + * than using the SCIF_RMA_SYNC flag with scif_writeto */ scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark); scif_fence_wait (endpoint->scif_epd, mark); } #if defined(SCIF_TIMING) - SCIF_UPDATE_TIMER(mca_btl_scif_component.put_time, - mca_btl_scif_component.put_time_max, ts); + SCIF_UPDATE_TIMER(mca_btl_scif_component.put_time, + mca_btl_scif_component.put_time_max, ts); #endif - /* since we completed the fence the RMA operation is complete */ - mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OPAL_SUCCESS); + /* always call the callback function */ + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/scif/btl_scif_send.c b/opal/mca/btl/scif/btl_scif_send.c index 3df0f1cc9c..15c3b7e6ea
100644 --- a/opal/mca/btl/scif/btl_scif_send.c +++ b/opal/mca/btl/scif/btl_scif_send.c @@ -118,22 +118,22 @@ static int mca_btl_scif_send_frag (struct mca_btl_base_endpoint_t *endpoint, unsigned char * restrict dst; BTL_VERBOSE(("btl/scif sending descriptor %p from %d -> %d. length = %" PRIu64, (void *) frag, - OPAL_PROC_MY_NAME.vpid, endpoint->peer_proc->proc_name.vpid, frag->segments[0].base.seg_len)); + opal_process_name_vpid(OPAL_PROC_MY_NAME), opal_process_name_vpid(endpoint->peer_proc->proc_name), frag->segments[0].seg_len)); if (OPAL_LIKELY(OPAL_SUCCESS == mca_btl_scif_send_get_buffer (endpoint, size, &dst))) { - unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].base.seg_addr.pval; + unsigned char * restrict data = (unsigned char * restrict) frag->segments[0].seg_addr.pval; #if defined(SCIF_TIMING) struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); #endif - memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].base.seg_len); + memcpy (dst + sizeof (frag->hdr), data, frag->segments[0].seg_len); - if (frag->segments[1].base.seg_len) { - memcpy (dst + sizeof (frag->hdr) + frag->segments[0].base.seg_len, - frag->segments[1].base.seg_addr.pval, - frag->segments[1].base.seg_len); + if (frag->segments[1].seg_len) { + memcpy (dst + sizeof (frag->hdr) + frag->segments[0].seg_len, + frag->segments[1].seg_addr.pval, + frag->segments[1].seg_len); } #if defined(SCIF_USE_SEQ) @@ -165,7 +165,7 @@ int mca_btl_scif_send (struct mca_btl_base_module_t *btl, mca_btl_base_tag_t tag) { mca_btl_scif_base_frag_t *frag = (mca_btl_scif_base_frag_t *) descriptor; - size_t size = frag->segments[0].base.seg_len + frag->segments[1].base.seg_len; + size_t size = frag->segments[0].seg_len + frag->segments[1].seg_len; int rc; frag->hdr.tag = tag; diff --git a/opal/mca/btl/self/btl_self.c b/opal/mca/btl/self/btl_self.c index d0a6d76c1e..26f2a88f8e 100644 --- a/opal/mca/btl/self/btl_self.c +++ b/opal/mca/btl/self/btl_self.c @@ -38,17 +38,15 @@ 
#include "btl_self_frag.h" #include "opal/util/proc.h" -int mca_btl_self_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); +static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); -int mca_btl_self_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); +static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); mca_btl_base_module_t mca_btl_self = { .btl_component = &mca_btl_self_component.super, @@ -176,7 +174,6 @@ int mca_btl_self_free( struct mca_btl_base_module_t* btl, struct mca_btl_base_descriptor_t* mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -268,11 +265,10 @@ int mca_btl_self_send( struct 
mca_btl_base_module_t* btl, } -int mca_btl_self_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { memcpy ((void *)(intptr_t) remote_address, local_address, size); @@ -281,11 +277,10 @@ int mca_btl_self_put (struct mca_btl_base_module_t *btl, return OPAL_SUCCESS; } -int mca_btl_self_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { memcpy (local_address, (void *)(intptr_t) remote_address, size); diff --git a/opal/mca/btl/self/btl_self.h b/opal/mca/btl/self/btl_self.h index d738e82afb..1402a2d290 100644 --- a/opal/mca/btl/self/btl_self.h +++ b/opal/mca/btl/self/btl_self.h @@ -165,24 +165,6 @@ int mca_btl_self_free( struct mca_btl_base_descriptor_t* 
mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags -); - -/** - * Prepare data for RDMA - * - * @param btl (IN) BTL module - * @param peer (IN) BTL peer addressing - */ -struct mca_btl_base_descriptor_t* mca_btl_self_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, diff --git a/opal/mca/btl/sm/btl_sm.c b/opal/mca/btl/sm/btl_sm.c index bf43402f5d..ee456e1024 100644 --- a/opal/mca/btl/sm/btl_sm.c +++ b/opal/mca/btl/sm/btl_sm.c @@ -743,7 +743,6 @@ extern int mca_btl_sm_free( struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -999,44 +998,77 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl, } #if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA -struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) +mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + void *base, size_t size, uint32_t flags) { - void *ptr; - mca_btl_sm_frag_t* frag; + mca_btl_sm_registration_handle_t *handle = NULL; - MCA_BTL_SM_FRAG_ALLOC_USER(frag); - if(OPAL_UNLIKELY(NULL == frag)) { + OMPI_FREE_LIST_GET_MT(&mca_btl_sm_component.registration_handles, &handle); + if (OPAL_UNLIKELY(NULL == handle)) 
{
        return NULL;
    }
 
-    frag->segment.base.seg_len = *size;
-    opal_convertor_get_current_pointer( convertor, &ptr );
-    frag->segment.base.seg_addr.lval = (uint64_t)(uintptr_t) ptr;
+#if OPAL_BTL_SM_HAVE_KNEM
+    if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
+        struct knem_cmd_create_region knem_cr;
+        struct knem_cmd_param_iovec knem_iov;
+
+        /* knem regions are page-granular: round the base DOWN to a page
+         * boundary (note the ~) and the length up to cover the region */
+        knem_iov.base = (uintptr_t)base & ~(opal_getpagesize() - 1);
+        knem_iov.len = OPAL_ALIGN(size + ((intptr_t) base - knem_iov.base), opal_getpagesize());
+        knem_cr.iovec_array = (uintptr_t)&knem_iov;
+        knem_cr.iovec_nr = 1;
+        knem_cr.flags = 0;
+        knem_cr.protection = 0;
 
-    frag->base.des_segments = (mca_btl_base_segment_t*)&frag->segment;
-    frag->base.des_segment_count = 1;
-    frag->base.des_flags = flags;
-    return &frag->base;
+        if (flags & MCA_BTL_REG_FLAG_REMOTE_READ) {
+            knem_cr.protection |= PROT_READ;
+        }
+        if (flags & MCA_BTL_REG_FLAG_REMOTE_WRITE) {
+            knem_cr.protection |= PROT_WRITE;
+        }
+
+        if (OPAL_UNLIKELY(ioctl(mca_btl_sm.knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) {
+            return NULL;
+        }
+
+        handle->btl_handle.data.knem.cookie = knem_cr.cookie;
+        handle->btl_handle.data.knem.base_addr = knem_iov.base;
+    } else
+#endif
+    {
+        /* the pid could be included in a modex but this will work until btl/sm is
+         * deleted */
+        handle->btl_handle.data.pid = getpid ();
+    }
+
+    /* return the public part of the handle */
+    return &handle->btl_handle;
 }
 
+void mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle)
+{
+    mca_btl_sm_registration_handle_t *sm_handle =
+        (mca_btl_sm_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_sm_registration_handle_t, btl_handle));
+
+#if OPAL_BTL_SM_HAVE_KNEM
+    if (OPAL_LIKELY(mca_btl_sm_component.use_knem)) {
+        (void) ioctl(mca_btl_sm.knem_fd, KNEM_CMD_DESTROY_REGION, &handle->data.knem.cookie);
+    }
+#endif
+
+    OMPI_FREE_LIST_RETURN_MT(&mca_btl_sm_component.registration_handles, &sm_handle->super);
+}
+#endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */
+
+#if OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA
+
 /**
 * Initiate
an synchronous get.
- *
- * @param btl (IN)         BTL module
- * @param endpoint (IN)    BTL addressing information
- * @param descriptor (IN)  Description of the data to be transferred
 */
-int mca_btl_sm_get_sync(struct mca_btl_base_module_t* btl,
-                        struct mca_btl_base_endpoint_t* endpoint,
-                        struct mca_btl_base_descriptor_t* des)
+int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
+                         uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                         mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                         int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
 {
-    int btl_ownership;
-    mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
@@ -1050,12 +1082,12 @@
        /* Fill in the ioctl data fields.  There's no async completion, so
           we don't need to worry about getting a slot, etc. */
-        recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval;
-        recv_iovec.len =  dst->base.seg_len;
+        recv_iovec.base = (uintptr_t) local_address;
+        recv_iovec.len =  size;
         icopy.local_iovec_array = (uintptr_t)&recv_iovec;
         icopy.local_iovec_nr = 1;
-        icopy.remote_cookie = src->key;
-        icopy.remote_offset = 0;
+        icopy.remote_cookie = remote_handle->data.knem.cookie;
+        icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr;
         icopy.write = 0;
 
        /* Use the DMA flag if knem supports it *and* the segment length
           value is 0 (i.e., the MCA param was set to 0), the segment size
           will never be larger than it, so DMA will never be used.
 */
        icopy.flags = 0;
-        if (mca_btl_sm_component.knem_dma_min <= dst->base.seg_len) {
+        if (mca_btl_sm_component.knem_dma_min <= size) {
            icopy.flags = mca_btl_sm_component.knem_dma_flag;
        }
        /* synchronous flags only, no need to specify icopy.async_status_index */
@@ -1081,23 +1113,18 @@
 #if OPAL_BTL_SM_HAVE_CMA
    if (OPAL_LIKELY(mca_btl_sm_component.use_cma)) {
-        char *remote_address, *local_address;
-        int remote_length, local_length;
        struct iovec local, remote;
        pid_t remote_pid;
        int val;
 
-        remote_address = (char *)(uintptr_t) src->base.seg_addr.lval;
-        remote_length = src->base.seg_len;
-        local_address = (char *)(uintptr_t) dst->base.seg_addr.lval;
-        local_length = dst->base.seg_len;
-        remote_pid = src->key;
+        remote_pid = remote_handle->data.pid;
 
-        remote.iov_base = remote_address;
-        remote.iov_len = remote_length;
-        local.iov_base = local_address;
-        local.iov_len = local_length;
+        /* remote_address is now a uint64_t parameter; cast for iov_base */
+        remote.iov_base = (void *)(uintptr_t) remote_address;
+        remote.iov_len = size;
+        local.iov_base = local_address;
+        local.iov_len = size;
 
        val = process_vm_readv(remote_pid, &local, 1,
                               &remote, 1, 0);
@@ -1115,15 +1142,7 @@
    }
 #endif /* OPAL_BTL_SM_HAVE_CMA */
 
-    btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
-    if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) {
-        frag->base.des_cbfunc(&mca_btl_sm.super,
-                              frag->endpoint, &frag->base,
-                              OPAL_SUCCESS);
-    }
-    if (btl_ownership) {
-        MCA_BTL_SM_FRAG_RETURN(frag);
-    }
+    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);
 
    return OPAL_SUCCESS;
 }
@@ -1135,33 +1154,44 @@
 
 /**
 * Initiate an asynchronous get.
- *
- * @param btl (IN)         BTL module
- * @param endpoint (IN)    BTL addressing information
- * @param descriptor (IN)  Description of the data to be transferred
 */
-int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl,
-                         struct mca_btl_base_endpoint_t* endpoint,
-                         struct mca_btl_base_descriptor_t* des)
+int mca_btl_sm_get_async (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
+                          uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
+                          mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
+                          int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
 {
-    int btl_ownership;
     mca_btl_sm_t* sm_btl = (mca_btl_sm_t*) btl;
-    mca_btl_sm_frag_t* frag = (mca_btl_sm_frag_t*)des;
-    mca_btl_sm_segment_t *src = (mca_btl_sm_segment_t*)des->des_remote;
-    mca_btl_sm_segment_t *dst = (mca_btl_sm_segment_t*)des->des_segments;
+    mca_btl_sm_frag_t* frag;
    struct knem_cmd_inline_copy icopy;
    struct knem_cmd_param_iovec recv_iovec;
 
-    /* If we have no knem slots available, return
-       TEMP_OUT_OF_RESOURCE */
+    /* If we have no knem slots available, fall back to synchronous */
    if (sm_btl->knem_status_num_used >=
        mca_btl_sm_component.knem_max_simultaneous) {
-        return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
+        return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
+                                    remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
    }
 
+    /* allocate a fragment to keep track of this transaction */
+    MCA_BTL_SM_FRAG_ALLOC_USER(frag);
+    if (OPAL_UNLIKELY(NULL == frag)) {
+        return mca_btl_sm_get_sync (btl, endpoint, local_address, remote_address, local_handle,
+                                    remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
+    }
+
+    /* fill in callback data */
+    frag->cb.func = cbfunc;
+    frag->cb.context = cbcontext;
+    frag->cb.data = cbdata;
+    frag->cb.local_address = local_address;
+    frag->cb.local_handle = local_handle;
+
    /* We have a slot, so fill in the data fields.
Bump the first_avail and num_used counters. */ - recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval; + recv_iovec.base = (uintptr_t) local_address; recv_iovec.len = dst->base.seg_len; icopy.local_iovec_array = (uintptr_t)&recv_iovec; icopy.local_iovec_nr = 1; @@ -1172,8 +1202,8 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl, sm_btl->knem_status_first_avail = 0; } ++sm_btl->knem_status_num_used; - icopy.remote_cookie = src->key; - icopy.remote_offset = 0; + icopy.remote_cookie = remote_handle->data.knem.cookie; + icopy.remote_offset = remote_address - remote_handle->data.knem.base_addr; /* Use the DMA flag if knem supports it *and* the segment length is greater than the cutoff */ @@ -1186,19 +1216,11 @@ int mca_btl_sm_get_async(struct mca_btl_base_module_t* btl, if (OPAL_LIKELY(0 == ioctl(sm_btl->knem_fd, KNEM_CMD_INLINE_COPY, &icopy))) { if (icopy.current_status != KNEM_STATUS_PENDING) { + MCA_BTL_SM_FRAG_RETURN(frag); /* request completed synchronously */ /* FIXME: what if icopy.current_status == KNEM_STATUS_FAILED? */ - - btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags)) { - frag->base.des_cbfunc(&mca_btl_sm.super, - frag->endpoint, &frag->base, - OPAL_SUCCESS); - } - if (btl_ownership) { - MCA_BTL_SM_FRAG_RETURN(frag); - } + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); --sm_btl->knem_status_num_used; ++sm_btl->knem_status_first_used; diff --git a/opal/mca/btl/sm/btl_sm.h b/opal/mca/btl/sm/btl_sm.h index fd7271fb3e..885e1c84cd 100644 --- a/opal/mca/btl/sm/btl_sm.h +++ b/opal/mca/btl/sm/btl_sm.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. 
* Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2013 Los Alamos National Security, LLC. + * Copyright (c) 2010-2014 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -182,6 +183,8 @@ struct mca_btl_sm_component_t { #if OPAL_BTL_SM_HAVE_KNEM /* Knem capabilities info */ struct knem_cmd_info knem_info; + /** registration handles to hold knem cookies */ + ompi_free_list_t registration_handles; #endif /* OPAL_BTL_SM_HAVE_KNEM */ /** MCA: should we be using knem or not? neg=try but continue if @@ -461,7 +464,6 @@ extern int mca_btl_sm_free( struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -504,30 +506,20 @@ extern int mca_btl_sm_send( /* * Synchronous knem/cma get */ -extern int mca_btl_sm_get_sync( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des ); - -extern struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); +int mca_btl_sm_get_sync (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif /* OPAL_BTL_SM_HAVE_KNEM || OPAL_BTL_SM_HAVE_CMA */ #if OPAL_BTL_SM_HAVE_KNEM /* * Asynchronous knem get */ -extern int mca_btl_sm_get_async( - struct 
mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_btl_base_descriptor_t* des ); +int mca_btl_sm_get_async (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif /* OPAL_BTL_SM_HAVE_KNEM */ @@ -558,6 +550,32 @@ void mca_btl_sm_component_event_thread(opal_object_t*); #define MCA_BTL_SM_SIGNAL_PEER(peer) #endif +#if OPAL_BTL_SM_HAVE_KNEM | OPAL_BTL_SM_HAVE_CMA +struct mca_btl_base_registration_handle_t { + union { + struct { + uint64_t cookie; + intptr_t base_addr; + } knem; + pid_t pid; + } data; +}; + +struct mca_btl_sm_registration_handle_t { + ompi_free_list_item_t super; + mca_btl_base_registration_handle_t btl_handle; +}; +typedef struct mca_btl_sm_registration_handle_t mca_btl_sm_registration_handle_t; +OBJ_CLASS_DECLARATION(mca_btl_sm_registration_handle_t); + +mca_btl_base_registration_handle_t *mca_btl_sm_register_mem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + void *base, size_t size, uint32_t flags); + +void mca_btl_sm_deregister_mem (struct mca_btl_base_module_t* btl, mca_btl_base_registration_handle_t *handle); + +#endif + END_C_DECLS #endif diff --git a/opal/mca/btl/sm/btl_sm_component.c b/opal/mca/btl/sm/btl_sm_component.c index 8beff93e0b..c58c4b48ce 100644 --- a/opal/mca/btl/sm/btl_sm_component.c +++ b/opal/mca/btl/sm/btl_sm_component.c @@ -254,6 +254,10 @@ static int sm_register(void) mca_btl_sm.super.btl_bandwidth = 9000; /* Mbs */ mca_btl_sm.super.btl_latency = 1; /* Microsecs */ +#if OPAL_BTL_SM_HAVE_KNEM + mca_btl_sm.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); +#endif + /* Call the BTL based to register its MCA params */ 
    mca_btl_base_param_register(&mca_btl_sm_component.super.btl_version,
                                &mca_btl_sm.super);
@@ -295,6 +299,8 @@ static int mca_btl_sm_component_open(void)
     mca_btl_sm_component.sm_seg = NULL;
 
 #if OPAL_BTL_SM_HAVE_KNEM
+    OBJ_CONSTRUCT(&mca_btl_sm_component.registration_handles, ompi_free_list_t);
+
     mca_btl_sm.knem_fd = -1;
     mca_btl_sm.knem_status_array = NULL;
     mca_btl_sm.knem_frag_array = NULL;
@@ -329,6 +335,8 @@ static int mca_btl_sm_component_close(void)
         close(mca_btl_sm.knem_fd);
         mca_btl_sm.knem_fd = -1;
     }
+
+    OBJ_DESTRUCT(&mca_btl_sm_component.registration_handles);
 #endif /* OPAL_BTL_SM_HAVE_KNEM */
 
     OBJ_DESTRUCT(&mca_btl_sm_component.sm_lock);
@@ -903,6 +911,9 @@ mca_btl_sm_component_init(int *num_btls,
             } else {
                 mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
             }
+
+            mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
+            mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
         }
 #else
         /* If the user explicitly asked for knem and we can't provide it,
@@ -917,6 +928,8 @@ mca_btl_sm_component_init(int *num_btls,
             /* Will only ever have either cma or knem enabled at runtime
                so no problems with accidentally overwriting this set earlier */
             mca_btl_sm.super.btl_get = mca_btl_sm_get_sync;
+            mca_btl_sm.super.btl_register_mem = mca_btl_sm_register_mem;
+            mca_btl_sm.super.btl_deregister_mem = mca_btl_sm_deregister_mem;
         }
 #else
         /* If the user explicitly asked for CMA and we can't provide itm
@@ -1175,22 +1188,14 @@ int mca_btl_sm_component_progress(void)
            mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
            if (KNEM_STATUS_SUCCESS ==
                mca_btl_sm.knem_status_array[mca_btl_sm.knem_status_first_used]) {
-                int btl_ownership;
                /* Handle the completed fragment */
                frag =
                    mca_btl_sm.knem_frag_array[mca_btl_sm.knem_status_first_used];
-                btl_ownership = (frag->base.des_flags &
-                                 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
-                if (0 != (MCA_BTL_DES_SEND_ALWAYS_CALLBACK &
-                          frag->base.des_flags)) {
-                    frag->base.des_cbfunc(&mca_btl_sm.super,
-                                          frag->endpoint, &frag->base,
-                                          OPAL_SUCCESS);
-                }
-                if (btl_ownership) {
-                    MCA_BTL_SM_FRAG_RETURN(frag);
-                }
+                frag->cb.func (&mca_btl_sm.super, frag->endpoint,
+                               frag->cb.local_address, frag->cb.local_handle,
+                               frag->cb.context, frag->cb.data, OPAL_SUCCESS);
+                MCA_BTL_SM_FRAG_RETURN(frag);
 
                /* Bump counters, loop around the circular buffer if
                   necessary */
diff --git a/opal/mca/btl/sm/btl_sm_endpoint.h b/opal/mca/btl/sm/btl_sm_endpoint.h
index 5e32510a67..04708dc856 100644
--- a/opal/mca/btl/sm/btl_sm_endpoint.h
+++ b/opal/mca/btl/sm/btl_sm_endpoint.h
@@ -10,6 +10,8 @@
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2007 Voltaire. All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
diff --git a/opal/mca/btl/sm/btl_sm_frag.h b/opal/mca/btl/sm/btl_sm_frag.h
index 3dde48c802..424de6a7fb 100644
--- a/opal/mca/btl/sm/btl_sm_frag.h
+++ b/opal/mca/btl/sm/btl_sm_frag.h
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
@@ -11,6 +12,8 @@
 *                         All rights reserved.
 * Copyright (c) 2008      Sun Microsystems, Inc.  All rights reserved.
 * Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014      Los Alamos National Security, LLC. All rights
+ *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -64,6 +67,16 @@ struct mca_btl_sm_frag_t {
    /* pointer written to the FIFO, this is the base of the shared memory region */
    mca_btl_sm_hdr_t *hdr;
    ompi_free_list_t* my_list;
+#if OPAL_BTL_SM_HAVE_KNEM
+    /* rdma callback data.
required for async get */ + struct { + mca_btl_base_rdma_completion_fn_t func; + void *local_address; + struct mca_btl_base_registration_handle_t *local_handle; + void *context; + void *data; + } cb; +#endif }; typedef struct mca_btl_sm_frag_t mca_btl_sm_frag_t; typedef struct mca_btl_sm_frag_t mca_btl_sm_frag1_t; diff --git a/opal/mca/btl/tcp/btl_tcp.c b/opal/mca/btl/tcp/btl_tcp.c index c2ac2f9dc9..9541451619 100644 --- a/opal/mca/btl/tcp/btl_tcp.c +++ b/opal/mca/btl/tcp/btl_tcp.c @@ -42,7 +42,6 @@ mca_btl_tcp_module_t mca_btl_tcp_module = { .btl_alloc = mca_btl_tcp_alloc, .btl_free = mca_btl_tcp_free, .btl_prepare_src = mca_btl_tcp_prepare_src, - .btl_prepare_dst = mca_btl_tcp_prepare_dst, .btl_send = mca_btl_tcp_send, .btl_put = mca_btl_tcp_put, .btl_dump = mca_btl_base_dump, @@ -202,7 +201,6 @@ int mca_btl_tcp_free( mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -272,62 +270,12 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( } frag->base.des_segments = frag->segments; - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; frag->base.des_flags = flags; frag->base.order = MCA_BTL_NO_ORDER; *size = max_data; return &frag->base; } - -/** - * Prepare a descriptor for send/rdma using the supplied - * convertor. If the convertor references data that is contigous, - * the descriptor may simply point to the user buffer. Otherwise, - * this routine is responsible for allocating buffer space and - * packing if required. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL peer addressing - * @param convertor (IN) Data type convertor - * @param reserve (IN) Additional bytes requested by upper layer to precede user data - * @param size (IN/OUT) Number of bytes to prepare (IN), number of bytes actually prepared (OUT) - */ - -mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, - struct mca_mpool_base_registration_t* registration, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags) -{ - mca_btl_tcp_frag_t* frag; - - if( OPAL_UNLIKELY((*size) > UINT32_MAX) ) { /* limit the size to what we support */ - *size = (size_t)UINT32_MAX; - } - MCA_BTL_TCP_FRAG_ALLOC_USER(frag); - if( OPAL_UNLIKELY(NULL == frag) ) { - return NULL; - } - - frag->segments->seg_len = *size; - opal_convertor_get_current_pointer( convertor, (void**)&(frag->segments->seg_addr.pval) ); - - frag->base.des_remote = NULL; - frag->base.des_remote_count = 0; - frag->base.des_segments = frag->segments; - frag->base.des_segment_count = 1; - frag->base.des_flags = flags; - frag->base.order = MCA_BTL_NO_ORDER; - return &frag->base; -} - - /** * Initiate an asynchronous send. * @@ -368,23 +316,55 @@ int mca_btl_tcp_send( struct mca_btl_base_module_t* btl, return mca_btl_tcp_endpoint_send(endpoint,frag); } +static void fake_rdma_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_base_descriptor_t *desc, int rc) +{ + mca_btl_tcp_frag_t *frag = (mca_btl_tcp_frag_t *) desc; + + frag->cb.func (btl, endpoint, frag->segments[0].seg_addr.pval, NULL, frag->cb.context, frag->cb.data, + rc); +} /** * Initiate an asynchronous put. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -int mca_btl_tcp_put( mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* endpoint, - mca_btl_base_descriptor_t* descriptor ) +int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; - mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor; + mca_btl_tcp_frag_t *frag = NULL; int i; + MCA_BTL_TCP_FRAG_ALLOC_USER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + frag->endpoint = endpoint; + + frag->segments->seg_len = size; + frag->segments->seg_addr.pval = local_address; + + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; + frag->base.order = MCA_BTL_NO_ORDER; + + frag->segments[0].seg_addr.pval = local_address; + frag->segments[0].seg_len = size; + + frag->segments[1].seg_addr.lval = remote_address; + frag->segments[1].seg_len = size; + + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + frag->base.des_cbfunc = fake_rdma_complete; + + frag->cb.func = cbfunc; + frag->cb.data = cbdata; + frag->cb.context = cbcontext; + frag->btl = tcp_btl; frag->endpoint = endpoint; frag->rc = 0; @@ -394,8 +374,8 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl, frag->iov_ptr = frag->iov; frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; frag->iov[0].iov_len = sizeof(frag->hdr); - frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote; - frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t); + frag->iov[1].iov_base = 
(IOVBASE_TYPE*) (frag->segments + 1); + frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t); for( i = 0; i < (int)frag->base.des_segment_count; i++ ) { frag->hdr.size += frag->segments[i].seg_len; frag->iov[i+2].iov_len = frag->segments[i].seg_len; @@ -404,7 +384,7 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl, } frag->hdr.base.tag = MCA_BTL_TAG_BTL; frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT; - frag->hdr.count = frag->base.des_remote_count; + frag->hdr.count = 1; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : i); } @@ -412,22 +392,46 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl, /** * Initiate an asynchronous get. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - * */ -int mca_btl_tcp_get( - mca_btl_base_module_t* btl, - mca_btl_base_endpoint_t* endpoint, - mca_btl_base_descriptor_t* descriptor) +int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; - mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor; + mca_btl_tcp_frag_t* frag = NULL; int rc; + MCA_BTL_TCP_FRAG_ALLOC_USER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + frag->endpoint = endpoint; + + frag->segments->seg_len = size; + frag->segments->seg_addr.pval = local_address; + + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; + frag->base.order = MCA_BTL_NO_ORDER; + + frag->segments[0].seg_addr.pval = local_address; + frag->segments[0].seg_len = size; + + 
frag->segments[1].seg_addr.lval = remote_address; + frag->segments[1].seg_len = size; + + /* call the rdma callback through the descriptor callback. this is + * tcp so the extra latency is not an issue */ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + frag->base.des_cbfunc = fake_rdma_complete; + + frag->cb.func = cbfunc; + frag->cb.data = cbdata; + frag->cb.context = cbcontext; + frag->btl = tcp_btl; frag->endpoint = endpoint; frag->rc = 0; @@ -437,11 +441,11 @@ int mca_btl_tcp_get( frag->iov_ptr = frag->iov; frag->iov[0].iov_base = (IOVBASE_TYPE*)&frag->hdr; frag->iov[0].iov_len = sizeof(frag->hdr); - frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->base.des_remote; - frag->iov[1].iov_len = frag->base.des_remote_count * sizeof(mca_btl_base_segment_t); + frag->iov[1].iov_base = (IOVBASE_TYPE*) &frag->segments[1]; + frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t); frag->hdr.base.tag = MCA_BTL_TAG_BTL; frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET; - frag->hdr.count = frag->base.des_remote_count; + frag->hdr.count = 1; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OPAL_SUCCESS : rc); } diff --git a/opal/mca/btl/tcp/btl_tcp.h b/opal/mca/btl/tcp/btl_tcp.h index b4daef4cd3..845d188eb8 100644 --- a/opal/mca/btl/tcp/btl_tcp.h +++ b/opal/mca/btl/tcp/btl_tcp.h @@ -217,32 +217,22 @@ extern int mca_btl_tcp_send( /** * Initiate an asynchronous put. 
- * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -extern int mca_btl_tcp_put( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* decriptor -); +int mca_btl_tcp_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Initiate an asynchronous get. - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred */ -extern int mca_btl_tcp_get( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* decriptor -); +int mca_btl_tcp_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); /** * Allocate a descriptor with a segment of the requested size. 
@@ -290,7 +280,6 @@ extern int mca_btl_tcp_free( mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* peer, - struct mca_mpool_base_registration_t*, struct opal_convertor_t* convertor, uint8_t order, size_t reserve, @@ -298,16 +287,6 @@ mca_btl_base_descriptor_t* mca_btl_tcp_prepare_src( uint32_t flags ); -extern mca_btl_base_descriptor_t* mca_btl_tcp_prepare_dst( - struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* peer, - struct mca_mpool_base_registration_t*, - struct opal_convertor_t* convertor, - uint8_t order, - size_t reserve, - size_t* size, - uint32_t flags); - /** * Fault Tolerance Event Notification Function diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index 84ab602a02..831fc9429d 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -287,7 +287,7 @@ static int mca_btl_tcp_component_register(void) MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; - mca_btl_tcp_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t); + mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100; diff --git a/opal/mca/btl/tcp/btl_tcp_frag.h b/opal/mca/btl/tcp/btl_tcp_frag.h index cbe00f9a49..f3c4243bf9 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.h +++ b/opal/mca/btl/tcp/btl_tcp_frag.h @@ -58,6 +58,12 @@ struct mca_btl_tcp_frag_t { size_t size; int rc; ompi_free_list_t* my_list; + /* fake rdma completion */ + struct { + mca_btl_base_rdma_completion_fn_t func; + void *data; + void *context; + } cb; }; typedef struct mca_btl_tcp_frag_t mca_btl_tcp_frag_t; OBJ_CLASS_DECLARATION(mca_btl_tcp_frag_t); @@ -116,8 +122,6 @@ do { \ frag->iov_cnt = 1; \ frag->iov_idx = 0; \ frag->iov_ptr = frag->iov; \ - frag->base.des_remote = NULL; \ - frag->base.des_remote_count = 0; \ frag->base.des_segments = frag->segments; \ frag->base.des_segment_count = 1; \ } 
while(0) diff --git a/opal/mca/btl/ugni/btl_ugni.h b/opal/mca/btl/ugni/btl_ugni.h index 468213585b..059c702967 100644 --- a/opal/mca/btl/ugni/btl_ugni.h +++ b/opal/mca/btl/ugni/btl_ugni.h @@ -264,39 +264,15 @@ mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, uint32_t flags, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor); -/** - * Initiate a get operation. - * - * location: btl_ugni_get.c - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - */ -int mca_btl_ugni_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, - size_t size, int flags, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata); +int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); -/** - * Initiate a put operation. 
- * - * location: btl_ugni_put.c - * - * @param btl (IN) BTL module - * @param endpoint (IN) BTL addressing information - * @param descriptor (IN) Description of the data to be transferred - */ -int mca_btl_ugni_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, - size_t size, int flags, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata); +int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); int mca_btl_ugni_progress_send_wait_list (struct mca_btl_base_endpoint_t *endpoint); diff --git a/opal/mca/btl/ugni/btl_ugni_get.c b/opal/mca/btl/ugni/btl_ugni_get.c index 8788c7c2c1..a11ef32b2d 100644 --- a/opal/mca/btl/ugni/btl_ugni_get.c +++ b/opal/mca/btl/ugni/btl_ugni_get.c @@ -13,13 +13,10 @@ #include "btl_ugni_rdma.h" #include "btl_ugni_smsg.h" -int mca_btl_ugni_get (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, - size_t size, int flags, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata) +int mca_btl_ugni_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { 
bool check; @@ -40,7 +37,7 @@ int mca_btl_ugni_get (struct mca_btl_base_module_t *btl, (void) mca_btl_ugni_check_endpoint_state(endpoint); return mca_btl_ugni_post (endpoint, true, size, local_address, remote_address, local_handle, - remote_handle, cbfunc, cbcontext, cbdata); + remote_handle, order, cbfunc, cbcontext, cbdata); } /* eager get */ @@ -171,7 +168,7 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *endpoint, /* start the get */ rc = mca_btl_ugni_post (endpoint, true, size, frag->base.super.ptr, hdr.eager.address, &frag->memory_handle, &hdr.eager.memory_handle, - mca_btl_ugni_callback_eager_get, frag, NULL); + MCA_BTL_NO_ORDER, mca_btl_ugni_callback_eager_get, frag, NULL); if (OPAL_UNLIKELY(OPAL_SUCCESS == rc)) { return OPAL_SUCCESS; } diff --git a/opal/mca/btl/ugni/btl_ugni_module.c b/opal/mca/btl/ugni/btl_ugni_module.c index 9227efebf0..93317ad050 100644 --- a/opal/mca/btl/ugni/btl_ugni_module.c +++ b/opal/mca/btl/ugni/btl_ugni_module.c @@ -30,7 +30,6 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t* btl); static struct mca_btl_base_descriptor_t * mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags); @@ -271,7 +270,6 @@ mca_btl_ugni_free (struct mca_btl_base_module_t *btl, static struct mca_btl_base_descriptor_t * mca_btl_ugni_prepare_src (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags) diff --git a/opal/mca/btl/ugni/btl_ugni_prepare.h b/opal/mca/btl/ugni/btl_ugni_prepare.h index a6ef600a76..0a018a4d8d 100644 --- a/opal/mca/btl/ugni/btl_ugni_prepare.h +++ b/opal/mca/btl/ugni/btl_ugni_prepare.h @@ -41,7 +41,7 @@ mca_btl_ugni_prepare_src_send_nodata (struct 
mca_btl_base_module_t *btl, frag->segments[1].seg_addr.pval = NULL; frag->segments[1].seg_len = 0; - frag->base.des_segments = &frag->segments; + frag->base.des_segments = frag->segments; frag->base.des_segment_count = 1; frag->base.order = order; frag->base.des_flags = flags; @@ -98,7 +98,7 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl, frag->segments[1].seg_addr.pval = data_ptr; frag->segments[1].seg_len = *size; - frag->base.des_segments = &frag->segments; + frag->base.des_segments = frag->segments; frag->base.des_segment_count = 2; frag->base.order = order; frag->base.des_flags = flags; @@ -159,7 +159,7 @@ mca_btl_ugni_prepare_src_send_buffered (struct mca_btl_base_module_t *btl, frag->segments[1].seg_addr.pval = frag->base.super.ptr; frag->segments[1].seg_len = *size; - frag->base.des_segments = &frag->segments; + frag->base.des_segments = frag->segments; frag->base.des_segment_count = 2; frag->base.order = order; frag->base.des_flags = flags; diff --git a/opal/mca/btl/ugni/btl_ugni_put.c b/opal/mca/btl/ugni/btl_ugni_put.c index 8404b36cef..6b6b7da923 100644 --- a/opal/mca/btl/ugni/btl_ugni_put.c +++ b/opal/mca/btl/ugni/btl_ugni_put.c @@ -14,13 +14,10 @@ #include "btl_ugni_rdma.h" -int mca_btl_ugni_put (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, - size_t size, int flags, mca_btl_base_rdma_completion_fn_t cbfunc, - void *cbcontext, void *cbdata) +int mca_btl_ugni_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { BTL_VERBOSE(("Using RDMA/FMA Put from 
local address %p to remote address %" PRIx64, local_address, remote_address)); @@ -29,5 +26,5 @@ int mca_btl_ugni_put (struct mca_btl_base_module_t *btl, (void) mca_btl_ugni_check_endpoint_state(endpoint); return mca_btl_ugni_post (endpoint, false, size, local_address, remote_address, local_handle, - remote_handle, cbfunc, cbcontext, cbdata); + remote_handle, order, cbfunc, cbcontext, cbdata); } diff --git a/opal/mca/btl/ugni/btl_ugni_rdma.h b/opal/mca/btl/ugni/btl_ugni_rdma.h index 1c24ffcc7a..3fd8ff43d7 100644 --- a/opal/mca/btl/ugni/btl_ugni_rdma.h +++ b/opal/mca/btl/ugni/btl_ugni_rdma.h @@ -21,7 +21,7 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep, mca_btl_ugni_base_frag_t *frag); static inline void init_gni_post_desc (opal_common_ugni_post_desc_t *post_desc, - gni_post_type_t op_type, + int order, gni_post_type_t op_type, uint64_t lcl_addr, gni_mem_handle_t lcl_mdh, uint64_t rem_addr, @@ -30,7 +30,11 @@ static inline void init_gni_post_desc (opal_common_ugni_post_desc_t *post_desc, gni_cq_handle_t cq_hndl) { post_desc->base.type = op_type; post_desc->base.cq_mode = GNI_CQMODE_GLOBAL_EVENT; - post_desc->base.dlvr_mode = GNI_DLVMODE_PERFORMANCE; + if (MCA_BTL_NO_ORDER == order) { + post_desc->base.dlvr_mode = GNI_DLVMODE_PERFORMANCE; + } else { + post_desc->base.dlvr_mode = GNI_DLVMODE_NO_ADAPT; + } post_desc->base.local_addr = (uint64_t) lcl_addr; post_desc->base.local_mem_hndl = lcl_mdh; post_desc->base.remote_addr = (uint64_t) rem_addr; @@ -45,7 +49,7 @@ static inline int mca_btl_ugni_post_fma (struct mca_btl_base_endpoint_t *endpoin size_t size, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, - mca_btl_base_rdma_completion_fn_t cbfunc, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_ugni_post_descriptor_t *post_desc; @@ -58,7 +62,7 @@ static inline int mca_btl_ugni_post_fma (struct 
mca_btl_base_endpoint_t *endpoin /* Post descriptor (CQ is ignored for FMA transactions) -- The CQ associated with the endpoint * is used. */ - init_gni_post_desc (&post_desc->desc, op_type, (intptr_t) local_address, local_handle->gni_handle, + init_gni_post_desc (&post_desc->desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle, remote_address, remote_handle->gni_handle, size, 0); OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); @@ -83,7 +87,7 @@ static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_ size_t size, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, - mca_btl_base_rdma_completion_fn_t cbfunc, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_btl_ugni_post_descriptor_t *post_desc; @@ -95,7 +99,7 @@ static inline int mca_btl_ugni_post_bte (mca_btl_base_endpoint_t *endpoint, gni_ } /* Post descriptor */ - init_gni_post_desc (&post_desc->desc, op_type, (intptr_t) local_address, local_handle->gni_handle, + init_gni_post_desc (&post_desc->desc, order, op_type, (intptr_t) local_address, local_handle->gni_handle, remote_address, remote_handle->gni_handle, size, endpoint->btl->rdma_local_cq); OPAL_THREAD_LOCK(&endpoint->btl->device->dev_lock); @@ -120,7 +124,7 @@ static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, mca_btl_base_registration_handle_t *remote_handle, - mca_btl_base_rdma_completion_fn_t cbfunc, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { const gni_post_type_t fma_ops[2] = {GNI_POST_FMA_PUT, GNI_POST_FMA_GET}; @@ -128,11 +132,11 @@ static inline int mca_btl_ugni_post (mca_btl_base_endpoint_t *endpoint, int get, if (size <= mca_btl_ugni_component.ugni_fma_limit) { return mca_btl_ugni_post_fma 
(endpoint, fma_ops[get], size, local_address, remote_address, - local_handle, remote_handle, cbfunc, cbcontext, cbdata); + local_handle, remote_handle, order, cbfunc, cbcontext, cbdata); } return mca_btl_ugni_post_bte (endpoint, rdma_ops[get], size, local_address, remote_address, - local_handle, remote_handle, cbfunc, cbcontext, cbdata); + local_handle, remote_handle, order, cbfunc, cbcontext, cbdata); } static inline int mca_btl_ugni_repost (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_post_descriptor_t *post_desc) diff --git a/opal/mca/btl/ugni/btl_ugni_send.c b/opal/mca/btl/ugni/btl_ugni_send.c index 69b55c035e..36e9403c3f 100644 --- a/opal/mca/btl/ugni/btl_ugni_send.c +++ b/opal/mca/btl/ugni/btl_ugni_send.c @@ -98,7 +98,6 @@ int mca_btl_ugni_sendi (struct mca_btl_base_module_t *btl, uint32_t flags, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor) { - mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; size_t total_size = header_size + payload_size; mca_btl_ugni_base_frag_t *frag = NULL; size_t packed_size = payload_size; diff --git a/opal/mca/btl/vader/btl_vader.h b/opal/mca/btl/vader/btl_vader.h index fb9f14c5a1..ae647278fe 100644 --- a/opal/mca/btl/vader/btl_vader.h +++ b/opal/mca/btl/vader/btl_vader.h @@ -115,7 +115,9 @@ struct mca_btl_vader_component_t { ompi_free_list_t vader_frags_eager; /**< free list of vader send frags */ ompi_free_list_t vader_frags_max_send; /**< free list of vader max send frags (large fragments) */ ompi_free_list_t vader_frags_user; /**< free list of small inline frags */ - ompi_free_list_t vader_frags_rdma; /**< free list of vader put/get frags (single-copy) */ +#if OPAL_BTL_VADER_HAVE_KNEM + ompi_free_list_t registration_handles; /**< registration handles for knem segments */ +#endif unsigned int fbox_threshold; /**< number of sends required before we setup a send fast box for a peer */ unsigned int fbox_max; /**< maximum number of send fast boxes to allocate */ @@ -208,21 +210,24 @@ int 
mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, * @param descriptor (IN) Description of the data to be transferred */ #if OPAL_BTL_VADER_HAVE_XPMEM -int mca_btl_vader_put_xpmem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_put_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif #if OPAL_BTL_VADER_HAVE_CMA -int mca_btl_vader_put_cma (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_put_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif #if OPAL_BTL_VADER_HAVE_KNEM -int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_put_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif /** @@ -233,21 +238,24 @@ int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, * @param descriptor (IN) Description of the data to be transferred */ #if OPAL_BTL_VADER_HAVE_XPMEM -int mca_btl_vader_get_xpmem (struct mca_btl_base_module_t *btl, 
- struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_get_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif #if OPAL_BTL_VADER_HAVE_CMA -int mca_btl_vader_get_cma (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif #if OPAL_BTL_VADER_HAVE_KNEM -int mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des); +int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); #endif /** @@ -260,6 +268,7 @@ mca_btl_base_descriptor_t* mca_btl_vader_alloc (struct mca_btl_base_module_t* bt struct mca_btl_base_endpoint_t* endpoint, uint8_t order, size_t size, uint32_t flags); + END_C_DECLS #endif diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 7061612f95..8e53599dd9 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c @@ -251,7 +251,6 @@ static int 
mca_btl_vader_component_register (void) mca_btl_vader.super.btl_bandwidth = 10000; /* Mbs */ } - mca_btl_vader.super.btl_seg_size = sizeof (mca_btl_vader_segment_t); mca_btl_vader.super.btl_latency = 1; /* Microsecs */ /* Call the BTL based to register its MCA params */ @@ -272,7 +271,9 @@ static int mca_btl_vader_component_open(void) OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_eager, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_user, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_max_send, ompi_free_list_t); - OBJ_CONSTRUCT(&mca_btl_vader_component.vader_frags_rdma, ompi_free_list_t); +#if OPAL_BTL_VADER_HAVE_KNEM + OBJ_CONSTRUCT(&mca_btl_vader_component.registration_handles, ompi_free_list_t); +#endif OBJ_CONSTRUCT(&mca_btl_vader_component.lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_vader_component.pending_endpoints, opal_list_t); OBJ_CONSTRUCT(&mca_btl_vader_component.pending_fragments, opal_list_t); @@ -293,7 +294,9 @@ static int mca_btl_vader_component_close(void) OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_eager); OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_user); OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_max_send); - OBJ_DESTRUCT(&mca_btl_vader_component.vader_frags_rdma); +#if OPAL_BTL_VADER_HAVE_KNEM + OBJ_DESTRUCT(&mca_btl_vader_component.registration_handles); +#endif OBJ_DESTRUCT(&mca_btl_vader_component.lock); OBJ_DESTRUCT(&mca_btl_vader_component.pending_endpoints); OBJ_DESTRUCT(&mca_btl_vader_component.pending_fragments); @@ -349,7 +352,6 @@ static void mca_btl_vader_select_next_single_copy_mechanism (void) static void mca_btl_vader_check_single_copy (void) { int initial_mechanism = mca_btl_vader_component.single_copy_mechanism; - int rc; #if OPAL_BTL_VADER_HAVE_XPMEM if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) { @@ -564,7 +566,7 @@ failed: void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_base_endpoint_t *endpoint) { 
mca_btl_base_segment_t segments[2]; - mca_btl_base_descriptor_t frag = {.des_local = segments, .des_local_count = 1}; + mca_btl_base_descriptor_t frag = {.des_segments = segments, .des_segment_count = 1}; const mca_btl_active_message_callback_t *reg; if (hdr->flags & MCA_BTL_VADER_FLAG_COMPLETE) { @@ -584,7 +586,7 @@ void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_ba &segments[1].seg_addr.pval); segments[1].seg_len = hdr->sc_iov.iov_len; - frag.des_local_count = 2; + frag.des_segment_count = 2; /* recv upcall */ reg->cbfunc(&mca_btl_vader.super, hdr->tag, &frag, reg->cbdata); diff --git a/opal/mca/btl/vader/btl_vader_frag.c b/opal/mca/btl/vader/btl_vader_frag.c index 6cad4e5b63..b39f5fb3c3 100644 --- a/opal/mca/btl/vader/btl_vader_frag.c +++ b/opal/mca/btl/vader/btl_vader_frag.c @@ -31,11 +31,11 @@ static inline void mca_btl_vader_frag_constructor (mca_btl_vader_frag_t *frag) if(frag->hdr != NULL) { frag->hdr->frag = frag; frag->hdr->flags = 0; - frag->segments[0].base.seg_addr.pval = (char *)(frag->hdr + 1); + frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1); } - frag->base.des_local = &frag->segments->base; - frag->base.des_local_count = 1; + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; frag->fbox = NULL; } @@ -65,8 +65,6 @@ void mca_btl_vader_frag_init (ompi_free_list_item_t *item, void *ctx) frag->my_list = &mca_btl_vader_component.vader_frags_eager; } else if (mca_btl_vader.super.btl_max_send_size == data_size) { frag->my_list = &mca_btl_vader_component.vader_frags_max_send; - } else { - frag->my_list = &mca_btl_vader_component.vader_frags_rdma; } if (data_size) { diff --git a/opal/mca/btl/vader/btl_vader_frag.h b/opal/mca/btl/vader/btl_vader_frag.h index fee0bdb565..2c6e5c9091 100644 --- a/opal/mca/btl/vader/btl_vader_frag.h +++ b/opal/mca/btl/vader/btl_vader_frag.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. 
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -57,15 +57,6 @@ struct mca_btl_vader_hdr_t { }; typedef struct mca_btl_vader_hdr_t mca_btl_vader_hdr_t; -struct mca_btl_vader_segment_t { - mca_btl_base_segment_t base; -#if OPAL_BTL_VADER_HAVE_KNEM - uint64_t cookie; - intptr_t registered_base; -#endif -}; -typedef struct mca_btl_vader_segment_t mca_btl_vader_segment_t; - /** * shared memory send fragment derived type. */ @@ -73,7 +64,7 @@ struct mca_btl_vader_frag_t { /** base object */ mca_btl_base_descriptor_t base; /** storage for segment data (max 2) */ - mca_btl_vader_segment_t segments[2]; + mca_btl_base_segment_t segments[2]; /** endpoint this fragment is active on */ struct mca_btl_base_endpoint_t *endpoint; /** fast box in use (or NULL) */ @@ -82,9 +73,6 @@ struct mca_btl_vader_frag_t { mca_btl_vader_hdr_t *hdr; /** free list this fragment was allocated within */ ompi_free_list_t *my_list; -#if OPAL_BTL_VADER_HAVE_KNEM - uint64_t cookie; -#endif }; typedef struct mca_btl_vader_frag_t mca_btl_vader_frag_t; @@ -108,37 +96,16 @@ static inline int mca_btl_vader_frag_alloc (mca_btl_vader_frag_t **frag, ompi_fr return OPAL_SUCCESS; } -static inline int mca_btl_vader_frag_alloc_rdma (mca_btl_vader_frag_t **frag, ompi_free_list_t *list, - struct mca_btl_base_endpoint_t *endpoint) { - ompi_free_list_item_t *item; - - OMPI_FREE_LIST_GET_MT(list, item); - *frag = (mca_btl_vader_frag_t *) item; - if (OPAL_LIKELY(NULL != item)) { - (*frag)->endpoint = endpoint; - } - - return OPAL_SUCCESS; -} - static inline void mca_btl_vader_frag_return (mca_btl_vader_frag_t *frag) { if (frag->hdr) { frag->hdr->flags = 0; } - frag->segments[0].base.seg_addr.pval = (char *)(frag->hdr + 1); - frag->base.des_local_count = 1; + frag->segments[0].seg_addr.pval = (char *)(frag->hdr + 1); + 
frag->base.des_segment_count = 1; frag->fbox = NULL; -#if OPAL_BTL_VADER_HAVE_KNEM - if (frag->cookie) { - /* NTH: explicity ignore the return code. Don't care about this cookie anymore anyway. */ - (void) ioctl(mca_btl_vader.knem_fd, KNEM_CMD_DESTROY_REGION, &frag->cookie); - frag->cookie = 0; - } -#endif - OMPI_FREE_LIST_RETURN_MT(frag->my_list, (ompi_free_list_item_t *)frag); } @@ -153,9 +120,6 @@ OBJ_CLASS_DECLARATION(mca_btl_vader_frag_t); #define MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint) \ mca_btl_vader_frag_alloc (&(frag), &mca_btl_vader_component.vader_frags_user, endpoint) -#define MCA_BTL_VADER_FRAG_ALLOC_RDMA(frag, endpoint) \ - mca_btl_vader_frag_alloc_rdma (&(frag), &mca_btl_vader_component.vader_frags_rdma, endpoint) - #define MCA_BTL_VADER_FRAG_RETURN(frag) mca_btl_vader_frag_return(frag) diff --git a/opal/mca/btl/vader/btl_vader_get.c b/opal/mca/btl/vader/btl_vader_get.c index 8ac99bc128..ce8d7b89d8 100644 --- a/opal/mca/btl/vader/btl_vader_get.c +++ b/opal/mca/btl/vader/btl_vader_get.c @@ -33,11 +33,10 @@ * @param descriptor (IN) Description of the data to be transferred */ #if OPAL_BTL_VADER_HAVE_XPMEM -int mca_btl_vader_get (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +int mca_btl_vader_get_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_mpool_base_registration_t *reg; void *rem_ptr; @@ -63,9 +62,10 @@ int mca_btl_vader_get (struct mca_btl_base_module_t *btl, struct 
mca_btl_base_en #endif #if OPAL_BTL_VADER_HAVE_CMA -int mca_btl_vader_get_cma (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { struct iovec src_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size}; struct iovec dst_iov = {.iov_base = local_address, .iov_len = size}; @@ -78,36 +78,29 @@ int mca_btl_vader_get_cma (struct mca_btl_base_module_t *btl, } /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } #endif #if OPAL_BTL_VADER_HAVE_KNEM -int mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des; - mca_btl_vader_segment_t *src = (mca_btl_vader_segment_t *) des->des_remote; - mca_btl_vader_segment_t *dst = (mca_btl_vader_segment_t *) des->des_local; - const size_t size = min(dst->base.seg_len, src->base.seg_len); - intptr_t offset = src->base.seg_addr.lval - src->registered_base; struct knem_cmd_param_iovec 
recv_iovec; struct knem_cmd_inline_copy icopy; /* Fill in the ioctl data fields. There's no async completion, so we don't need to worry about getting a slot, etc. */ - recv_iovec.base = (uintptr_t) dst->base.seg_addr.lval; + recv_iovec.base = (uintptr_t) local_address; recv_iovec.len = size; icopy.local_iovec_array = (uintptr_t) &recv_iovec; icopy.local_iovec_nr = 1; - icopy.remote_cookie = src->cookie; - icopy.remote_offset = offset; + icopy.remote_cookie = remote_handle->cookie; + icopy.remote_offset = remote_address - remote_handle->base_addr; icopy.write = 0; icopy.flags = 0; @@ -115,7 +108,7 @@ int mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl, * is greater than the cutoff. Not that if DMA is not supported * or the user specified 0 for knem_dma_min the knem_dma_min was * set to UINT_MAX in mca_btl_vader_knem_init. */ - if (mca_btl_vader_component.knem_dma_min <= dst->base.seg_len) { + if (mca_btl_vader_component.knem_dma_min <= size) { icopy.flags = KNEM_FLAG_DMA; } /* synchronous flags only, no need to specify icopy.async_status_index */ @@ -131,10 +124,7 @@ int mca_btl_vader_get_knem (struct mca_btl_base_module_t *btl, } /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/vader/btl_vader_knem.c b/opal/mca/btl/vader/btl_vader_knem.c index e776ebf9e1..a58c34fa7c 100644 --- a/opal/mca/btl/vader/btl_vader_knem.c +++ b/opal/mca/btl/vader/btl_vader_knem.c @@ -20,6 +20,71 @@ #include "opal/util/show_help.h" +struct mca_btl_vader_registration_handle_t { + ompi_free_list_item_t super; + mca_btl_base_registration_handle_t btl_handle; +}; +typedef struct mca_btl_vader_registration_handle_t mca_btl_vader_registration_handle_t; + +OBJ_CLASS_INSTANCE(mca_btl_vader_registration_handle_t, ompi_free_list_item_t, 
NULL, NULL); + +static mca_btl_base_registration_handle_t * +mca_btl_vader_register_mem_knem (struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t *endpoint, + void *base, size_t size, uint32_t flags) +{ + mca_btl_vader_registration_handle_t *handle = NULL; ompi_free_list_item_t *item = NULL; + struct knem_cmd_create_region knem_cr; + struct knem_cmd_param_iovec knem_iov; + + /* NTH: TODO -- Replace this with just using an mpool once we can pass the + * protection flags through. */ + + OMPI_FREE_LIST_GET_MT(&mca_btl_vader_component.registration_handles, item); handle = (mca_btl_vader_registration_handle_t *) item; + if (OPAL_UNLIKELY(NULL == handle)) { + return NULL; + } + + knem_iov.base = (uintptr_t) base; + knem_iov.len = size; + + knem_cr.iovec_array = (uintptr_t) &knem_iov; + knem_cr.iovec_nr = 1; + knem_cr.protection = 0; + + if (flags & MCA_BTL_REG_FLAG_REMOTE_READ) { + knem_cr.protection |= PROT_READ; + } + + if (flags & MCA_BTL_REG_FLAG_REMOTE_WRITE) { + knem_cr.protection |= PROT_WRITE; + } + + /* Vader will explicitly destroy this cookie */ + knem_cr.flags = 0; + if (OPAL_UNLIKELY(ioctl(mca_btl_vader.knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) { + OMPI_FREE_LIST_RETURN_MT(&mca_btl_vader_component.registration_handles, (ompi_free_list_item_t *) handle); + return NULL; + } + + handle->btl_handle.cookie = knem_cr.cookie; + handle->btl_handle.base_addr = (intptr_t) base; + + return &handle->btl_handle; +} + +static int +mca_btl_vader_deregister_mem_knem (struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle) +{ + mca_btl_vader_registration_handle_t *vader_handle = + (mca_btl_vader_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_vader_registration_handle_t, btl_handle)); + + /* NTH: explicity ignore the return code. Don't care about this cookie anymore anyway. 
*/ + (void) ioctl(mca_btl_vader.knem_fd, KNEM_CMD_DESTROY_REGION, &vader_handle->btl_handle.cookie); + + return OPAL_SUCCESS; +} + int mca_btl_vader_knem_init (void) { struct knem_cmd_info knem_info; @@ -74,6 +139,11 @@ int mca_btl_vader_knem_init (void) mca_btl_vader.super.btl_get = mca_btl_vader_get_knem; mca_btl_vader.super.btl_put = mca_btl_vader_put_knem; + /* knem requires registration */ + mca_btl_vader.super.btl_register_mem = mca_btl_vader_register_mem_knem; + mca_btl_vader.super.btl_deregister_mem = mca_btl_vader_deregister_mem_knem; + mca_btl_vader.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); + return OPAL_SUCCESS; } while (0); diff --git a/opal/mca/btl/vader/btl_vader_knem.h b/opal/mca/btl/vader/btl_vader_knem.h index 1d6fa2d164..a80dcb757e 100644 --- a/opal/mca/btl/vader/btl_vader_knem.h +++ b/opal/mca/btl/vader/btl_vader_knem.h @@ -17,6 +17,12 @@ #include #include +/* At this time only knem requires a registration of "RDMA" buffers */ +struct mca_btl_base_registration_handle_t { + uint64_t cookie; + intptr_t base_addr; +}; + int mca_btl_vader_knem_init (void); int mca_btl_vader_knem_fini (void); int mca_btl_vader_knem_progress (void); diff --git a/opal/mca/btl/vader/btl_vader_module.c b/opal/mca/btl/vader/btl_vader_module.c index 9282da33f5..3c2f0181f4 100644 --- a/opal/mca/btl/vader/btl_vader_module.c +++ b/opal/mca/btl/vader/btl_vader_module.c @@ -48,7 +48,6 @@ static int vader_free (struct mca_btl_base_module_t* btl, mca_btl_base_descripto static struct mca_btl_base_descriptor_t *vader_prepare_src ( struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, @@ -98,19 +97,19 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n) /* initialize fragment descriptor free lists */ /* initialize free list for single copy (get, put) */ - if (MCA_BTL_VADER_NONE != 
mca_btl_vader_component.single_copy_mechanism) { - rc = ompi_free_list_init_ex_new (&component->vader_frags_rdma, - sizeof(mca_btl_vader_frag_t), 8, - OBJ_CLASS(mca_btl_vader_frag_t), - 0, opal_cache_line_size, - component->vader_free_list_num, - component->vader_free_list_max, - component->vader_free_list_inc, - NULL, mca_btl_vader_frag_init, (void *) 0); +#if OPAL_BTL_VADER_HAVE_KNEM + if (MCA_BTL_VADER_KNEM == mca_btl_vader_component.single_copy_mechanism) { + rc = ompi_free_list_init_new (&component->registration_handles, + sizeof(mca_btl_vader_registration_handle_t), 8, + OBJ_CLASS(mca_btl_vader_registration_handle_t), + 0, 8, component->vader_free_list_num, + component->vader_free_list_max, + component->vader_free_list_inc, NULL); if (OPAL_SUCCESS != rc) { return rc; } } +#endif /* initialize free list for small send and inline fragments */ rc = ompi_free_list_init_ex_new(&component->vader_frags_user, @@ -407,7 +406,7 @@ mca_btl_base_descriptor_t *mca_btl_vader_alloc(struct mca_btl_base_module_t *btl } if (OPAL_LIKELY(frag != NULL)) { - frag->segments[0].base.seg_len = size; + frag->segments[0].seg_len = size; frag->base.des_flags = flags; frag->base.order = order; @@ -436,7 +435,6 @@ static int vader_free (struct mca_btl_base_module_t *btl, mca_btl_base_descripto */ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - mca_mpool_base_registration_t *registration, struct opal_convertor_t *convertor, uint8_t order, size_t reserve, size_t *size, uint32_t flags) @@ -449,122 +447,84 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ opal_convertor_get_current_pointer (convertor, &data_ptr); - if (OPAL_LIKELY(reserve)) { - /* in place send fragment */ - if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { - uint32_t iov_count = 1; - struct iovec iov; + /* in place send fragment */ + if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) 
{ + uint32_t iov_count = 1; + struct iovec iov; - /* non-contiguous data requires the convertor */ - if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism && - total_size > mca_btl_vader.super.btl_eager_limit) { - (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint); - } else - (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint); + /* non-contiguous data requires the convertor */ + if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism && + total_size > mca_btl_vader.super.btl_eager_limit) { + (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint); + } else + (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint); - if (OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - - iov.iov_len = *size; - iov.iov_base = - (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].base.seg_addr.pval)) + - reserve); - - rc = opal_convertor_pack (convertor, &iov, &iov_count, size); - if (OPAL_UNLIKELY(rc < 0)) { - MCA_BTL_VADER_FRAG_RETURN(frag); - return NULL; - } - - frag->segments[0].base.seg_len = *size + reserve; - } else { - if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) { - if (OPAL_LIKELY(total_size <= mca_btl_vader.super.btl_eager_limit)) { - (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint); - } else { - (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint); - } - } else - (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint); - - if (OPAL_UNLIKELY(NULL == frag)) { - return NULL; - } - -#if OPAL_BTL_VADER_HAVE_XPMEM - /* use xpmem to send this segment if it is above the max inline send size */ - if (OPAL_UNLIKELY(MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism && - total_size > (size_t) mca_btl_vader_component.max_inline_send)) { - /* single copy send */ - frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY; - - /* set up single copy io vector */ - frag->hdr->sc_iov.iov_base = data_ptr; - frag->hdr->sc_iov.iov_len = *size; - - frag->segments[0].base.seg_len = reserve; - 
frag->segments[1].base.seg_len = *size; - frag->segments[1].base.seg_addr.pval = data_ptr; - frag->base.des_local_count = 2; - } else { -#endif - - /* inline send */ - if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) { - /* try to reserve a fast box for this transfer only if the - * fragment does not belong to the caller */ - fbox = mca_btl_vader_reserve_fbox (endpoint, total_size); - if (OPAL_LIKELY(fbox)) { - frag->segments[0].base.seg_addr.pval = fbox; - } - - frag->fbox = fbox; - } - - /* NTH: the covertor adds some latency so we bypass it here */ - memcpy ((void *)((uintptr_t)frag->segments[0].base.seg_addr.pval + reserve), data_ptr, *size); - frag->segments[0].base.seg_len = total_size; -#if OPAL_BTL_VADER_HAVE_XPMEM - } -#endif - } - } else { - /* put/get fragment */ - if (MCA_BTL_VADER_NONE != mca_btl_vader_component.single_copy_mechanism) { - (void) MCA_BTL_VADER_FRAG_ALLOC_RDMA(frag, endpoint); - } else { - (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint); - } if (OPAL_UNLIKELY(NULL == frag)) { return NULL; } - frag->segments[0].base.seg_addr.lval = (uint64_t)(uintptr_t) data_ptr; - frag->segments[0].base.seg_len = total_size; -#if OPAL_BTL_VADER_HAVE_KNEM - if (MCA_BTL_VADER_KNEM == mca_btl_vader_component.single_copy_mechanism) { - struct knem_cmd_create_region knem_cr; - struct knem_cmd_param_iovec knem_iov; + iov.iov_len = *size; + iov.iov_base = + (IOVBASE_TYPE *)(((uintptr_t)(frag->segments[0].seg_addr.pval)) + + reserve); - knem_iov.base = (uintptr_t) data_ptr; - knem_iov.len = total_size; + rc = opal_convertor_pack (convertor, &iov, &iov_count, size); + if (OPAL_UNLIKELY(rc < 0)) { + MCA_BTL_VADER_FRAG_RETURN(frag); + return NULL; + } - knem_cr.iovec_array = (uintptr_t) &knem_iov; - knem_cr.iovec_nr = 1; - knem_cr.protection = PROT_READ; - /* Vader will explicitly destroy this cookie */ - knem_cr.flags = 0; - if (OPAL_UNLIKELY(ioctl(mca_btl_vader.knem_fd, KNEM_CMD_CREATE_REGION, &knem_cr) < 0)) { - MCA_BTL_VADER_FRAG_RETURN(frag); - 
return NULL; + frag->segments[0].seg_len = *size + reserve; + } else { + if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) { + if (OPAL_LIKELY(total_size <= mca_btl_vader.super.btl_eager_limit)) { + (void) MCA_BTL_VADER_FRAG_ALLOC_EAGER(frag, endpoint); + } else { + (void) MCA_BTL_VADER_FRAG_ALLOC_MAX(frag, endpoint); + } + } else + (void) MCA_BTL_VADER_FRAG_ALLOC_USER(frag, endpoint); + + if (OPAL_UNLIKELY(NULL == frag)) { + return NULL; + } + +#if OPAL_BTL_VADER_HAVE_XPMEM + /* use xpmem to send this segment if it is above the max inline send size */ + if (OPAL_UNLIKELY(MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism && + total_size > (size_t) mca_btl_vader_component.max_inline_send)) { + /* single copy send */ + frag->hdr->flags = MCA_BTL_VADER_FLAG_SINGLE_COPY; + + /* set up single copy io vector */ + frag->hdr->sc_iov.iov_base = data_ptr; + frag->hdr->sc_iov.iov_len = *size; + + frag->segments[0].seg_len = reserve; + frag->segments[1].seg_len = *size; + frag->segments[1].seg_addr.pval = data_ptr; + frag->base.des_segment_count = 2; + } else { +#endif + + /* inline send */ + if (OPAL_LIKELY(MCA_BTL_DES_FLAGS_BTL_OWNERSHIP & flags)) { + /* try to reserve a fast box for this transfer only if the + * fragment does not belong to the caller */ + fbox = mca_btl_vader_reserve_fbox (endpoint, total_size); + if (OPAL_LIKELY(fbox)) { + frag->segments[0].seg_addr.pval = fbox; + } + + frag->fbox = fbox; } - frag->segments[0].cookie = knem_cr.cookie; - frag->segments[0].registered_base = (intptr_t) data_ptr; - frag->cookie = knem_cr.cookie; + /* NTH: the covertor adds some latency so we bypass it here */ + memcpy ((void *)((uintptr_t)frag->segments[0].seg_addr.pval + reserve), data_ptr, *size); + frag->segments[0].seg_len = total_size; +#if OPAL_BTL_VADER_HAVE_XPMEM } -#endif /* OPAL_BTL_SM_HAVE_KNEM */ +#endif } frag->base.order = order; @@ -583,6 +543,9 @@ static int vader_ft_event (int state) return OPAL_SUCCESS; } +#if 
OPAL_BTL_VADER_HAVE_KNEM +#endif + static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep) { OBJ_CONSTRUCT(&ep->pending_frags, opal_list_t); diff --git a/opal/mca/btl/vader/btl_vader_put.c b/opal/mca/btl/vader/btl_vader_put.c index 3171449167..3107f420b3 100644 --- a/opal/mca/btl/vader/btl_vader_put.c +++ b/opal/mca/btl/vader/btl_vader_put.c @@ -35,11 +35,10 @@ * @param descriptor (IN) Description of the data to be transferred */ #if OPAL_BTL_VADER_HAVE_XPMEM -int mca_btl_vader_put (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +int mca_btl_vader_put_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { mca_mpool_base_registration_t *reg; void *rem_ptr; @@ -61,9 +60,10 @@ int mca_btl_vader_put (struct mca_btl_base_module_t *btl, struct mca_btl_base_en #endif #if OPAL_BTL_VADER_HAVE_CMA -int mca_btl_vader_put_cma (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_put_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { struct iovec src_iov = {.iov_base = local_address, .iov_len = size}; struct iovec dst_iov = {.iov_base 
= (void *)(intptr_t) remote_address, .iov_len = size}; @@ -76,36 +76,29 @@ int mca_btl_vader_put_cma (struct mca_btl_base_module_t *btl, } /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } #endif #if OPAL_BTL_VADER_HAVE_KNEM -int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - struct mca_btl_base_descriptor_t *des) +int mca_btl_vader_put_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) des; - mca_btl_vader_segment_t *src = (mca_btl_vader_segment_t *) des->des_local; - mca_btl_vader_segment_t *dst = (mca_btl_vader_segment_t *) des->des_remote; - const size_t size = min(dst->base.seg_len, src->base.seg_len); - intptr_t offset = dst->base.seg_addr.lval - dst->registered_base; struct knem_cmd_param_iovec send_iovec; struct knem_cmd_inline_copy icopy; /* Fill in the ioctl data fields. There's no async completion, so we don't need to worry about getting a slot, etc. 
*/ - send_iovec.base = (uintptr_t) src->base.seg_addr.lval; + send_iovec.base = (uintptr_t) local_address; send_iovec.len = size; icopy.local_iovec_array = (uintptr_t) &send_iovec; icopy.local_iovec_nr = 1; - icopy.remote_cookie = dst->cookie; - icopy.remote_offset = offset; + icopy.remote_cookie = remote_handle->cookie; + icopy.remote_offset = remote_address - remote_handle->base_addr; icopy.write = 1; icopy.flags = 0; @@ -113,7 +106,7 @@ int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, * is greater than the cutoff. Not that if DMA is not supported * or the user specified 0 for knem_dma_min the knem_dma_min was * set to UINT_MAX in mca_btl_vader_knem_init. */ - if (mca_btl_vader_component.knem_dma_min <= dst->base.seg_len) { + if (mca_btl_vader_component.knem_dma_min <= size) { icopy.flags = KNEM_FLAG_DMA; } /* synchronous flags only, no need to specify icopy.async_status_index */ @@ -129,10 +122,7 @@ int mca_btl_vader_put_knem (struct mca_btl_base_module_t *btl, } /* always call the callback function */ - frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; - - frag->endpoint = endpoint; - mca_btl_vader_frag_complete (frag); + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/vader/btl_vader_send.c b/opal/mca/btl/vader/btl_vader_send.c index 5182f40ec6..59a10c366a 100644 --- a/opal/mca/btl/vader/btl_vader_send.c +++ b/opal/mca/btl/vader/btl_vader_send.c @@ -40,7 +40,7 @@ int mca_btl_vader_send (struct mca_btl_base_module_t *btl, mca_btl_base_tag_t tag) { mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) descriptor; - const size_t total_size = frag->segments[0].base.seg_len; + const size_t total_size = frag->segments[0].seg_len; if (OPAL_LIKELY(frag->fbox)) { mca_btl_vader_fbox_send (frag->fbox, tag); diff --git a/opal/mca/btl/vader/btl_vader_sendi.c b/opal/mca/btl/vader/btl_vader_sendi.c index 877105192a..25b5d691b6 100644 --- 
a/opal/mca/btl/vader/btl_vader_sendi.c +++ b/opal/mca/btl/vader/btl_vader_sendi.c @@ -78,7 +78,7 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, frag->hdr->tag = tag; /* write the match header (with MPI comm/tag/etc. info) */ - memcpy (frag->segments[0].base.seg_addr.pval, header, header_size); + memcpy (frag->segments[0].seg_addr.pval, header, header_size); /* write the message data if there is any */ /* we can't use single-copy semantics here since as caller will consider the send @@ -88,7 +88,7 @@ int mca_btl_vader_sendi (struct mca_btl_base_module_t *btl, struct iovec iov; /* pack the data into the supplied buffer */ - iov.iov_base = (IOVBASE_TYPE *)((uintptr_t)frag->segments[0].base.seg_addr.pval + header_size); + iov.iov_base = (IOVBASE_TYPE *)((uintptr_t)frag->segments[0].seg_addr.pval + header_size); iov.iov_len = length = payload_size; (void) opal_convertor_pack (convertor, &iov, &iov_count, &length);