From 0ee12467d6f8d3290f409388787f5ba896f82424 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Thu, 21 Jul 2005 16:06:46 +0000 Subject: [PATCH] * implement RDMA put * remove the recv fragment code, since it really isn't needed * handle memory descriptor binding a bit more sanely, and use thresholds so that Portals does the unlink for us, when it feels like it. This commit was SVN r6575. --- ompi/mca/btl/portals/src/btl_portals.c | 135 ++++++++++++--- ompi/mca/btl/portals/src/btl_portals.h | 3 + .../btl/portals/src/btl_portals_component.c | 3 +- ompi/mca/btl/portals/src/btl_portals_frag.c | 4 - ompi/mca/btl/portals/src/btl_portals_frag.h | 56 +++--- ompi/mca/btl/portals/src/btl_portals_rdma.c | 163 +++++++++++++++++- ompi/mca/btl/portals/src/btl_portals_recv.c | 19 +- ompi/mca/btl/portals/src/btl_portals_recv.h | 3 +- ompi/mca/btl/portals/src/btl_portals_send.c | 27 ++- ompi/mca/btl/portals/src/btl_portals_send.h | 14 +- 10 files changed, 320 insertions(+), 107 deletions(-) diff --git a/ompi/mca/btl/portals/src/btl_portals.c b/ompi/mca/btl/portals/src/btl_portals.c index 081030d2a7..7c7e6c9bb5 100644 --- a/ompi/mca/btl/portals/src/btl_portals.c +++ b/ompi/mca/btl/portals/src/btl_portals.c @@ -219,23 +219,23 @@ mca_btl_portals_register(struct mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* -mca_btl_portals_alloc(struct mca_btl_base_module_t* btl, +mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base, size_t size) { - mca_btl_portals_module_t* portals_btl = (mca_btl_portals_module_t*) btl; + mca_btl_portals_module_t* btl = (mca_btl_portals_module_t*) btl_base; mca_btl_portals_frag_t* frag; int rc; - if (size <= btl->btl_eager_limit) { - OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(portals_btl, frag, rc); + if (size <= btl->super.btl_eager_limit) { + OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl, frag, rc); frag->segment.seg_len = - size <= btl->btl_eager_limit ? - size : btl->btl_eager_limit ; + size <= btl->super.btl_eager_limit ? + size : btl->super.btl_eager_limit ; } else { - OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(portals_btl, frag, rc); + OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl, frag, rc); frag->segment.seg_len = - size <= btl->btl_max_send_size ? - size : btl->btl_max_send_size ; + size <= btl->super.btl_max_send_size ? + size : btl->super.btl_max_send_size ; } frag->base.des_flags = 0; @@ -265,60 +265,78 @@ mca_btl_portals_free(struct mca_btl_base_module_t* btl_base, } -/* BWB - fix me - this needs to do RDMA when we get there... */ mca_btl_base_descriptor_t* -mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl, +mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* peer, mca_mpool_base_registration_t* registration, struct ompi_convertor_t* convertor, size_t reserve, size_t* size) { + mca_btl_portals_module_t* btl = (mca_btl_portals_module_t*) btl_base; mca_btl_portals_frag_t* frag; size_t max_data = *size; struct iovec iov; uint32_t iov_count = 1; int32_t free_after; - int rc; + int ret; - if (max_data+reserve <= btl->btl_eager_limit) { + if (0 == reserve && 0 == ompi_convertor_need_buffers(convertor)) { + /* we can send right out of the buffer (woo!). */ + + OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&btl->super, frag, ret); + if(NULL == frag){ + return NULL; + } + iov.iov_len = max_data; + iov.iov_base = NULL; + + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, + &free_after); + + frag->segment.seg_len = max_data; + frag->segment.seg_addr.pval = iov.iov_base; + + } else if (max_data+reserve <= btl->super.btl_eager_limit) { /* - * if we aren't pinning the data and the requested size is less - * than the eager limit pack into a fragment from the eager pool + * if we can't send out of the buffer directly and the + * requested size is less than the eager limit, pack into a + * fragment from the eager pool */ - OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl, frag, rc); + OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl, frag, ret); if (NULL == frag) { return NULL; } iov.iov_len = max_data; iov.iov_base = (unsigned char*) frag->segment.seg_addr.pval + reserve; - rc = ompi_convertor_pack(convertor, &iov, &iov_count, + ret = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); *size = max_data; - if (rc < 0) { + if (ret < 0) { OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(btl, frag); return NULL; } frag->segment.seg_len = max_data + reserve; + } else { /* * otherwise pack as much data as we can into a fragment * that is the max send size. */ - OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl, frag, rc); + OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl, frag, ret); if (NULL == frag) { return NULL; } - if (max_data + reserve > btl->btl_max_send_size){ - max_data = btl->btl_max_send_size - reserve; + if (max_data + reserve > btl->super.btl_max_send_size){ + max_data = btl->super.btl_max_send_size - reserve; } iov.iov_len = max_data; iov.iov_base = (unsigned char*) frag->segment.seg_addr.pval + reserve; - rc = ompi_convertor_pack(convertor, &iov, &iov_count, + ret = ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, &free_after); *size = max_data; - if ( rc < 0 ) { + if ( ret < 0 ) { OMPI_BTL_PORTALS_FRAG_RETURN_MAX(btl, frag); return NULL; } @@ -343,11 +361,74 @@ mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base, size_t reserve, size_t* size) { - /* BWB - FIXME - Implement prepare_dst */ - opal_output(mca_btl_portals_component.portals_output, - "Warning: call to unimplemented function prepare_dst"); + struct mca_btl_portals_module_t *btl = + (struct mca_btl_portals_module_t *) btl_base; + mca_btl_portals_frag_t* frag; + ptl_md_t md; + ptl_handle_me_t me_h; + ptl_handle_md_t md_h; + int ret; - return NULL; + OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&btl->super, frag, ret); + if(NULL == frag) { + return NULL; + } + + frag->segment.seg_len = *size; + frag->segment.seg_addr.pval = convertor->pBaseBuf + convertor->bConverted; + frag->segment.seg_key.key64 = OPAL_THREAD_ADD64(&(btl->portals_rdma_key), 1); + + frag->base.des_src = NULL; + frag->base.des_src_cnt = 0; + frag->base.des_dst = &frag->segment; + frag->base.des_dst_cnt = 1; + frag->base.des_flags = 0; + + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "rdma match posted for frag 0x%x, callback 0x%x, bits %lld", + frag, frag->base.des_cbfunc, frag->segment.seg_key.key64)); + + /* create a match entry */ + ret = PtlMEAttach(btl->portals_ni_h, + OMPI_BTL_PORTALS_RDMA_TABLE_ID, + peer->endpoint_ptl_id, + frag->segment.seg_key.key64, /* match */ + 0, /* ignore */ + PTL_UNLINK, + PTL_INS_AFTER, + &me_h); + if (PTL_OK != ret) { + opal_output(mca_btl_portals_component.portals_output, + "Error creating recv reject ME: %d", ret); + OMPI_BTL_PORTALS_FRAG_RETURN_USER(&btl->super, frag); + return NULL; + } + + /* setup the memory descriptor. RDMA should never need to be + retransmitted, so we set the threshold for the 2 events it will + receive (PUT/GET START and END). No need to track the unlinks + later :) */ + md.start = frag->segment.seg_addr.pval; + md.length = frag->segment.seg_len; + md.threshold = 2; /* unlink after START / END */ + md.max_size = 0; + md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET; + md.user_ptr = frag; /* keep a pointer to ourselves */ + md.eq_handle = btl->portals_eq_handles[OMPI_BTL_PORTALS_EQ_RDMA]; + + ret = PtlMDAttach(me_h, + md, + PTL_UNLINK, + &md_h); + if (PTL_OK != ret) { + opal_output(mca_btl_portals_component.portals_output, + "Error creating recv reject ME: %d", ret); + PtlMEUnlink(me_h); + OMPI_BTL_PORTALS_FRAG_RETURN_USER(&btl->super, frag); + return NULL; + } + + return &frag->base; } diff --git a/ompi/mca/btl/portals/src/btl_portals.h b/ompi/mca/btl/portals/src/btl_portals.h index 064d8139af..23ccd1e116 100644 --- a/ompi/mca/btl/portals/src/btl_portals.h +++ b/ompi/mca/btl/portals/src/btl_portals.h @@ -117,6 +117,9 @@ struct mca_btl_portals_module_t { /* queued sends */ opal_list_t portals_queued_sends; + /* key to use for next rdma operation */ + volatile int64_t portals_rdma_key; + /* our portals network interface */ ptl_handle_ni_t portals_ni_h; /* the limits returned from PtlNIInit for interface */ diff --git a/ompi/mca/btl/portals/src/btl_portals_component.c b/ompi/mca/btl/portals/src/btl_portals_component.c index 744509dda3..69b935c8ff 100644 --- a/ompi/mca/btl/portals/src/btl_portals_component.c +++ b/ompi/mca/btl/portals/src/btl_portals_component.c @@ -165,7 +165,7 @@ mca_btl_portals_component_open(void) mca_btl_portals_module.super.btl_bandwidth = param_register_int("bandwidth", 1000); - mca_btl_portals_module.super.btl_flags = MCA_BTL_FLAGS_SEND; + mca_btl_portals_module.super.btl_flags = MCA_BTL_FLAGS_RDMA; bzero(&(mca_btl_portals_module.portals_reg), sizeof(mca_btl_portals_module.portals_reg)); @@ -202,6 +202,7 @@ mca_btl_portals_component_open(void) mca_btl_portals_module.portals_ni_h = PTL_INVALID_HANDLE; mca_btl_portals_module.portals_sr_dropped = 0; mca_btl_portals_module.portals_outstanding_sends = 0; + mca_btl_portals_module.portals_rdma_key = 1; return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/portals/src/btl_portals_frag.c b/ompi/mca/btl/portals/src/btl_portals_frag.c index 2a22c0a88d..0111fcde29 100644 --- a/ompi/mca/btl/portals/src/btl_portals_frag.c +++ b/ompi/mca/btl/portals/src/btl_portals_frag.c @@ -31,8 +31,6 @@ mca_btl_portals_frag_common_send_constructor(mca_btl_portals_frag_t* frag) frag->segment.seg_addr.pval = frag + 1; frag->segment.seg_len = frag->size; frag->segment.seg_key.key64 = 0; - - frag->type = OMPI_BTL_PORTALS_FRAG_SEND; } @@ -61,8 +59,6 @@ mca_btl_portals_frag_user_constructor(mca_btl_portals_frag_t* frag) frag->base.des_src = 0; frag->base.des_src_cnt = 0; frag->size = 0; - - frag->type = OMPI_BTL_PORTALS_FRAG_SEND; } diff --git a/ompi/mca/btl/portals/src/btl_portals_frag.h b/ompi/mca/btl/portals/src/btl_portals_frag.h index aeea06ae21..dcdc0ad138 100644 --- a/ompi/mca/btl/portals/src/btl_portals_frag.h +++ b/ompi/mca/btl/portals/src/btl_portals_frag.h @@ -22,38 +22,19 @@ extern "C" { #endif OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_portals_frag_t); -typedef enum { - OMPI_BTL_PORTALS_FRAG_SEND, - OMPI_BTL_PORTALS_FRAG_RECV -} mca_btl_portals_frag_type_t; - -struct mca_btl_portals_send_frag_t { - struct mca_btl_portals_module_t *btl; - struct mca_btl_base_endpoint_t *endpoint; - mca_btl_base_header_t hdr; -}; -typedef struct mca_btl_portals_send_frag_t mca_btl_portals_send_frag_t; - -struct mca_btl_portals_recv_frag_t { - struct mca_btl_portals_recv_chunk_t *chunk; -}; -typedef struct mca_btl_portals_recv_frag_t mca_btl_portals_recv_frag_t; - /** - * PORTALS send fraportalsent derived type. + * Portals send fragment derived type */ struct mca_btl_portals_frag_t { mca_btl_base_descriptor_t base; mca_btl_base_segment_t segment; - mca_btl_portals_frag_type_t type; + struct mca_btl_portals_module_t *btl; + struct mca_btl_base_endpoint_t *endpoint; + mca_btl_base_header_t hdr; size_t size; - union { - mca_btl_portals_send_frag_t send_frag; - mca_btl_portals_recv_frag_t recv_frag; - } u; -}; +}; typedef struct mca_btl_portals_frag_t mca_btl_portals_frag_t; OBJ_CLASS_DECLARATION(mca_btl_portals_frag_t); @@ -76,45 +57,48 @@ OBJ_CLASS_DECLARATION(mca_btl_portals_frag_user_t); * free list(s). */ -#define OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl, frag, rc) \ +#define OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl_macro, frag, rc) \ { \ \ opal_list_item_t *item; \ - OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl)->portals_frag_eager, item, rc); \ + OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, item, rc); \ frag = (mca_btl_portals_frag_t*) item; \ + frag->btl = (mca_btl_portals_module_t*) btl_macro; \ } -#define OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(btl, frag) \ +#define OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(btl_macro, frag) \ { \ - OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl)->portals_frag_eager, \ + OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, \ (opal_list_item_t*)(frag)); \ } -#define OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl, frag, rc) \ +#define OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc) \ { \ \ opal_list_item_t *item; \ - OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl)->portals_frag_max, item, rc); \ + OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_max, item, rc); \ frag = (mca_btl_portals_frag_t*) item; \ + frag->btl = (mca_btl_portals_module_t*) btl_macro; \ } -#define OMPI_BTL_PORTALS_FRAG_RETURN_MAX(btl, frag) \ +#define OMPI_BTL_PORTALS_FRAG_RETURN_MAX(btl_macro, frag) \ { \ - OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl)->portals_frag_max, \ + OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_max, \ (opal_list_item_t*)(frag)); \ } -#define OMPI_BTL_PORTALS_FRAG_ALLOC_USER(btl, frag, rc) \ +#define OMPI_BTL_PORTALS_FRAG_ALLOC_USER(btl_macro, frag, rc) \ { \ opal_list_item_t *item; \ - OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl)->portals_frag_user, item, rc); \ + OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_user, item, rc); \ frag = (mca_btl_portals_frag_t*) item; \ + frag->btl = (mca_btl_portals_module_t*) btl_macro; \ } -#define OMPI_BTL_PORTALS_FRAG_RETURN_USER(btl, frag) \ +#define OMPI_BTL_PORTALS_FRAG_RETURN_USER(btl_macro, frag) \ { \ - OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl)->portals_frag_user, \ + OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_user, \ (opal_list_item_t*)(frag)); \ } diff --git a/ompi/mca/btl/portals/src/btl_portals_rdma.c b/ompi/mca/btl/portals/src/btl_portals_rdma.c index 5ed49a5bed..90b1886a22 100644 --- a/ompi/mca/btl/portals/src/btl_portals_rdma.c +++ b/ompi/mca/btl/portals/src/btl_portals_rdma.c @@ -21,12 +21,121 @@ #include "btl_portals.h" #include "btl_portals_rdma.h" +#include "btl_portals_frag.h" int -mca_btl_portals_process_rdma(mca_btl_portals_module_t *module, +mca_btl_portals_process_rdma(mca_btl_portals_module_t *btl, ptl_event_t *ev) { + mca_btl_portals_frag_t *frag = + (mca_btl_portals_frag_t*) ev->md.user_ptr; + + switch (ev->type) { + case PTL_EVENT_SEND_START: + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "rdma: PTL_EVENT_SEND_START for 0x%x", + frag)); + + if (ev->ni_fail_type != PTL_NI_OK) { + opal_output(mca_btl_portals_component.portals_output, + "Failure to start rdma send event\n"); + /* unlink, since we don't expect to get an end or ack */ + PtlMDUnlink(ev->md_handle); + frag->base.des_cbfunc(&btl->super, + frag->endpoint, + &frag->base, + OMPI_ERROR); + } + break; + + case PTL_EVENT_SEND_END: + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "rdma: PTL_EVENT_SEND_END for 0x%x", + frag)); + + if (ev->ni_fail_type != PTL_NI_OK) { + opal_output(mca_btl_portals_component.portals_output, + "Failure to end rdma send event\n"); + /* unlink, since we don't expect to get an ack */ + PtlMDUnlink(ev->md_handle); + frag->base.des_cbfunc(&btl->super, + frag->endpoint, + &frag->base, + OMPI_ERROR); + } + break; + + case PTL_EVENT_ACK: + /* ok, this is the real work - the message has been received + on the other side. */ + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "rdma: PTL_EVENT_ACK for 0x%x, Ox%x", + frag, frag->base.des_cbfunc)); + + if (ev->ni_fail_type != PTL_NI_OK) { + opal_output(mca_btl_portals_component.portals_output, + "Failure in rdma send event ack\n"); + frag->base.des_cbfunc(&btl->super, + frag->endpoint, + &frag->base, + OMPI_ERROR); + } else { + assert(ev->mlength == frag->segment.seg_len); + + /* let the PML know we're done... */ + frag->base.des_cbfunc(&btl->super, + frag->endpoint, + &frag->base, + OMPI_SUCCESS); + } + break; + + case PTL_EVENT_PUT_START: + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "rdma: PTL_EVENT_PUT_START for 0x%x", + frag)); + + if (ev->ni_fail_type != PTL_NI_OK) { + opal_output(mca_btl_portals_component.portals_output, + "Failure in rdma put start\n"); + /* unlink, since we don't expect to get an end */ + PtlMDUnlink(ev->md_handle); + frag->base.des_cbfunc(&btl->super, + frag->endpoint, + &frag->base, + OMPI_ERROR); + } + break; + + case PTL_EVENT_PUT_END: + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "rdma: PTL_EVENT_PUT_END for 0x%x, Ox%x", + frag, frag->base.des_cbfunc)); + + if (ev->ni_fail_type != PTL_NI_OK) { + opal_output(mca_btl_portals_component.portals_output, + "Failure in rdma put end\n"); + frag->base.des_cbfunc(&btl->super, + frag->endpoint, + &frag->base, + OMPI_ERROR); + } else { + assert(ev->mlength == frag->segment.seg_len); +#if 0 + /* let the PML know we're done... */ + frag->base.des_cbfunc(&btl->super, + frag->endpoint, + &frag->base, + OMPI_SUCCESS); +#endif + } + break; + + default: + break; + } + return OMPI_SUCCESS; } @@ -34,11 +143,55 @@ mca_btl_portals_process_rdma(mca_btl_portals_module_t *module, int mca_btl_portals_put(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* btl_peer, - struct mca_btl_base_descriptor_t* decriptor) + struct mca_btl_base_descriptor_t* descriptor) { - opal_output(mca_btl_portals_component.portals_output, - "Warning: call to unimplemented function put()"); - return OMPI_ERR_NOT_IMPLEMENTED; + mca_btl_portals_frag_t *frag = (mca_btl_portals_frag_t*) descriptor; + ptl_md_t md; + ptl_handle_md_t md_h; + int ret; + + frag->endpoint = btl_peer; + + /* setup the send */ + md.start = frag->segment.seg_addr.pval; + md.length = frag->segment.seg_len; + md.threshold = 3; /* unlink after start, end, ack */ + md.max_size = 0; + md.options = 0; + md.user_ptr = frag; /* keep a pointer to ourselves */ + md.eq_handle = frag->btl->portals_eq_handles[OMPI_BTL_PORTALS_EQ_RDMA]; + + /* make a free-floater */ + ret = PtlMDBind(frag->btl->portals_ni_h, + md, + PTL_UNLINK, + &md_h); + if (ret != PTL_OK) { + opal_output(mca_btl_portals_component.portals_output, + "PtlMDBind failed with error %d", ret); + return OMPI_ERROR; + } + + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "rdma put called for frag 0x%x, callback 0x%xbits %lld", + frag, frag->base.des_cbfunc, frag->base.des_dst[0].seg_key.key64)); + + ret = PtlPut(md_h, + PTL_ACK_REQ, + btl_peer->endpoint_ptl_id, + OMPI_BTL_PORTALS_RDMA_TABLE_ID, + 0, /* ac_index - not used*/ + frag->base.des_dst[0].seg_key.key64, /* match bits */ + 0, /* remote offset - not used */ + frag->hdr.tag); /* hdr_data - tag */ + if (ret != PTL_OK) { + opal_output(mca_btl_portals_component.portals_output, + "PtlPut failed with error %d", ret); + PtlMDUnlink(md_h); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; } diff --git a/ompi/mca/btl/portals/src/btl_portals_recv.c b/ompi/mca/btl/portals/src/btl_portals_recv.c index 67011f2991..7fa5af9c3e 100644 --- a/ompi/mca/btl/portals/src/btl_portals_recv.c +++ b/ompi/mca/btl/portals/src/btl_portals_recv.c @@ -177,6 +177,7 @@ mca_btl_portals_process_recv(mca_btl_portals_module_t *btl, opal_output(mca_btl_portals_component.portals_output, "Failure to start event\n"); } else { + /* increase reference count on the memory chunk */ OPAL_THREAD_ADD32(&(chunk->pending), 1); } break; @@ -188,7 +189,7 @@ mca_btl_portals_process_recv(mca_btl_portals_module_t *btl, if (ev->ni_fail_type != PTL_NI_OK) { opal_output(mca_btl_portals_component.portals_output, "Failure to end event\n"); - OPAL_THREAD_ADD32(&(chunk->pending), -1); + mca_btl_portals_return_chunk_part(btl, chunk); return OMPI_ERROR; } @@ -196,9 +197,10 @@ mca_btl_portals_process_recv(mca_btl_portals_module_t *btl, OPAL_OUTPUT_VERBOSE((95, mca_btl_portals_component.portals_output, "received data for tag %d\n", tag)); - /* it's a user, so we have to manually setup the segment */ + /* grab a user fragment (since memory is already allocated in + as part of the chunk), fill in the right bits, and call the + callback */ OMPI_BTL_PORTALS_FRAG_ALLOC_USER(btl, frag, ret); - frag->type = OMPI_BTL_PORTALS_FRAG_RECV; frag->size = ev->mlength; frag->base.des_dst = &frag->segment; frag->base.des_dst_cnt = 1; @@ -207,9 +209,6 @@ mca_btl_portals_process_recv(mca_btl_portals_module_t *btl, frag->segment.seg_addr.pval = (((char*) ev->md.start) + ev->offset); frag->segment.seg_len = frag->size; - frag->segment.seg_key.key64 = 0; - - frag->u.recv_frag.chunk = chunk; if (ev->md.length - (ev->offset + ev->mlength) < ev->md.max_size) { /* the chunk is full. It's deactivated automagically, but we @@ -221,11 +220,11 @@ mca_btl_portals_process_recv(mca_btl_portals_module_t *btl, } btl->portals_reg[tag].cbfunc(&btl->super, - tag, - &frag->base, - btl->portals_reg[tag].cbdata); - mca_btl_portals_return_chunk_part(btl, frag); + tag, + &frag->base, + btl->portals_reg[tag].cbdata); OMPI_BTL_PORTALS_FRAG_RETURN_USER(&btl->super, frag); + mca_btl_portals_return_chunk_part(btl, chunk); break; default: break; diff --git a/ompi/mca/btl/portals/src/btl_portals_recv.h b/ompi/mca/btl/portals/src/btl_portals_recv.h index 1d75f5d0b7..8351132719 100644 --- a/ompi/mca/btl/portals/src/btl_portals_recv.h +++ b/ompi/mca/btl/portals/src/btl_portals_recv.h @@ -124,9 +124,8 @@ mca_btl_portals_activate_chunk(mca_btl_portals_recv_chunk_t *chunk) static inline void mca_btl_portals_return_chunk_part(mca_btl_portals_module_t *btl, - mca_btl_portals_frag_t *frag) + mca_btl_portals_recv_chunk_t *chunk) { - mca_btl_portals_recv_chunk_t *chunk = frag->u.recv_frag.chunk; int ret; OPAL_OUTPUT_VERBOSE((100, mca_btl_portals_component.portals_output, diff --git a/ompi/mca/btl/portals/src/btl_portals_send.c b/ompi/mca/btl/portals/src/btl_portals_send.c index af30ed5ade..ec87f9c6c8 100644 --- a/ompi/mca/btl/portals/src/btl_portals_send.c +++ b/ompi/mca/btl/portals/src/btl_portals_send.c @@ -41,8 +41,10 @@ mca_btl_portals_process_send(mca_btl_portals_module_t *btl, if (ev->ni_fail_type != PTL_NI_OK) { opal_output(mca_btl_portals_component.portals_output, "Failure to start send event\n"); + /* unlink, since we don't expect to get an end or ack */ + PtlMDUnlink(ev->md_handle); frag->base.des_cbfunc(&btl->super, - frag->u.send_frag.endpoint, + frag->endpoint, &frag->base, OMPI_ERROR); } @@ -55,8 +57,10 @@ mca_btl_portals_process_send(mca_btl_portals_module_t *btl, if (ev->ni_fail_type != PTL_NI_OK) { opal_output(mca_btl_portals_component.portals_output, "Failure to end send event\n"); + /* unlink, since we don't expect to get an ack */ + PtlMDUnlink(ev->md_handle); frag->base.des_cbfunc(&btl->super, - frag->u.send_frag.endpoint, + frag->endpoint, &frag->base, OMPI_ERROR); } @@ -69,17 +73,17 @@ mca_btl_portals_process_send(mca_btl_portals_module_t *btl, OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "send: PTL_EVENT_ACK for 0x%x, Ox%x", frag, frag->base.des_cbfunc)); + if (ev->ni_fail_type != PTL_NI_OK) { opal_output(mca_btl_portals_component.portals_output, - "Failure to end send event\n"); + "Failure in send event ack\n"); frag->base.des_cbfunc(&btl->super, - frag->u.send_frag.endpoint, + frag->endpoint, &frag->base, OMPI_ERROR); } else if (0 == ev->mlength) { /* other side did not receive the message */ - /* BWB - implement check for retransmit */ opal_output(mca_btl_portals_component.portals_output, "message was dropped. Adding to front of queue list"); opal_list_prepend(&(btl->portals_queued_sends), @@ -88,17 +92,12 @@ mca_btl_portals_process_send(mca_btl_portals_module_t *btl, } else { /* the other side received the message */ OPAL_THREAD_ADD32(&btl->portals_outstanding_sends, -1); - /* we're done with the md - return it. Do this before - anything else in case the PML releases resources, then - gets more resources (ie, what's currently in this - md) */ - PtlMDUnlink(ev->md_handle); assert(ev->mlength == frag->segment.seg_len); /* let the PML know we're done... */ frag->base.des_cbfunc(&btl->super, - frag->u.send_frag.endpoint, + frag->endpoint, &frag->base, OMPI_SUCCESS); @@ -131,13 +130,11 @@ mca_btl_portals_send(struct mca_btl_base_module_t* btl_base, int32_t num_sends; int ret; - frag->u.send_frag.endpoint = endpoint; - frag->u.send_frag.hdr.tag = tag; - frag->u.send_frag.btl = btl; + frag->endpoint = endpoint; + frag->hdr.tag = tag; num_sends = OPAL_THREAD_ADD32(&btl->portals_outstanding_sends, 1); - /* BWB - implement check for too many pending messages */ OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "send called for frag 0x%x, 0x%x", frag, frag->base.des_cbfunc)); diff --git a/ompi/mca/btl/portals/src/btl_portals_send.h b/ompi/mca/btl/portals/src/btl_portals_send.h index d7552c9d3b..57a10675e3 100644 --- a/ompi/mca/btl/portals/src/btl_portals_send.h +++ b/ompi/mca/btl/portals/src/btl_portals_send.h @@ -33,14 +33,14 @@ mca_btl_portals_send_frag(mca_btl_portals_frag_t *frag) /* setup the send */ md.start = frag->segment.seg_addr.pval; md.length = frag->segment.seg_len; - md.threshold = PTL_MD_THRESH_INF; /* unlink based on protocol */ + md.threshold = 3; /* unlink after start, end, ack */ md.max_size = 0; md.options = 0; /* BWB - can we optimize? */ md.user_ptr = frag; /* keep a pointer to ourselves */ - md.eq_handle = frag->u.send_frag.btl->portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND]; + md.eq_handle = frag->btl->portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND]; /* make a free-floater */ - ret = PtlMDBind(frag->u.send_frag.btl->portals_ni_h, + ret = PtlMDBind(frag->btl->portals_ni_h, md, PTL_UNLINK, &md_h); @@ -52,12 +52,12 @@ mca_btl_portals_send_frag(mca_btl_portals_frag_t *frag) ret = PtlPut(md_h, PTL_ACK_REQ, - frag->u.send_frag.endpoint->endpoint_ptl_id, + frag->endpoint->endpoint_ptl_id, OMPI_BTL_PORTALS_SEND_TABLE_ID, 0, /* ac_index - not used*/ frag->segment.seg_key.key64, /* match bits */ 0, /* remote offset - not used */ - frag->u.send_frag.hdr.tag); /* hdr_data - tag */ + frag->hdr.tag); /* hdr_data - tag */ if (ret != PTL_OK) { opal_output(mca_btl_portals_component.portals_output, "PtlPut failed with error %d", ret); @@ -81,9 +81,9 @@ mca_btl_portals_progress_queued_sends(struct mca_btl_portals_module_t *btl) "retransmit for frag 0x%x, 0x%x", frag, frag->base.des_cbfunc)); return mca_btl_portals_send(&btl->super, - frag->u.send_frag.endpoint, + frag->endpoint, &(frag->base), - frag->u.send_frag.hdr.tag); + frag->hdr.tag); } return OMPI_SUCCESS; }