diff --git a/ompi/mca/btl/portals/btl_portals.c b/ompi/mca/btl/portals/btl_portals.c index 252b6308b4..7d0c0b444b 100644 --- a/ompi/mca/btl/portals/btl_portals.c +++ b/ompi/mca/btl/portals/btl_portals.c @@ -153,7 +153,7 @@ mca_btl_portals_add_procs(struct mca_btl_base_module_t* btl_base, /* fill in send memory descriptor */ mca_btl_portals_module.md_send.start = NULL; mca_btl_portals_module.md_send.length = 0; - mca_btl_portals_module.md_send.threshold = 2; /* send and ack */ + mca_btl_portals_module.md_send.threshold = PTL_MD_THRESH_INF; mca_btl_portals_module.md_send.max_size = 0; mca_btl_portals_module.md_send.options = PTL_MD_EVENT_START_DISABLE; mca_btl_portals_module.md_send.user_ptr = NULL; @@ -163,9 +163,6 @@ mca_btl_portals_add_procs(struct mca_btl_base_module_t* btl_base, ret = OMPI_SUCCESS; } - opal_output_verbose(50, mca_btl_portals_component.portals_output, - "count: %d", mca_btl_portals_module.portals_num_procs); - return ret; } @@ -233,21 +230,20 @@ mca_btl_base_descriptor_t* mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base, size_t size) { - mca_btl_portals_frag_t* frag; int rc; + mca_btl_portals_frag_t* frag; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); - OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "alloc called with size %d", size)); - if (size <= mca_btl_portals_module.super.btl_eager_limit) { OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, rc); + if (OMPI_SUCCESS != rc) return NULL; frag->segments[0].seg_len = size <= mca_btl_portals_module.super.btl_eager_limit ? size : mca_btl_portals_module.super.btl_eager_limit ; } else { OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(&mca_btl_portals_module, frag, rc); + if (OMPI_SUCCESS != rc) return NULL; frag->segments[0].seg_len = size <= mca_btl_portals_module.super.btl_max_send_size ? 
size : mca_btl_portals_module.super.btl_max_send_size ; @@ -256,10 +252,6 @@ mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base, frag->base.des_src_cnt = 1; frag->base.des_flags = 0; - /* can't setup off an alloc right now - we don't know how much the - caller will actually use */ - frag->md_h = PTL_INVALID_HANDLE; - return &frag->base; } @@ -272,20 +264,23 @@ mca_btl_portals_free(struct mca_btl_base_module_t* btl_base, assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); - if (frag->md_h != PTL_INVALID_HANDLE) { - OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "rdma frag free frag 0x%x, callback 0x%x, bits %lld", - frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64)); - PtlMDUnlink(frag->md_h); - } - - if (frag->size == 0) { - OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); - } else if (frag->size == mca_btl_portals_module.super.btl_eager_limit){ + if (frag->size == mca_btl_portals_module.super.btl_eager_limit){ + /* don't ever unlink eager frags */ OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module.super, frag); } else if (frag->size == mca_btl_portals_module.super.btl_max_send_size) { + if (frag->md_h != PTL_INVALID_HANDLE) { + PtlMDUnlink(frag->md_h); + frag->md_h = PTL_INVALID_HANDLE; + } OMPI_BTL_PORTALS_FRAG_RETURN_MAX(&mca_btl_portals_module.super, frag); - } else { + } else if (frag->size == 0) { + if (frag->md_h != PTL_INVALID_HANDLE) { + PtlMDUnlink(frag->md_h); + frag->md_h = PTL_INVALID_HANDLE; + } + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); + } else { return OMPI_ERR_BAD_PARAM; } @@ -310,165 +305,100 @@ mca_btl_portals_prepare_src(struct mca_btl_base_module_t* btl_base, assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); - OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "prepare_src called with size %d", *size)); - 
- if (0 != ompi_convertor_need_buffers(convertor)) { - /* if we need to use buffers to pack the data, grab either an - eager or (if we need more space) max buffer, pack the data - into the first segment, and return */ - if (max_data+reserve <= mca_btl_portals_module.super.btl_eager_limit) { - /* - * if we can't send out of the buffer directly and the - * requested size is less than the eager limit, pack into a - * fragment from the eager pool - */ - OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, ret); - if (NULL == frag) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve; - ret = ompi_convertor_pack(convertor, &iov, &iov_count, - &max_data, &free_after); - *size = max_data; - if (ret < 0) { - OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module, frag); - return NULL; - } - frag->segments[0].seg_len = max_data + reserve; - frag->base.des_src_cnt = 1; - - } else { - /* - * otherwise pack as much data as we can into a fragment - * that is the max send size. 
- */ - OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(&mca_btl_portals_module, frag, ret); - if (NULL == frag) { - return NULL; - } - if (max_data + reserve > mca_btl_portals_module.super.btl_max_send_size){ - max_data = mca_btl_portals_module.super.btl_max_send_size - reserve; - } - iov.iov_len = max_data; - iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve; - ret = ompi_convertor_pack(convertor, &iov, &iov_count, - &max_data, &free_after); - *size = max_data; - if ( ret < 0 ) { - OMPI_BTL_PORTALS_FRAG_RETURN_MAX(&mca_btl_portals_module, frag); - return NULL; - } - frag->segments[0].seg_len = max_data + reserve; - frag->base.des_src_cnt = 1; + if (0 != reserve || 0 != ompi_convertor_need_buffers(convertor)) { + frag = (mca_btl_portals_frag_t*) + mca_btl_portals_alloc(btl_base, max_data + reserve); + if (NULL == frag) { + return NULL; } - /* clearly a send - delay setup of memory descriptor until send */ - frag->md_h = PTL_INVALID_HANDLE; + if (max_data + reserve > frag->size) { + max_data = frag->size - reserve; + } + iov.iov_len = max_data; + iov.iov_base = (unsigned char*) frag->segments[0].seg_addr.pval + reserve; + ret = ompi_convertor_pack(convertor, &iov, &iov_count, + &max_data, &free_after); + *size = max_data; + if ( ret < 0 ) { /* pack failed: hand the frag back rather than leaking it */ + mca_btl_portals_free(btl_base, &frag->base); return NULL; + } + + frag->segments[0].seg_len = max_data + reserve; + frag->base.des_src_cnt = 1; } else { - /* no need to pack - we can send directly out of the user's - buffer. If we have reserve space, use an eager fragment - and give the caller the eager space as reserve. 
If we have - no reserve space needs, use a user frag */ - if (0 == reserve) { - ptl_md_t md; - ptl_handle_me_t me_h; + /* no need to pack - rdma operation out of user's buffer */ + ptl_md_t md; + ptl_handle_me_t me_h; - /* user frags are always setup to use only one fragment */ - OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret); - if(NULL == frag){ - return NULL; - } - iov.iov_len = max_data; - iov.iov_base = NULL; + /* reserve space in the event queue for rdma operations immediately */ + while (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) > + mca_btl_portals_module.portals_max_outstanding_ops) { + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + mca_btl_portals_component_progress(); + } - ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, - &free_after); + OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret); + if(NULL == frag){ + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + return NULL; + } + iov.iov_len = max_data; + iov.iov_base = NULL; - frag->segments[0].seg_len = max_data; - frag->segments[0].seg_addr.pval = iov.iov_base; - frag->segments[0].seg_key.key64 = OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1); - frag->base.des_src_cnt = 1; + ompi_convertor_pack(convertor, &iov, &iov_count, &max_data, + &free_after); - /* either a put or get. figure out which later */ + frag->segments[0].seg_len = max_data; + frag->segments[0].seg_addr.pval = iov.iov_base; + frag->segments[0].seg_key.key64 = + OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1); + frag->base.des_src_cnt = 1; - OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "rdma src posted for frag 0x%x, callback 0x%x, bits %lld", - frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64)); + /* either a put or get. 
figure out which later */ + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "rdma src posted for frag 0x%x, callback 0x%x, bits %lld", + frag, frag->base.des_cbfunc, frag->segments[0].seg_key.key64)); - /* create a match entry */ - ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h, - OMPI_BTL_PORTALS_RDMA_TABLE_ID, - *((mca_btl_base_endpoint_t*) peer), - frag->segments[0].seg_key.key64, /* match */ - 0, /* ignore */ - PTL_UNLINK, - PTL_INS_AFTER, - &me_h); - if (PTL_OK != ret) { - opal_output(mca_btl_portals_component.portals_output, - "Error creating rdma src ME: %d", ret); - OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); - return NULL; - } + /* create a match entry */ + ret = PtlMEAttach(mca_btl_portals_module.portals_ni_h, + OMPI_BTL_PORTALS_RDMA_TABLE_ID, + *((mca_btl_base_endpoint_t*) peer), + frag->segments[0].seg_key.key64, /* match */ + 0, /* ignore */ + PTL_UNLINK, + PTL_INS_AFTER, + &me_h); + if (PTL_OK != ret) { + opal_output(mca_btl_portals_component.portals_output, + "Error creating rdma src ME: %d", ret); + OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + return NULL; + } - /* setup the memory descriptor. RDMA should never need to be - retransmitted, so we set the threshold for the event it will - receive (PUT/GET START and END). 
No need to track the unlinks - later :) */ - md.start = frag->segments[0].seg_addr.pval; - md.length = frag->segments[0].seg_len; - md.threshold = PTL_MD_THRESH_INF; - md.max_size = 0; - md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; - md.user_ptr = frag; /* keep a pointer to ourselves */ - md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ]; + /* setup the memory descriptor */ + md.start = frag->segments[0].seg_addr.pval; + md.length = frag->segments[0].seg_len; + md.threshold = PTL_MD_THRESH_INF; + md.max_size = 0; + md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; + md.user_ptr = frag; /* keep a pointer to ourselves */ + md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND]; - ret = PtlMDAttach(me_h, - md, - PTL_UNLINK, - &(frag->md_h)); - if (PTL_OK != ret) { - opal_output(mca_btl_portals_component.portals_output, - "Error creating rdma src MD: %d", ret); - PtlMEUnlink(me_h); - OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); - return NULL; - } - - } else { - OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, ret); - if (NULL == frag) { - return NULL; - } - - iov.iov_len = max_data; - iov.iov_base = NULL; - ret = ompi_convertor_pack(convertor, &iov, &iov_count, - &max_data, &free_after); - - *size = max_data; - if (ret < 0) { - OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(&mca_btl_portals_module, frag); - return NULL; - } - - frag->segments[0].seg_len = reserve; - frag->segments[1].seg_addr.pval = iov.iov_base; - frag->segments[1].seg_len = max_data; - frag->base.des_src_cnt = 2; - - frag->iov[0].iov_base = frag->segments[0].seg_addr.pval; - frag->iov[0].iov_len = frag->segments[0].seg_len; - frag->iov[1].iov_base = frag->segments[1].seg_addr.pval; - frag->iov[1].iov_len = frag->segments[1].seg_len; - - /* clearly a send - delay setup of memory descriptor until send */ - frag->md_h = PTL_INVALID_HANDLE; + ret = PtlMDAttach(me_h, + 
md, + PTL_UNLINK, + &(frag->md_h)); + if (PTL_OK != ret) { + opal_output(mca_btl_portals_component.portals_output, + "Error creating rdma src MD: %d", ret); + PtlMEUnlink(me_h); + OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + return NULL; } } @@ -497,22 +427,29 @@ mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base, assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); + /* reserve space in the event queue for rdma operations immediately */ + while (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) > + mca_btl_portals_module.portals_max_outstanding_ops) { + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + mca_btl_portals_component_progress(); + } + OMPI_BTL_PORTALS_FRAG_ALLOC_USER(&mca_btl_portals_module.super, frag, ret); if(NULL == frag) { + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); return NULL; } ompi_ddt_type_lb(convertor->pDesc, &lb); frag->segments[0].seg_len = *size; frag->segments[0].seg_addr.pval = convertor->pBaseBuf + lb + convertor->bConverted; - frag->segments[0].seg_key.key64 = OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1); - + frag->segments[0].seg_key.key64 = + OPAL_THREAD_ADD64(&(mca_btl_portals_module.portals_rdma_key), 1); frag->base.des_src = NULL; frag->base.des_src_cnt = 0; frag->base.des_dst = frag->segments; frag->base.des_dst_cnt = 1; frag->base.des_flags = 0; - frag->type = mca_btl_portals_frag_type_rdma; OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "rdma dest posted for frag 0x%x, callback 0x%x, bits %lld", @@ -530,21 +467,19 @@ mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base, if (PTL_OK != ret) { opal_output(mca_btl_portals_component.portals_output, "Error creating rdma dest ME: %d", ret); + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); 
OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); return NULL; } - /* setup the memory descriptor. RDMA should never need to be - retransmitted, so we set the threshold for the event it will - receive (PUT/GET START and END). No need to track the unlinks - later :) */ + /* setup the memory descriptor. */ md.start = frag->segments[0].seg_addr.pval; md.length = frag->segments[0].seg_len; md.threshold = PTL_MD_THRESH_INF; md.max_size = 0; md.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_EVENT_START_DISABLE; md.user_ptr = frag; /* keep a pointer to ourselves */ - md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ]; + md.eq_handle = mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND]; ret = PtlMDAttach(me_h, md, @@ -554,6 +489,7 @@ mca_btl_portals_prepare_dst(struct mca_btl_base_module_t* btl_base, opal_output(mca_btl_portals_component.portals_output, "Error creating rdma dest MD: %d", ret); PtlMEUnlink(me_h); + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); OMPI_BTL_PORTALS_FRAG_RETURN_USER(&mca_btl_portals_module.super, frag); return NULL; } @@ -570,16 +506,10 @@ mca_btl_portals_finalize(struct mca_btl_base_module_t *btl_base) assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); /* finalize all communication */ - while (mca_btl_portals_module.portals_outstanding_sends > 0) { + while (mca_btl_portals_module.portals_outstanding_ops > 0) { mca_btl_portals_component_progress(); } - if (0 != opal_list_get_size(&(mca_btl_portals_module.portals_queued_sends))) { - opal_output(mca_btl_portals_component.portals_output, - "Warning: there were %d queued sends not sent", - opal_list_get_size(&(mca_btl_portals_module.portals_queued_sends))); - } - if (mca_btl_portals_module.portals_num_procs != 0) { int i; @@ -597,7 +527,10 @@ mca_btl_portals_finalize(struct mca_btl_base_module_t *btl_base) } OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_blocks); - 
OBJ_DESTRUCT(&mca_btl_portals_module.portals_queued_sends); + OBJ_DESTRUCT(&mca_btl_portals_module.portals_recv_frag); + OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_eager); + OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_max); + OBJ_DESTRUCT(&mca_btl_portals_module.portals_frag_user); if (PTL_INVALID_HANDLE != mca_btl_portals_module.portals_ni_h) { ret = PtlNIFini(mca_btl_portals_module.portals_ni_h); diff --git a/ompi/mca/btl/portals/btl_portals.h b/ompi/mca/btl/portals/btl_portals.h index ab47a7d6d6..90d7dbb4ea 100644 --- a/ompi/mca/btl/portals/btl_portals.h +++ b/ompi/mca/btl/portals/btl_portals.h @@ -31,6 +31,7 @@ #include "orte/class/orte_proc_table.h" #include "btl_portals_endpoint.h" +#include "btl_portals_frag.h" #define OMPI_BTL_PORTALS_SEND_TABLE_ID (OMPI_BTL_PORTALS_STARTING_TABLE_ID + 0) #define OMPI_BTL_PORTALS_RDMA_TABLE_ID (OMPI_BTL_PORTALS_STARTING_TABLE_ID + 1) @@ -66,12 +67,15 @@ struct mca_btl_portals_component_t { int portals_free_list_max_num; /* numer of elements to grow free lists */ int portals_free_list_inc_num; + + /* number of eager fragments */ + int portals_free_list_eager_max_num; }; typedef struct mca_btl_portals_component_t mca_btl_portals_component_t; #define OMPI_BTL_PORTALS_EQ_SEND 0 -#define OMPI_BTL_PORTALS_EQ 1 +#define OMPI_BTL_PORTALS_EQ_RECV 1 #define OMPI_BTL_PORTALS_EQ_SIZE 2 struct mca_btl_portals_module_t { @@ -89,13 +93,15 @@ struct mca_btl_portals_module_t { ompi_free_list_t portals_frag_eager; ompi_free_list_t portals_frag_max; ompi_free_list_t portals_frag_user; - ompi_free_list_t portals_frag_recv; /* incoming send message receive memory descriptors */ int portals_recv_mds_num; int portals_recv_mds_size; opal_list_t portals_recv_blocks; + /* frag for receive callbacks */ + mca_btl_portals_frag_recv_t portals_recv_frag; + /* event queues. 
Keep sends on own eq, since we can't control space for the ack otherwise */ int portals_eq_sizes[OMPI_BTL_PORTALS_EQ_SIZE]; @@ -104,11 +110,11 @@ struct mca_btl_portals_module_t { /* "reject" entry for recv match list */ ptl_handle_me_t portals_recv_reject_me_h; - /* number outstanding sends */ - volatile int32_t portals_outstanding_sends; - int32_t portals_max_outstanding_sends; + /* number outstanding sends and local rdma */ + volatile int32_t portals_outstanding_ops; + int32_t portals_max_outstanding_ops; - /* queued sends */ + /* sends queued until there's time to send */ opal_list_t portals_queued_sends; /* key to use for next rdma operation */ diff --git a/ompi/mca/btl/portals/btl_portals_compat_utcp.c b/ompi/mca/btl/portals/btl_portals_compat_utcp.c index 002b5961e4..ec3f6341b1 100644 --- a/ompi/mca/btl/portals/btl_portals_compat_utcp.c +++ b/ompi/mca/btl/portals/btl_portals_compat_utcp.c @@ -119,7 +119,7 @@ mca_btl_portals_add_procs_compat(struct mca_btl_portals_module_t* btl, int ret; if (use_modex) { - int my_rid; + int my_rid = 0; ptl_process_id_t *info; char *nidmap = NULL; char *pidmap = NULL; diff --git a/ompi/mca/btl/portals/btl_portals_component.c b/ompi/mca/btl/portals/btl_portals_component.c index b7b378ecc6..acf4b29a66 100644 --- a/ompi/mca/btl/portals/btl_portals_component.c +++ b/ompi/mca/btl/portals/btl_portals_component.c @@ -20,6 +20,7 @@ #include #include +#include #if OMPI_BTL_PORTALS_REDSTORM #include #endif @@ -91,11 +92,11 @@ mca_btl_portals_component_open(void) "Debugging verbosity (0 - 100)", false, false, - OMPI_BTL_PORTALS_DEFAULT_DEBUG_LEVEL, + 0, &(portals_output_stream.lds_verbose_level)); #if OMPI_BTL_PORTALS_REDSTORM asprintf(&(portals_output_stream.lds_prefix), - "btl: portals (%2d): ", cnos_get_rank()); + "btl: portals (%5d): ", cnos_get_rank()); #else asprintf(&(portals_output_stream.lds_prefix), "btl: portals (%5d): ", getpid()); @@ -118,22 +119,29 @@ mca_btl_portals_component_open(void) "Initial number of elements to 
initialize in free lists", false, false, - OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_INIT_NUM, + 16, &(mca_btl_portals_component.portals_free_list_init_num)); mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, "free_list_max_num", "Max number of elements to initialize in free lists", false, false, - OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_MAX_NUM, + 1024, &(mca_btl_portals_component.portals_free_list_max_num)); mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, "free_list_inc_num", "Increment count for free lists", false, false, - OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_INC_NUM, + 16, &(mca_btl_portals_component.portals_free_list_inc_num)); + mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, + "eager_frag_limit", + "Maximum number of pre-pinned eager fragments", + false, + false, + 32, + &(mca_btl_portals_component.portals_free_list_eager_max_num)); /* * fill default module state @@ -143,7 +151,7 @@ mca_btl_portals_component_open(void) "Maximum size for eager frag", false, false, - OMPI_BTL_PORTALS_DEFAULT_EAGER_LIMIT, + 32 * 1024, &dummy); mca_btl_portals_module.super.btl_eager_limit = dummy; @@ -152,7 +160,7 @@ mca_btl_portals_component_open(void) "Minimum size for a send frag", false, false, - OMPI_BTL_PORTALS_DEFAULT_MIN_SEND_SIZE, + 32 * 1024, &dummy); mca_btl_portals_module.super.btl_min_send_size = dummy; mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, @@ -160,7 +168,7 @@ mca_btl_portals_component_open(void) "Maximum size for a send frag", false, false, - OMPI_BTL_PORTALS_DEFAULT_MAX_SEND_SIZE, + 64 * 1024, &dummy); mca_btl_portals_module.super.btl_max_send_size = dummy; mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, @@ -168,7 +176,7 @@ mca_btl_portals_component_open(void) "Minimum size for a rdma frag", false, false, - OMPI_BTL_PORTALS_DEFAULT_MIN_RDMA_SIZE, + 64 * 1024, &dummy); mca_btl_portals_module.super.btl_min_rdma_size = dummy; 
mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, @@ -176,7 +184,7 @@ mca_btl_portals_component_open(void) "Maximum size for a rdma frag", false, false, - OMPI_BTL_PORTALS_DEFAULT_MAX_RDMA_SIZE, + INT_MAX, &dummy); mca_btl_portals_module.super.btl_max_rdma_size = dummy; @@ -205,11 +213,10 @@ mca_btl_portals_component_open(void) &dummy); mca_btl_portals_module.super.btl_bandwidth = dummy; -#if 0 /* it appears that copying is faster than iovecs at present */ - mca_btl_portals_module.super.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE; -#else + /* send in place actually increases our latency because we have to + hold on to the buffer until we're done with it, rather than + copy and send. So don't use it for now. */ mca_btl_portals_module.super.btl_flags = MCA_BTL_FLAGS_RDMA; -#endif mca_btl_portals_module.portals_num_procs = 0; bzero(&(mca_btl_portals_module.portals_reg), @@ -222,23 +229,23 @@ mca_btl_portals_component_open(void) /* eq handles will be created when the module is instantiated. 
Set sizes here */ mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, - "eq_size", - "Size of the event queue", + "eq_recv_size", + "Size of the receive event queue", false, false, - OMPI_BTL_PORTALS_DEFAULT_RECV_QUEUE_SIZE, - &(mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ])); + 16 * 1024, + &(mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ_RECV])); mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, - "eq_send_max_pending", - "Maximum number of pending send frags", + "max_pending_ops", + "Maximum number of pending send/rdma frags", false, false, - OMPI_BTL_PORTALS_MAX_SENDS_PENDING, - &(mca_btl_portals_module.portals_max_outstanding_sends)); - /* sends_pending * 2 for end, ack */ + 8 * 1024, + &(mca_btl_portals_module.portals_max_outstanding_ops)); + /* ops_pending * 2 for end, ack */ mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ_SEND] = - mca_btl_portals_module.portals_max_outstanding_sends * 2; + mca_btl_portals_module.portals_max_outstanding_ops * 2; mca_btl_portals_module.portals_recv_reject_me_h = PTL_INVALID_HANDLE; @@ -247,19 +254,19 @@ mca_btl_portals_component_open(void) "Number of send frag receive descriptors", false, false, - OMPI_BTL_PORTALS_DEFAULT_RECV_MD_NUM, + 3, &(mca_btl_portals_module.portals_recv_mds_num)); mca_base_param_reg_int(&mca_btl_portals_component.super.btl_version, "recv_md_size", "Size of send frag receive descriptors", false, false, - OMPI_BTL_PORTALS_DEFAULT_RECV_MD_SIZE, + 10 * 1024 * 1024, &(mca_btl_portals_module.portals_recv_mds_size)); mca_btl_portals_module.portals_ni_h = PTL_INVALID_HANDLE; mca_btl_portals_module.portals_sr_dropped = 0; - mca_btl_portals_module.portals_outstanding_sends = 0; + mca_btl_portals_module.portals_outstanding_ops = 0; mca_btl_portals_module.portals_rdma_key = 1; return OMPI_SUCCESS; @@ -315,7 +322,6 @@ mca_btl_portals_component_init(int *num_btls, OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_frag_eager), 
ompi_free_list_t); OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_frag_max), ompi_free_list_t); OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_frag_user), ompi_free_list_t); - OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_frag_recv), ompi_free_list_t); /* eager frags */ ompi_free_list_init(&(mca_btl_portals_module.portals_frag_eager), @@ -323,7 +329,7 @@ mca_btl_portals_component_init(int *num_btls, mca_btl_portals_module.super.btl_eager_limit, OBJ_CLASS(mca_btl_portals_frag_eager_t), mca_btl_portals_component.portals_free_list_init_num, - mca_btl_portals_component.portals_free_list_max_num, + mca_btl_portals_component.portals_free_list_eager_max_num, mca_btl_portals_component.portals_free_list_inc_num, NULL); @@ -347,19 +353,16 @@ mca_btl_portals_component_init(int *num_btls, NULL); /* recv frags */ - ompi_free_list_init(&(mca_btl_portals_module.portals_frag_recv), - sizeof(mca_btl_portals_frag_recv_t), - OBJ_CLASS(mca_btl_portals_frag_recv_t), - mca_btl_portals_component.portals_free_list_init_num, - mca_btl_portals_component.portals_free_list_max_num, - mca_btl_portals_component.portals_free_list_inc_num, - NULL); + OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_recv_frag), + mca_btl_portals_frag_recv_t); /* receive block list */ OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_recv_blocks), opal_list_t); - /* pending sends */ - OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_queued_sends), opal_list_t); + /* list for send requests that have to be delayed */ + OBJ_CONSTRUCT(&(mca_btl_portals_module.portals_queued_sends), + opal_list_t); + *num_btls = 1; opal_output_verbose(20, mca_btl_portals_component.portals_output, @@ -386,18 +389,9 @@ mca_btl_portals_component_progress(void) while (true) { ret = PtlEQPoll(mca_btl_portals_module.portals_eq_handles, OMPI_BTL_PORTALS_EQ_SIZE, -#if OMPI_BTL_PORTALS_REDSTORM - 0, /* timeout */ -#else - /* with a timeout of 0, the reference - implementation seems to get really unhappy - really fast when communication starts between 
- all peers at the same time. Slowing things - down a bit seems to help a bunch. */ - 1, /* timeout */ -#endif - &ev, - &which); + 0, /* timeout */ + &ev, /* event structure to update */ + &which); /* which queue the event came from - we don't care */ switch (ret) { case PTL_OK: frag = ev.md.user_ptr; @@ -406,7 +400,6 @@ mca_btl_portals_component_progress(void) switch (ev.type) { case PTL_EVENT_GET_START: /* generated on source (target) when a get from memory starts */ - OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output, "PTL_EVENT_GET_START for 0x%x, %d", frag, (int) ev.hdr_data)); @@ -415,7 +408,6 @@ mca_btl_portals_component_progress(void) case PTL_EVENT_GET_END: /* generated on source (target) when a get from memory ends */ - OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output, "PTL_EVENT_GET_END for 0x%x, %d", frag, (int) ev.hdr_data)); @@ -424,7 +416,6 @@ mca_btl_portals_component_progress(void) case PTL_EVENT_PUT_START: /* generated on destination (target) when a put into memory starts */ - OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output, "PTL_EVENT_PUT_START for 0x%x, %d", frag, (int) ev.hdr_data)); @@ -446,7 +437,6 @@ mca_btl_portals_component_progress(void) case PTL_EVENT_PUT_END: /* generated on destination (target) when a put into memory ends */ - OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output, "PTL_EVENT_PUT_END for 0x%x, %d", frag, (int) ev.hdr_data)); @@ -465,13 +455,15 @@ mca_btl_portals_component_progress(void) block = ev.md.user_ptr; tag = ev.hdr_data; - OMPI_BTL_PORTALS_FRAG_ALLOC_RECV(&mca_btl_portals_module, frag, ret); + /* if we ever make this thread hot, need to do + something with the receive fragments */ + frag = &mca_btl_portals_module.portals_recv_frag; frag->segments[0].seg_addr.pval = (((char*) ev.md.start) + ev.offset); frag->segments[0].seg_len = ev.mlength; OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "received send fragment %x (thresh: 
%d)", - frag, ev.md.threshold)); + "received send fragment %x (thresh: %d, length %d)", + frag, ev.md.threshold, (int) ev.mlength)); if (ev.md.length - (ev.offset + ev.mlength) < ev.md.max_size || ev.md.threshold == 1) { @@ -491,8 +483,6 @@ mca_btl_portals_component_progress(void) tag, &frag->base, mca_btl_portals_module.portals_reg[tag].cbdata); - OMPI_BTL_PORTALS_FRAG_RETURN_RECV(&mca_btl_portals_module.super, - frag); mca_btl_portals_return_block_part(&mca_btl_portals_module, block); } break; @@ -502,8 +492,8 @@ mca_btl_portals_component_progress(void) returning data */ OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output, - "PTL_EVENT_REPLY_START for 0x%x, %d, %d", - frag, (int) frag->type, (int) ev.hdr_data)); + "PTL_EVENT_REPLY_START for 0x%x, %d", + frag, (int) ev.hdr_data)); break; @@ -512,8 +502,7 @@ mca_btl_portals_component_progress(void) done returning data */ OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "PTL_EVENT_REPLY_END for 0x%x, %d", - frag, (int) frag->type)); + "PTL_EVENT_REPLY_END for 0x%x", frag)); /* let the PML know we're done */ frag->base.des_cbfunc(&mca_btl_portals_module.super, @@ -528,18 +517,12 @@ mca_btl_portals_component_progress(void) #if OMPI_ENABLE_DEBUG OPAL_OUTPUT_VERBOSE((900, mca_btl_portals_component.portals_output, - "PTL_EVENT_SEND_START for 0x%x, %d, %d", - frag, (int) frag->type, (int) ev.hdr_data)); + "PTL_EVENT_SEND_START for 0x%x, %d", + frag, (int) ev.hdr_data)); if (ev.ni_fail_type != PTL_NI_OK) { opal_output(mca_btl_portals_component.portals_output, "Failure to start send event\n"); - if (ev.hdr_data < MCA_BTL_TAG_MAX) { - OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends, - -1); - /* unlink, since we don't expect to get an end or ack */ - } - PtlMDUnlink(ev.md_handle); frag->base.des_cbfunc(&mca_btl_portals_module.super, frag->endpoint, &frag->base, @@ -552,18 +535,12 @@ mca_btl_portals_component_progress(void) /* generated on source (origin) when put stops 
sending */ #if OMPI_ENABLE_DEBUG OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "PTL_EVENT_SEND_END for 0x%x, %d, %d", - frag, (int) frag->type, (int) ev.hdr_data)); + "PTL_EVENT_SEND_END for 0x%x, %d", + frag, (int) ev.hdr_data)); if (ev.ni_fail_type != PTL_NI_OK) { opal_output(mca_btl_portals_component.portals_output, "Failure to end send event\n"); - if (ev.hdr_data < MCA_BTL_TAG_MAX) { - /* unlink, since we don't expect to get an ack */ - OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends, - -1); - PtlMDUnlink(ev.md_handle); - } frag->base.des_cbfunc(&mca_btl_portals_module.super, frag->endpoint, &frag->base, @@ -580,20 +557,12 @@ mca_btl_portals_component_progress(void) Requeue the put on badness */ OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "PTL_EVENT_ACK for 0x%x, %d", - frag, (int) frag->type)); - - if (frag->type == mca_btl_portals_frag_type_send) { - OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends, - -1); - } + "PTL_EVENT_ACK for 0x%x", frag)); #if OMPI_ENABLE_DEBUG if (ev.ni_fail_type != PTL_NI_OK) { opal_output(mca_btl_portals_component.portals_output, "Failure to ack event\n"); - /* unlink, since we don't expect to get an ack */ - PtlMDUnlink(ev.md_handle); frag->base.des_cbfunc(&mca_btl_portals_module.super, frag->endpoint, &frag->base, @@ -608,14 +577,15 @@ mca_btl_portals_component_progress(void) buffer space available for receiving */ opal_output_verbose(50, mca_btl_portals_component.portals_output, - "message was dropped. Adding to front of queue list"); - opal_list_prepend(&(mca_btl_portals_module.portals_queued_sends), - (opal_list_item_t*) frag); - + "message was dropped. Trying again"); + + mca_btl_portals_send(&mca_btl_portals_module.super, + frag->endpoint, + &frag->base, + frag->hdr.tag); } else { /* other side received the message. 
should have received entire thing */ - /* let the PML know we're done */ frag->base.des_cbfunc(&mca_btl_portals_module.super, frag->endpoint, @@ -623,7 +593,11 @@ mca_btl_portals_component_progress(void) OMPI_SUCCESS); } - if (frag->type == mca_btl_portals_frag_type_send) { + opal_output_verbose(50, mca_btl_portals_component.portals_output, "PTL_EVENT_ACK processed"); + + if (0 != frag->size) { + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, + -1); MCA_BTL_PORTALS_PROGRESS_QUEUED_SENDS(); } diff --git a/ompi/mca/btl/portals/btl_portals_frag.c b/ompi/mca/btl/portals/btl_portals_frag.c index 659920313d..da4191ddfd 100644 --- a/ompi/mca/btl/portals/btl_portals_frag.c +++ b/ompi/mca/btl/portals/btl_portals_frag.c @@ -34,9 +34,7 @@ mca_btl_portals_frag_common_send_constructor(mca_btl_portals_frag_t* frag) frag->segments[0].seg_len = frag->size; frag->segments[0].seg_key.key64 = 0; - frag->segments[1].seg_addr.pval = 0; - frag->segments[1].seg_len = 0; - frag->segments[1].seg_key.key64 = 0; + frag->md_h = PTL_INVALID_HANDLE; } @@ -48,6 +46,16 @@ mca_btl_portals_frag_eager_constructor(mca_btl_portals_frag_t* frag) } +static void +mca_btl_portals_frag_eager_destructor(mca_btl_portals_frag_t* frag) +{ + if (PTL_INVALID_HANDLE != frag->md_h) { /* only unlink a memory descriptor that was actually bound */ + PtlMDUnlink(frag->md_h); + frag->md_h = PTL_INVALID_HANDLE; + } +} + + static void mca_btl_portals_frag_max_constructor(mca_btl_portals_frag_t* frag) { @@ -76,7 +84,6 @@ mca_btl_portals_frag_recv_constructor(mca_btl_portals_frag_t* frag) frag->base.des_src = NULL; frag->base.des_src_cnt = 0; frag->size = 0; - frag->type = mca_btl_portals_frag_type_recv; } @@ -90,7 +97,7 @@ OBJ_CLASS_INSTANCE( mca_btl_portals_frag_eager_t, mca_btl_base_descriptor_t, mca_btl_portals_frag_eager_constructor, - NULL); + mca_btl_portals_frag_eager_destructor); OBJ_CLASS_INSTANCE( mca_btl_portals_frag_max_t, diff --git a/ompi/mca/btl/portals/btl_portals_frag.h b/ompi/mca/btl/portals/btl_portals_frag.h index 62e8b4384f..439dde1ef5 100644 ---
a/ompi/mca/btl/portals/btl_portals_frag.h +++ b/ompi/mca/btl/portals/btl_portals_frag.h @@ -30,16 +30,15 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_portals_frag_t); */ struct mca_btl_portals_frag_t { mca_btl_base_descriptor_t base; - mca_btl_base_segment_t segments[2]; - ptl_md_iovec_t iov[2]; + mca_btl_base_segment_t segments[1]; /* needed for retransmit case */ struct mca_btl_base_endpoint_t *endpoint; + /* needed for retransmit case */ mca_btl_base_header_t hdr; - enum { mca_btl_portals_frag_type_send, - mca_btl_portals_frag_type_recv, - mca_btl_portals_frag_type_rdma} type; /* handle to use for communication */ ptl_handle_md_t md_h; + /* size of the allocated memory region -- not the amount of data + we need to send */ size_t size; }; @@ -59,69 +58,61 @@ OBJ_CLASS_DECLARATION(mca_btl_portals_frag_user_t); typedef struct mca_btl_portals_frag_t mca_btl_portals_frag_recv_t; OBJ_CLASS_DECLARATION(mca_btl_portals_frag_recv_t); + /* * Macros to allocate/return descriptors from module specific * free list(s). 
*/ -#define OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl_macro, frag, rc) \ +#define OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(btl_macro, frag, rc) \ { \ \ opal_list_item_t *item; \ - OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, item, rc); \ - frag = (mca_btl_portals_frag_t*) item; \ + OMPI_FREE_LIST_GET(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, item, rc); \ + frag = (mca_btl_portals_frag_t*) item; \ + if (rc == OMPI_ERR_TEMP_OUT_OF_RESOURCE) { /* eager list exhausted: fall back to a max-size frag, which sets frag and rc itself */ \ + OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc); \ + } \ } -#define OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(btl_macro, frag) \ + +#define OMPI_BTL_PORTALS_FRAG_RETURN_EAGER(btl_macro, frag) \ { \ OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_eager, \ (opal_list_item_t*)(frag)); \ } -#define OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc) \ +#define OMPI_BTL_PORTALS_FRAG_ALLOC_MAX(btl_macro, frag, rc) \ { \ \ opal_list_item_t *item; \ OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_max, item, rc); \ - frag = (mca_btl_portals_frag_t*) item; \ + frag = (mca_btl_portals_frag_t*) item; \ } -#define OMPI_BTL_PORTALS_FRAG_RETURN_MAX(btl_macro, frag) \ + +#define OMPI_BTL_PORTALS_FRAG_RETURN_MAX(btl_macro, frag) \ { \ OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_max, \ (opal_list_item_t*)(frag)); \ } -#define OMPI_BTL_PORTALS_FRAG_ALLOC_USER(btl_macro, frag, rc) \ +#define OMPI_BTL_PORTALS_FRAG_ALLOC_USER(btl_macro, frag, rc) \ { \ opal_list_item_t *item; \ OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_user, item, rc); \ - frag = (mca_btl_portals_frag_t*) item; \ + frag = (mca_btl_portals_frag_t*) item; \ } -#define OMPI_BTL_PORTALS_FRAG_RETURN_USER(btl_macro, frag) \ + +#define OMPI_BTL_PORTALS_FRAG_RETURN_USER(btl_macro, frag) \ { \ OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_user, \ - (opal_list_item_t*)(frag)); \ +
(opal_list_item_t*)(frag)); \ } -#define OMPI_BTL_PORTALS_FRAG_ALLOC_RECV(btl_macro, frag, rc) \ -{ \ - opal_list_item_t *item; \ - OMPI_FREE_LIST_WAIT(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_recv, item, rc); \ - frag = (mca_btl_portals_frag_t*) item; \ -} - -#define OMPI_BTL_PORTALS_FRAG_RETURN_RECV(btl_macro, frag) \ -{ \ - OMPI_FREE_LIST_RETURN(&((mca_btl_portals_module_t*)btl_macro)->portals_frag_recv, \ - (opal_list_item_t*)(frag)); \ -} - - - #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/ompi/mca/btl/portals/btl_portals_rdma.c b/ompi/mca/btl/portals/btl_portals_rdma.c index 68dd364071..96f2305f94 100644 --- a/ompi/mca/btl/portals/btl_portals_rdma.c +++ b/ompi/mca/btl/portals/btl_portals_rdma.c @@ -42,7 +42,6 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base, frag->endpoint = btl_peer; frag->hdr.tag = MCA_BTL_TAG_MAX; - frag->type = mca_btl_portals_frag_type_rdma; /* setup the send */ assert(1 == frag->base.des_src_cnt); @@ -58,7 +57,6 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base, if (ret != PTL_OK) { opal_output(mca_btl_portals_component.portals_output, "PtlPut failed with error %d", ret); - PtlMDUnlink(frag->md_h); return OMPI_ERROR; } @@ -83,7 +81,6 @@ mca_btl_portals_get(struct mca_btl_base_module_t* btl_base, frag->endpoint = btl_peer; frag->hdr.tag = MCA_BTL_TAG_MAX; - frag->type = mca_btl_portals_frag_type_rdma; ret = PtlGet(frag->md_h, *((mca_btl_base_endpoint_t*) btl_peer), @@ -94,7 +91,6 @@ mca_btl_portals_get(struct mca_btl_base_module_t* btl_base, if (ret != PTL_OK) { opal_output(mca_btl_portals_component.portals_output, "PtlGet failed with error %d", ret); - PtlMDUnlink(frag->md_h); return OMPI_ERROR; } diff --git a/ompi/mca/btl/portals/btl_portals_recv.h b/ompi/mca/btl/portals/btl_portals_recv.h index d0ceb2e904..c5acb832a1 100644 --- a/ompi/mca/btl/portals/btl_portals_recv.h +++ b/ompi/mca/btl/portals/btl_portals_recv.h @@ -96,12 +96,12 @@ 
mca_btl_portals_activate_block(mca_btl_portals_recv_block_t *block) md.length = block->length; /* try to throttle incoming sends so that we don't overrun the incoming queue size */ - md.threshold = mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ] / + md.threshold = mca_btl_portals_module.portals_eq_sizes[OMPI_BTL_PORTALS_EQ_RECV] / (mca_btl_portals_module.portals_recv_mds_num * 2); md.max_size = block->btl->super.btl_max_send_size; md.options = PTL_MD_OP_PUT | PTL_MD_MAX_SIZE; md.user_ptr = block; - md.eq_handle = block->btl->portals_eq_handles[OMPI_BTL_PORTALS_EQ]; + md.eq_handle = block->btl->portals_eq_handles[OMPI_BTL_PORTALS_EQ_RECV]; block->pending = 0; block->full = false; diff --git a/ompi/mca/btl/portals/btl_portals_send.c b/ompi/mca/btl/portals/btl_portals_send.c index f2ebface51..4d1807ab46 100644 --- a/ompi/mca/btl/portals/btl_portals_send.c +++ b/ompi/mca/btl/portals/btl_portals_send.c @@ -35,95 +35,71 @@ mca_btl_portals_send(struct mca_btl_base_module_t* btl_base, mca_btl_base_tag_t tag) { mca_btl_portals_frag_t *frag = (mca_btl_portals_frag_t*) descriptor; - int32_t num_sends; int ret; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); - assert(frag->md_h == PTL_INVALID_HANDLE); frag->endpoint = endpoint; frag->hdr.tag = tag; - frag->type = mca_btl_portals_frag_type_send; - - num_sends = OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends, 1); - /* make sure that we have enough space to send. 
This means that - there is enough space in the event queue for all the events - that may be deposited by outstanding sends */ - if (num_sends >= mca_btl_portals_module.portals_max_outstanding_sends) { + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "PtlPut (send) fragment %x", frag)); + + if (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) > + mca_btl_portals_module.portals_max_outstanding_ops) { + /* no space - queue and return */ opal_output_verbose(50, mca_btl_portals_component.portals_output, - "no space for message 0x%x. Adding to back of queue", - frag); + "no space for message 0x%x. Adding to back of queue", + frag); + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); opal_list_append(&(mca_btl_portals_module.portals_queued_sends), (opal_list_item_t*) frag); - - OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_sends, -1); + return OMPI_SUCCESS; /* the frag now sits on portals_queued_sends; posting it here as well would send it twice */ + } - ret = OMPI_SUCCESS; - } else { - int ret; - ptl_handle_md_t md_h; - OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "PtlPut (send) fragment %x", frag)); - - /* setup the send */ - if (1 == frag->base.des_src_cnt) { - mca_btl_portals_module.md_send.start = frag->segments[0].seg_addr.pval; - mca_btl_portals_module.md_send.length = frag->segments[0].seg_len; - mca_btl_portals_module.md_send.options = PTL_MD_EVENT_START_DISABLE; - OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "fragment info:\n" - "\tstart: 0x%x\n" - "\tlen: %d", - frag->segments[0].seg_addr.pval, - frag->segments[0].seg_len)); - } else { - assert(2 == frag->base.des_src_cnt); - mca_btl_portals_module.md_send.start = frag->iov; - mca_btl_portals_module.md_send.length = 2; - mca_btl_portals_module.md_send.options = - PTL_MD_EVENT_START_DISABLE | PTL_MD_IOVEC; - OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "fragment info:\n" - "\tiov[0].iov_base: 0x%x\n" - "\tiov[0].iov_len: %d\n" - "\tiov[1].iov_base: 0x%x\n" -
"\tiov[1].iov_len: %d", - frag->iov[0].iov_base, - frag->iov[0].iov_len, - frag->iov[1].iov_base, - frag->iov[1].iov_len)); - } + if (frag->md_h == PTL_INVALID_HANDLE) { + /* setup the send - always describe entire fragment */ + mca_btl_portals_module.md_send.start = frag->segments[0].seg_addr.pval; + mca_btl_portals_module.md_send.length = + 0 == frag->size ? frag->segments[0].seg_len : frag->size; + mca_btl_portals_module.md_send.options = + PTL_MD_EVENT_START_DISABLE; mca_btl_portals_module.md_send.user_ptr = frag; /* keep a pointer to ourselves */ /* make a free-floater */ ret = PtlMDBind(mca_btl_portals_module.portals_ni_h, mca_btl_portals_module.md_send, PTL_UNLINK, - &md_h); + &frag->md_h); if (ret != PTL_OK) { opal_output(mca_btl_portals_component.portals_output, "PtlMDBind failed with error %d", ret); return OMPI_ERROR; } + } - ret = PtlPut(md_h, - PTL_ACK_REQ, - *((mca_btl_base_endpoint_t*) endpoint), - OMPI_BTL_PORTALS_SEND_TABLE_ID, - 0, /* ac_index - not used */ - 0, /* match bits */ - 0, /* remote offset - not used */ - frag->hdr.tag); /* hdr_data - tag */ - if (ret != PTL_OK) { - opal_output(mca_btl_portals_component.portals_output, - "send: PtlPut failed with error %d", ret); - PtlMDUnlink(md_h); - return OMPI_ERROR; - } + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "fragment info:\n" + "\tstart: 0x%x\n" + "\tlen: %d", + frag->segments[0].seg_addr.pval, + frag->segments[0].seg_len)); - return OMPI_SUCCESS; + ret = PtlPutRegion(frag->md_h, /* memory descriptor */ + 0, /* fragment offset */ + frag->segments[0].seg_len, /* fragment length */ + PTL_ACK_REQ, + *((mca_btl_base_endpoint_t*) endpoint), + OMPI_BTL_PORTALS_SEND_TABLE_ID, + 0, /* ac_index - not used */ + 0, /* match bits */ + 0, /* remote offset - not used */ + frag->hdr.tag); /* hdr_data: tag */ + if (ret != PTL_OK) { + opal_output(mca_btl_portals_component.portals_output, + "send: PtlPut failed with error %d", ret); + return OMPI_ERROR; } - return ret; + return 
OMPI_SUCCESS; } diff --git a/ompi/mca/btl/portals/btl_portals_send.h b/ompi/mca/btl/portals/btl_portals_send.h index 7241fe6d23..35a13bf52c 100644 --- a/ompi/mca/btl/portals/btl_portals_send.h +++ b/ompi/mca/btl/portals/btl_portals_send.h @@ -24,8 +24,8 @@ #define MCA_BTL_PORTALS_PROGRESS_QUEUED_SENDS() \ if ((0 != opal_list_get_size(&(mca_btl_portals_module.portals_queued_sends))) && \ - (mca_btl_portals_module.portals_outstanding_sends < \ - mca_btl_portals_module.portals_max_outstanding_sends)) { \ + (mca_btl_portals_module.portals_outstanding_ops < \ + mca_btl_portals_module.portals_max_outstanding_ops)) { \ mca_btl_portals_frag_t *qfrag = (mca_btl_portals_frag_t*) \ opal_list_remove_first(&(mca_btl_portals_module.portals_queued_sends)); \ OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, \ diff --git a/ompi/mca/btl/portals/configure.m4 b/ompi/mca/btl/portals/configure.m4 index a94d509f2f..6b4c6b40f2 100644 --- a/ompi/mca/btl/portals/configure.m4 +++ b/ompi/mca/btl/portals/configure.m4 @@ -17,80 +17,6 @@ # $HEADER$ # - -# _MCA_btl_portals_config_val(config_name, define_name, -# default_val, descrtiption) -# ----------------------------------------------------- -AC_DEFUN([MCA_btl_portals_CONFIG_VAL], [ - AC_ARG_WITH([portals-$1], AC_HELP_STRING([--with-portals-$1], - [$4 (default: $3)])) - case "[$with_]m4_bpatsubst([portals-$1], -, _)" in - "") - $2=$3 - ;; - "no") - AC_MSG_ERROR([--without-portals-$1 is invalid argument]) - ;; - *) - $2="[$with_]m4_bpatsubst([portals-$1], -, _)" - ;; - esac - AC_DEFINE_UNQUOTED([$2], [[$]$2], [$4]) -]) - - -# _MCA_btl_portals_CONFIG_VALS() -# ------------------------------ -AC_DEFUN([MCA_btl_portals_CONFIG_VALS], [ - # User configuration options - MCA_btl_portals_CONFIG_VAL([debug-level], - [OMPI_BTL_PORTALS_DEFAULT_DEBUG_LEVEL], [0], - [debugging level for portals btl]) - - MCA_btl_portals_CONFIG_VAL([eager-limit], - [OMPI_BTL_PORTALS_DEFAULT_EAGER_LIMIT], [32768], - [max size for eager sends]) - - 
MCA_btl_portals_CONFIG_VAL([min-send-size], - [OMPI_BTL_PORTALS_DEFAULT_MIN_SEND_SIZE], [32768], - [min size for send fragments]) - MCA_btl_portals_CONFIG_VAL([max-send-size], - [OMPI_BTL_PORTALS_DEFAULT_MAX_SEND_SIZE], [65536], - [max size for send fragments]) - - MCA_btl_portals_CONFIG_VAL([md-size], - [OMPI_BTL_PORTALS_DEFAULT_RECV_MD_SIZE], [1048576], - [Size of receive memory descriptors]) - MCA_btl_portals_CONFIG_VAL([md-size], - [OMPI_BTL_PORTALS_DEFAULT_RECV_MD_NUM], [3], - [Number of receive memory descriptors]) - - MCA_btl_portals_CONFIG_VAL([min-rdma-size], - [OMPI_BTL_PORTALS_DEFAULT_MIN_RDMA_SIZE], [65536], - [min size for rdma fragments]) - MCA_btl_portals_CONFIG_VAL([max-rdma-size], - [OMPI_BTL_PORTALS_DEFAULT_MAX_RDMA_SIZE], [2147483647], - [max size for rdma fragments]) - - MCA_btl_portals_CONFIG_VAL([max-sends-pending], - [OMPI_BTL_PORTALS_MAX_SENDS_PENDING], [64], - [max number of sends pending at any time]) - MCA_btl_portals_CONFIG_VAL([recv-queue-size], - [OMPI_BTL_PORTALS_DEFAULT_RECV_QUEUE_SIZE], [8192], - [size of event queue for receiving frags]) - - MCA_btl_portals_CONFIG_VAL([free-list-init-num], - [OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_INIT_NUM], [8], - [starting size of free lists]) - MCA_btl_portals_CONFIG_VAL([free-list-max-num], - [OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_MAX_NUM], [1024], - [maximum size of free lists]) - MCA_btl_portals_CONFIG_VAL([free-list-inc-num], - [OMPI_BTL_PORTALS_DEFAULT_FREE_LIST_INC_NUM], [32], - [grow size for freelists]) -]) - - # _MCA_btl_portals_CONFIG_PLATFORM() # ---------------------------------- AC_DEFUN([MCA_btl_portals_CONFIG_PLATFORM], [ @@ -197,7 +123,6 @@ AC_DEFUN([MCA_btl_portals_CONFIG],[ AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <${btl_portals_header_prefix}portals3.h>], [int i; PtlInit(&i);])], [AC_MSG_RESULT([yes]) - MCA_btl_portals_CONFIG_VALS() btl_portals_WRAPPER_EXTRA_LDFLAGS="$btl_portals_LDFLAGS" btl_portals_WRAPPER_EXTRA_LIBS="$btl_portals_LIBS" $1],