diff --git a/ompi/mca/bml/bml.h b/ompi/mca/bml/bml.h index 61a42c4d5c..e710d9a96a 100644 --- a/ompi/mca/bml/bml.h +++ b/ompi/mca/bml/bml.h @@ -69,6 +69,7 @@ struct mca_bml_base_btl_t { mca_btl_base_module_alloc_fn_t btl_alloc; mca_btl_base_module_free_fn_t btl_free; mca_btl_base_module_send_fn_t btl_send; + mca_btl_base_module_sendi_fn_t btl_sendi; mca_btl_base_module_prepare_fn_t btl_prepare_src; mca_btl_base_module_prepare_fn_t btl_prepare_dst; mca_btl_base_module_put_fn_t btl_put; @@ -304,6 +305,22 @@ static inline int mca_bml_base_send_status( return bml_btl->btl_send(bml_btl->btl, bml_btl->btl_endpoint, des, tag); } +static inline int mca_bml_base_sendi( + mca_bml_base_btl_t* bml_btl, + struct ompi_convertor_t* convertor, + void* header, + size_t header_size, + size_t payload_size, + uint8_t order, + uint32_t flags, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t** descriptor) +{ + return bml_btl->btl_sendi(bml_btl->btl, bml_btl->btl_endpoint, + convertor, header, header_size, + payload_size, order, flags, tag, descriptor); +} + static inline int mca_bml_base_put(mca_bml_base_btl_t* bml_btl, mca_btl_base_descriptor_t* des) { des->des_context = (void*) bml_btl; return bml_btl->btl_put( diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c index d65de340ef..30f5756a52 100644 --- a/ompi/mca/bml/r2/bml_r2.c +++ b/ompi/mca/bml/r2/bml_r2.c @@ -296,6 +296,7 @@ int mca_bml_r2_add_procs( bml_btl->btl_prepare_src = btl->btl_prepare_src; bml_btl->btl_prepare_dst = btl->btl_prepare_dst; bml_btl->btl_send = btl->btl_send; + bml_btl->btl_sendi = btl->btl_sendi; bml_btl->btl_flags = btl->btl_flags; bml_btl->btl_put = btl->btl_put; if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == bml_btl->btl_put) ) { diff --git a/ompi/mca/btl/btl.h b/ompi/mca/btl/btl.h index 0496dca724..57c03c96f1 100644 --- a/ompi/mca/btl/btl.h +++ b/ompi/mca/btl/btl.h @@ -118,9 +118,7 @@ #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" -#if defined(c_plusplus) || defined(__cplusplus) -extern "C" { -#endif +BEGIN_C_DECLS /* * BTL types */ @@ -273,8 +271,15 @@ typedef struct mca_btl_base_descriptor_t mca_btl_base_descriptor_t; OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_descriptor_t); -#define MCA_BTL_DES_FLAGS_PRIORITY 0x0001 -#define MCA_BTL_DES_FLAGS_BTL_OWNERSHIP 0x0002 +#define MCA_BTL_DES_FLAGS_PRIORITY 0x0001 +/* Allow the BTL to dispose of the descriptor once the + * associated callback has been triggered. + */ +#define MCA_BTL_DES_FLAGS_BTL_OWNERSHIP 0x0002 +/* Allow the BTL to avoid calling the descriptor callback + * if the send succeeded in the btl_send (i.e. in the fast path). + */ +#define MCA_BTL_DES_SEND_ALWAYS_CALLBACK 0x0004 /** * Maximum number of allowed segments in src/dst fields of a descriptor. */ @@ -624,6 +629,45 @@ typedef int (*mca_btl_base_module_send_fn_t)( mca_btl_base_tag_t tag ); +/** + * Initiate an immediate blocking send. + * Completion Semantics: the BTL will make a best effort + * to send the header and "payload_size" bytes from the datatype using the convertor. + * The header is guaranteed to be delivered entirely in the first segment. + * Should the BTL be unable to deliver the data due to resource constraints, + * the BTL will return a descriptor (via the OUT param) + * of size "payload_size + header_size". + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param convertor (IN) Data type convertor + * @param header (IN) Pointer to header. + * @param header_size (IN) Size of header.
+ * @param payload_size (IN) Size of payload (from convertor). + * @param order (IN) The ordering tag (may be MCA_BTL_NO_ORDER) + * @param flags (IN) Descriptor flags (MCA_BTL_DES_*). + * @param tag (IN) The tag value used to notify the peer. + * @param descriptor (OUT) The descriptor to be returned if the data could not be sent immediately + + * @retval OMPI_SUCCESS The send was successfully queued + * @retval OMPI_ERROR The send failed + * @retval OMPI_ERR_UNREACH The endpoint is not reachable + * @retval OMPI_ERR_RESOURCE_BUSY The BTL is busy; a descriptor will be returned + * (via the OUT param) if descriptors are available + + */ + +typedef int (*mca_btl_base_module_sendi_fn_t)( + struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct ompi_convertor_t* convertor, + void* header, + size_t header_size, + size_t payload_size, + uint8_t order, + uint32_t flags, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t** descriptor + ); /** * Initiate an asynchronous put. @@ -725,14 +769,15 @@ struct mca_btl_base_module_t { mca_btl_base_module_register_fn_t btl_register; mca_btl_base_module_finalize_fn_t btl_finalize; - mca_btl_base_module_alloc_fn_t btl_alloc; - mca_btl_base_module_free_fn_t btl_free; - mca_btl_base_module_prepare_fn_t btl_prepare_src; - mca_btl_base_module_prepare_fn_t btl_prepare_dst; - mca_btl_base_module_send_fn_t btl_send; - mca_btl_base_module_put_fn_t btl_put; - mca_btl_base_module_get_fn_t btl_get; - mca_btl_base_module_dump_fn_t btl_dump; + mca_btl_base_module_alloc_fn_t btl_alloc; + mca_btl_base_module_free_fn_t btl_free; + mca_btl_base_module_prepare_fn_t btl_prepare_src; + mca_btl_base_module_prepare_fn_t btl_prepare_dst; + mca_btl_base_module_send_fn_t btl_send; + mca_btl_base_module_sendi_fn_t btl_sendi; + mca_btl_base_module_put_fn_t btl_put; + mca_btl_base_module_get_fn_t btl_get; + mca_btl_base_module_dump_fn_t btl_dump; /** the mpool associated with this btl (optional) */ mca_mpool_base_module_t* btl_mpool; @@ -763,8 +808,6 @@ typedef struct mca_btl_base_module_t mca_btl_base_module_t; /* btl v1.0 */ \ "btl", 1, 0, 0 -#if defined(c_plusplus) || defined(__cplusplus) -} -#endif +END_C_DECLS #endif /* OMPI_MCA_BTL_H */
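To make the sendi contract above concrete, here is a minimal caller-side sketch. try_eager_send, its argument list, and the fallback policy are invented for illustration; only the btl_sendi/btl_send semantics and the bml wrappers are taken from the headers in this patch:

    #include "ompi/mca/bml/bml.h"

    static int try_eager_send( mca_bml_base_btl_t* bml_btl,
                               struct ompi_convertor_t* convertor,
                               void* header, size_t header_size,
                               size_t payload_size, mca_btl_base_tag_t tag )
    {
        mca_btl_base_descriptor_t* des = NULL;
        int rc;

        if( NULL != bml_btl->btl_sendi ) {  /* not every BTL provides sendi */
            rc = mca_bml_base_sendi( bml_btl, convertor, header, header_size,
                                     payload_size, MCA_BTL_NO_ORDER,
                                     0, tag, &des );
            if( OMPI_ERR_RESOURCE_BUSY != rc )
                return rc;                  /* sent, queued, or hard error */
            if( NULL == des )
                return OMPI_ERR_OUT_OF_RESOURCE;
            /* the BTL handed back a descriptor sized header_size +
             * payload_size: fall through to the classic send path */
        } else {
            mca_bml_base_alloc( bml_btl, &des, MCA_BTL_NO_ORDER,
                                header_size + payload_size,
                                MCA_BTL_DES_FLAGS_BTL_OWNERSHIP );
            if( NULL == des ) return OMPI_ERR_OUT_OF_RESOURCE;
        }
        /* ... pack header + data into des->des_src, set des_cbfunc ... */
        return mca_bml_base_send( bml_btl, des, tag );
    }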
diff --git a/ompi/mca/btl/elan/btl_elan.c b/ompi/mca/btl/elan/btl_elan.c index e1c9c37211..f41faa5105 100644 --- a/ompi/mca/btl/elan/btl_elan.c +++ b/ompi/mca/btl/elan/btl_elan.c @@ -661,6 +661,7 @@ mca_btl_elan_module_t mca_btl_elan_module = { mca_btl_elan_prepare_src, mca_btl_elan_prepare_dst, mca_btl_elan_send, + NULL, /* send immediate */ mca_btl_elan_put, mca_btl_elan_get, mca_btl_elan_dump, diff --git a/ompi/mca/btl/gm/btl_gm.c b/ompi/mca/btl/gm/btl_gm.c index c5cb5916f6..c7178ba25f 100644 --- a/ompi/mca/btl/gm/btl_gm.c +++ b/ompi/mca/btl/gm/btl_gm.c @@ -79,6 +79,7 @@ mca_btl_gm_module_t mca_btl_gm_module = { mca_btl_gm_prepare_dst, #if OMPI_ENABLE_MPI_THREADS || OMPI_ENABLE_PROGRESS_THREADS mca_btl_gm_send, + NULL, /* send immediate */ mca_btl_gm_put, mca_btl_gm_get, #else diff --git a/ompi/mca/btl/mx/btl_mx.c b/ompi/mca/btl/mx/btl_mx.c index e9998d6971..02450b5ca0 100644 --- a/ompi/mca/btl/mx/btl_mx.c +++ b/ompi/mca/btl/mx/btl_mx.c @@ -457,16 +457,21 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl, } if( mx_result ) { mx_return = mx_forget( mx_btl->mx_endpoint, &(frag->mx_request) ); - frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS); - if( btl_ownership ) { - MCA_BTL_MX_FRAG_RETURN( mx_btl, frag ); - } if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) { orte_output( 0, "mx_forget failed with error %d (%s)\n", mx_return, mx_strerror(mx_return) ); - return OMPI_ERROR; + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + return OMPI_SUCCESS; } - return OMPI_SUCCESS; + + if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) { + frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, + &(frag->base), OMPI_SUCCESS); + } + if( btl_ownership ) { + MCA_BTL_MX_FRAG_RETURN( mx_btl, frag ); + } + return 1; } } #endif @@ -475,18 +480,22 @@ int mca_btl_mx_send( struct mca_btl_base_module_t* btl, uint32_t mx_result; /* let's check for completeness */ - mx_return = mx_test( mx_btl->mx_endpoint, &(frag->mx_request), &mx_status, &mx_result ); - if( OPAL_UNLIKELY(MX_SUCCESS != mx_return) ) - return OMPI_SUCCESS; - /* call the completion callback */ - if( mx_result ) { - frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, &(frag->base), OMPI_SUCCESS); - if( btl_ownership ) { - MCA_BTL_MX_FRAG_RETURN( mx_btl, frag ); + mx_return = mx_test( mx_btl->mx_endpoint, &(frag->mx_request), + &mx_status, &mx_result ); + if( OPAL_LIKELY(MX_SUCCESS == mx_return) ) { + if( mx_result ) { + if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) { + frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, + &(frag->base), OMPI_SUCCESS); + } + if( btl_ownership ) { + MCA_BTL_MX_FRAG_RETURN( mx_btl, frag ); + } + return 1; } - return OMPI_SUCCESS; } } + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; return OMPI_SUCCESS; } @@ -552,6 +561,7 @@ mca_btl_mx_module_t mca_btl_mx_module = { mca_btl_mx_prepare_src, mca_btl_mx_prepare_dst, mca_btl_mx_send, + NULL, /* send immediate */ mca_btl_mx_put, /* put */ NULL, /* get */ mca_btl_base_dump, diff --git a/ompi/mca/btl/mx/btl_mx_component.c b/ompi/mca/btl/mx/btl_mx_component.c index 3d770a972a..b4660a0dfe 100644 --- a/ompi/mca/btl/mx/btl_mx_component.c +++ b/ompi/mca/btl/mx/btl_mx_component.c @@ -620,8 +620,10 @@ int mca_btl_mx_component_progress(void) if( MCA_BTL_MX_SEND == frag->type ) { /* it's a send */ int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); /* call the completion callback */ - frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, - &(frag->base), OMPI_SUCCESS ); + if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) { + frag->base.des_cbfunc( &(mx_btl->super), frag->endpoint, + &(frag->base), OMPI_SUCCESS ); + } if( btl_ownership ) { MCA_BTL_MX_FRAG_RETURN( mx_btl, frag ); } diff --git a/ompi/mca/btl/ofud/btl_ofud.c b/ompi/mca/btl/ofud/btl_ofud.c index e07438dec6..2072a08006 100644 --- a/ompi/mca/btl/ofud/btl_ofud.c +++ b/ompi/mca/btl/ofud/btl_ofud.c @@ -60,6 +60,7 @@ mca_btl_ud_module_t mca_btl_ofud_module = { mca_btl_ud_prepare_src, NULL, /*mca_btl_ud_prepare_dst */ mca_btl_ud_send, + NULL, /* send immediate */ NULL, /*mca_btl_ud_put */ NULL, /*mca_btl_ud_get */ mca_btl_base_dump, diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index f75f8c78c6..97d83dea80 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -82,6 +82,7 @@ mca_btl_openib_module_t mca_btl_openib_module = { mca_btl_openib_prepare_src, mca_btl_openib_prepare_dst, mca_btl_openib_send, + NULL, /* send immediate */ mca_btl_openib_put, mca_btl_openib_get, mca_btl_base_dump,
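The mx changes above, and the sm/self/tcp changes further down, all follow the same new completion protocol around MCA_BTL_DES_SEND_ALWAYS_CALLBACK. A minimal illustrative sketch of that pattern; example_send_epilogue and its completed_in_fast_path argument are hypothetical, while the flag handling mirrors the real BTLs:

    #include "ompi/mca/btl/btl.h"

    static int example_send_epilogue( struct mca_btl_base_module_t* btl,
                                      struct mca_btl_base_endpoint_t* endpoint,
                                      mca_btl_base_descriptor_t* des,
                                      int completed_in_fast_path )
    {
        int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);

        if( !completed_in_fast_path ) {
            /* completion will be signalled later from the progress loop,
             * so the callback must fire there whatever the caller asked */
            des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
            return OMPI_SUCCESS;   /* queued: an upcall will follow */
        }
        /* completed inline: the upcall may be elided unless requested */
        if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & des->des_flags ) {
            des->des_cbfunc( btl, endpoint, des, OMPI_SUCCESS );
        }
        if( btl_ownership ) {
            btl->btl_free( btl, des );  /* stands in for FRAG_RETURN macros */
        }
        return 1;   /* new convention: 1 == already done, nothing pending */
    }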
diff --git a/ompi/mca/btl/portals/btl_portals.c b/ompi/mca/btl/portals/btl_portals.c index e125035859..42b09fef83 100644 --- a/ompi/mca/btl/portals/btl_portals.c +++ b/ompi/mca/btl/portals/btl_portals.c @@ -65,6 +65,7 @@ mca_btl_portals_module_t mca_btl_portals_module = { mca_btl_portals_prepare_src, mca_btl_portals_prepare_dst, mca_btl_portals_send, + mca_btl_portals_sendi, mca_btl_portals_put, mca_btl_portals_get, mca_btl_base_dump, @@ -237,7 +238,7 @@ mca_btl_portals_alloc(struct mca_btl_base_module_t* btl_base, mca_btl_portals_frag_t* frag; assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); - + if (size <= mca_btl_portals_module.super.btl_eager_limit) { OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, rc); if (OMPI_SUCCESS != rc) return NULL; @@ -251,7 +252,7 @@ } frag->base.des_src_cnt = 1; - frag->base.des_flags = flags; + frag->base.des_flags = flags | MCA_BTL_DES_SEND_ALWAYS_CALLBACK; frag->base.order = MCA_BTL_NO_ORDER; return &frag->base; diff --git a/ompi/mca/btl/portals/btl_portals.h b/ompi/mca/btl/portals/btl_portals.h index c81035d573..aa496c7b66 100644 --- a/ompi/mca/btl/portals/btl_portals.h +++ b/ompi/mca/btl/portals/btl_portals.h @@ -204,6 +204,18 @@ int mca_btl_portals_send(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* endpoint, struct mca_btl_base_descriptor_t* descriptor, mca_btl_base_tag_t tag); + +int mca_btl_portals_sendi(struct mca_btl_base_module_t* btl_base, + struct mca_btl_base_endpoint_t* endpoint, + struct ompi_convertor_t* convertor, + void* header, + size_t header_size, + size_t payload_size, + uint8_t order, + uint32_t flags, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t** des); + int mca_btl_portals_put(struct mca_btl_base_module_t* btl_base, struct mca_btl_base_endpoint_t* btl_peer, struct mca_btl_base_descriptor_t* descriptor); diff --git a/ompi/mca/btl/portals/btl_portals_component.c b/ompi/mca/btl/portals/btl_portals_component.c index 725b6b37c2..c562120a46 100644 --- a/ompi/mca/btl/portals/btl_portals_component.c +++ b/ompi/mca/btl/portals/btl_portals_component.c @@ -70,7 +70,7 @@ mca_btl_portals_component_t mca_btl_portals_component = { }; -static orte_output_stream_t portals_output_stream; +static opal_output_stream_t portals_output_stream; int mca_btl_portals_component_open(void) @@ -84,7 +84,7 @@ mca_btl_portals_component_open(void) */ /* start up debugging output */ - OBJ_CONSTRUCT(&portals_output_stream, orte_output_stream_t); + OBJ_CONSTRUCT(&portals_output_stream, opal_output_stream_t); portals_output_stream.lds_is_debugging = true; portals_output_stream.lds_want_stdout = true; portals_output_stream.lds_file_suffix = "btl-portals"; @@ -368,10 +368,11 @@ mca_btl_portals_component_progress(void) } break; case PTL_EVENT_PUT_START: + tag = ((unsigned char*) (&ev.hdr_data))[7]; /* generated on destination (target) when a put into memory starts */ ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "PTL_EVENT_PUT_START for 0x%lx, %d", - (unsigned long) frag, (int) ev.hdr_data)); + (unsigned long) frag, (int) tag)); #if OMPI_ENABLE_DEBUG if (ev.ni_fail_type != PTL_NI_OK) { @@ -381,7 +382,7 @@ } #endif /* if it's a pending unexpected receive, do bookkeeping. The match_bits/hdr_data layout decoded here is sketched below.
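   Layout note, inferred from the send side in btl_portals_send.c further
   below: sendi packs the small PML header into the two 64-bit Portals
   fields so short messages travel without a separate eager buffer:

       match_bits bytes 0..7 : header bytes 0..7
       hdr_data   bytes 0..5 : header bytes 8..13 (headers larger than 14
                               bytes take the descriptor fallback path)
       hdr_data   byte  6    : header_size
       hdr_data   byte  7    : btl tag

   hence the receive side recovers the tag with

       tag = ((unsigned char*) &ev.hdr_data)[7];

   and, in the unexpected-receive path, uses ev.match_bits != 0 to tell an
   inlined send from a plain eager send, since the plain send path posts 0
   in the match field.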
*/ - if (ev.hdr_data < MCA_BTL_TAG_MAX) { + if (tag < MCA_BTL_TAG_MAX) { block = ev.md.user_ptr; OPAL_THREAD_ADD32(&(block->pending), 1); } @@ -389,10 +390,11 @@ mca_btl_portals_component_progress(void) break; case PTL_EVENT_PUT_END: + tag = ((unsigned char*) (&ev.hdr_data))[7]; /* generated on destination (target) when a put into memory ends */ ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "PTL_EVENT_PUT_END for 0x%lx, %d", - (unsigned long) frag, (int) ev.hdr_data)); + (unsigned long) frag, (int) tag)); #if OMPI_ENABLE_DEBUG if (ev.ni_fail_type != PTL_NI_OK) { @@ -404,21 +406,51 @@ mca_btl_portals_component_progress(void) } #endif /* if it's an unexpected receive, do book keeping and send to PML */ - if (ev.hdr_data < MCA_BTL_TAG_MAX) { + if (tag < MCA_BTL_TAG_MAX) { block = ev.md.user_ptr; - tag = ev.hdr_data; - - /* if we ever make this thread hot, need to do - something with the receive fragments */ frag = &mca_btl_portals_module.portals_recv_frag; - frag->segments[0].seg_addr.pval = (((char*) ev.md.start) + ev.offset); - frag->segments[0].seg_len = ev.mlength; - - ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, - "received send fragment 0x%lx (thresh: %d, length %d)", - (unsigned long) frag, - ev.md.threshold, (int) ev.mlength)); - + if(ev.match_bits) { + uint8_t header_size = ((uint8_t*) (&ev.hdr_data))[6]; + memcpy(frag->data, &ev.match_bits, header_size > 8 ? 8 : header_size); + if(header_size > 8) { + memcpy(frag->data+8, &ev.hdr_data, header_size - 8); + } + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,"received %x %x %x %x %x %x %x %x %x %x : header_size %x : tag %x \n", + frag->data[0], + frag->data[1], + frag->data[2], + frag->data[3], + frag->data[4], + frag->data[5], + frag->data[6], + frag->data[7], + frag->data[8], + frag->data[9], + header_size, + tag + )); + + OPAL_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output,"received %d bytes \n", ev.mlength)); + frag->segments[0].seg_addr.pval = &frag->data; + frag->segments[0].seg_len = header_size; + if(ev.mlength) { + frag->segments[1].seg_addr.pval = ((((char*) ev.md.start) + ev.offset)); + frag->segments[1].seg_len = ev.mlength; + frag->base.des_dst_cnt = 2; + } else { + frag->base.des_dst_cnt = 1; + } + } else { + /* if we ever make this thread hot, need to do + something with the receive fragments */ + frag->segments[0].seg_addr.pval = (((char*) ev.md.start) + ev.offset); + frag->segments[0].seg_len = ev.mlength; + + ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "received send fragment 0x%lx (thresh: %d, length %d)", + (unsigned long) frag, + ev.md.threshold, (int) ev.mlength)); + } if (ev.md.length - (ev.offset + ev.mlength) < (ptl_size_t) ev.md.max_size || ev.md.threshold == 1) { /* the block is full. 
It's deactivated automagically, but we @@ -458,11 +490,11 @@ mca_btl_portals_component_progress(void) "PTL_EVENT_REPLY_END for 0x%lx", (unsigned long) frag)); - /* let the PML know we're done */ frag->base.des_cbfunc(&mca_btl_portals_module.super, frag->endpoint, &frag->base, OMPI_SUCCESS); + if( btl_ownership ) { ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "in PTL_EVENT_REPLY_END received a frag with btl_ownership!")); @@ -505,10 +537,12 @@ mca_btl_portals_component_progress(void) if (ev.ni_fail_type != PTL_NI_OK) { orte_output(mca_btl_portals_component.portals_output, "Failure to end send event\n"); - frag->base.des_cbfunc(&mca_btl_portals_module.super, - frag->endpoint, - &frag->base, - OMPI_ERROR); + if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ){ + frag->base.des_cbfunc(&mca_btl_portals_module.super, + frag->endpoint, + &frag->base, + OMPI_ERROR); + } if( btl_ownership ) { mca_btl_portals_free(&mca_btl_portals_module.super, &frag->base); @@ -517,10 +551,12 @@ mca_btl_portals_component_progress(void) #endif if(!mca_btl_portals_component.portals_need_ack) { /* my part's done, in portals we trust! */ - frag->base.des_cbfunc(&mca_btl_portals_module.super, - frag->endpoint, - &frag->base, - OMPI_SUCCESS); + if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ){ + frag->base.des_cbfunc(&mca_btl_portals_module.super, + frag->endpoint, + &frag->base, + OMPI_SUCCESS); + } if( btl_ownership ) { mca_btl_portals_free(&mca_btl_portals_module.super, &frag->base); @@ -553,10 +589,12 @@ mca_btl_portals_component_progress(void) if (ev.ni_fail_type != PTL_NI_OK) { orte_output(mca_btl_portals_component.portals_output, "Failure to ack event\n"); - frag->base.des_cbfunc(&mca_btl_portals_module.super, - frag->endpoint, - &frag->base, - OMPI_ERROR); + if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ){ + frag->base.des_cbfunc(&mca_btl_portals_module.super, + frag->endpoint, + &frag->base, + OMPI_ERROR); + } if( btl_ownership ) { mca_btl_portals_free(&mca_btl_portals_module.super, &frag->base); @@ -564,7 +602,7 @@ mca_btl_portals_component_progress(void) } else #endif - if (0 == ev.mlength) { + if (ev.rlength != ev.mlength) { /* other side received message but truncated to 0. This should only happen for unexpected messages, and only when the other side has no @@ -581,10 +619,12 @@ mca_btl_portals_component_progress(void) /* other side received the message. 
should have received entire thing */ /* let the PML know we're done */ - frag->base.des_cbfunc(&mca_btl_portals_module.super, - frag->endpoint, - &frag->base, - OMPI_SUCCESS); + if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) { + frag->base.des_cbfunc(&mca_btl_portals_module.super, + frag->endpoint, + &frag->base, + OMPI_SUCCESS); + } if( btl_ownership ) { mca_btl_portals_free(&mca_btl_portals_module.super, &frag->base); diff --git a/ompi/mca/btl/portals/btl_portals_frag.h b/ompi/mca/btl/portals/btl_portals_frag.h index cc6a62a34c..82af4f4cdb 100644 --- a/ompi/mca/btl/portals/btl_portals_frag.h +++ b/ompi/mca/btl/portals/btl_portals_frag.h @@ -42,6 +42,7 @@ struct mca_btl_portals_frag_t { enum { BTL_PORTALS_FRAG_TYPE_EAGER, BTL_PORTALS_FRAG_TYPE_MAX, BTL_PORTALS_FRAG_TYPE_USER } type; + unsigned char data[16]; }; typedef struct mca_btl_portals_frag_t mca_btl_portals_frag_t; OBJ_CLASS_DECLARATION(mca_btl_portals_frag_t); diff --git a/ompi/mca/btl/portals/btl_portals_rdma.c b/ompi/mca/btl/portals/btl_portals_rdma.c index 05106ad5ae..967bdb229c 100644 --- a/ompi/mca/btl/portals/btl_portals_rdma.c +++ b/ompi/mca/btl/portals/btl_portals_rdma.c @@ -35,6 +35,7 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base, { mca_btl_portals_frag_t *frag = (mca_btl_portals_frag_t*) descriptor; int ret; + unsigned char hdr_data[8]; ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "PtlPut (rdma) fragment %lx, bits %" PRIx64, @@ -45,7 +46,7 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base, assert(frag->md_h != PTL_INVALID_HANDLE); frag->endpoint = btl_peer; - frag->hdr.tag = MCA_BTL_TAG_MAX; + hdr_data[7] = frag->hdr.tag = MCA_BTL_TAG_MAX; /* setup the send */ assert(1 == frag->base.des_src_cnt); @@ -57,7 +58,7 @@ mca_btl_portals_put(struct mca_btl_base_module_t* btl_base, 0, /* ac_index - not used*/ frag->base.des_dst[0].seg_key.key64, /* match bits */ 0, /* remote offset - not used */ - MCA_BTL_TAG_MAX); /* hdr_data - invalid tag */ + *((ptl_hdr_data_t*) hdr_data)); /* hdr_data: tag */ if (ret != PTL_OK) { orte_output(mca_btl_portals_component.portals_output, "PtlPut failed with error %d", ret); diff --git a/ompi/mca/btl/portals/btl_portals_recv.c b/ompi/mca/btl/portals/btl_portals_recv.c index 5f2f6738b7..e325190a57 100644 --- a/ompi/mca/btl/portals/btl_portals_recv.c +++ b/ompi/mca/btl/portals/btl_portals_recv.c @@ -39,7 +39,8 @@ mca_btl_portals_recv_enable(mca_btl_portals_module_t *btl) ptl_process_id_t any_proc = {PTL_NID_ANY, PTL_PID_ANY}; int ret; int i; - + uint64_t ignore_bits = ~((uint64_t) 0); + /* create the reject entry */ md.start = NULL; md.length = 0; @@ -55,7 +56,7 @@ mca_btl_portals_recv_enable(mca_btl_portals_module_t *btl) OMPI_BTL_PORTALS_SEND_TABLE_ID, any_proc, 0, /* match */ - 0, /* ignore */ + ignore_bits, /* ignore */ PTL_RETAIN, PTL_INS_BEFORE, &(btl->portals_recv_reject_me_h)); diff --git a/ompi/mca/btl/portals/btl_portals_recv.h b/ompi/mca/btl/portals/btl_portals_recv.h index c5acb832a1..7c4b441afd 100644 --- a/ompi/mca/btl/portals/btl_portals_recv.h +++ b/ompi/mca/btl/portals/btl_portals_recv.h @@ -74,6 +74,7 @@ mca_btl_portals_activate_block(mca_btl_portals_recv_block_t *block) int ret; ptl_process_id_t any_proc = { PTL_NID_ANY, PTL_PID_ANY }; ptl_md_t md; + uint64_t ignore_bits = ~((uint64_t) 0); /* if we have pending operations, something very, very, very bad has happened... 
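   A side note on the matching change made here and in btl_portals_recv.c
   above: the reject and receive match entries used to post match bits 0
   with ignore bits 0, i.e. an exact match, which worked while every sender
   also put 0 in the match field. With sendi the sender now carries real
   header bytes in the 64-bit match field, so these entries must ignore all
   bits to keep accepting everything. Under the usual Portals matching rule

       ((incoming_bits XOR me_match_bits) AND (NOT ignore_bits)) == 0

   setting ignore_bits = ~((uint64_t) 0) makes the test true for any
   incoming value, e.g. 0x0011223344556677 from an inlined header.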
*/ @@ -85,7 +86,7 @@ mca_btl_portals_activate_block(mca_btl_portals_recv_block_t *block) ret = PtlMEInsert(block->btl->portals_recv_reject_me_h, any_proc, 0, /* match bits */ - 0, /* ignore bits */ + ignore_bits, /* ignore bits */ PTL_UNLINK, PTL_INS_BEFORE, &(block->me_h)); diff --git a/ompi/mca/btl/portals/btl_portals_send.c b/ompi/mca/btl/portals/btl_portals_send.c index 79a454305f..482a4a6292 100644 --- a/ompi/mca/btl/portals/btl_portals_send.c +++ b/ompi/mca/btl/portals/btl_portals_send.c @@ -23,6 +23,8 @@ #include #include "ompi/constants.h" +#include "ompi/datatype/convertor.h" +#include "ompi/datatype/datatype.h" #include "orte/util/output.h" #include "btl_portals.h" @@ -37,11 +39,12 @@ mca_btl_portals_send(struct mca_btl_base_module_t* btl_base, { mca_btl_portals_frag_t *frag = (mca_btl_portals_frag_t*) descriptor; int ret; - + unsigned char hdr_data[8]; + assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); frag->endpoint = endpoint; - frag->hdr.tag = tag; + hdr_data[7] = frag->hdr.tag = tag; ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, "PtlPut (send) fragment %lx", @@ -103,7 +106,146 @@ mca_btl_portals_send(struct mca_btl_base_module_t* btl_base, 0, /* ac_index - not used */ 0, /* match bits */ 0, /* remote offset - not used */ - frag->hdr.tag); /* hdr_data: tag */ + *((ptl_hdr_data_t*) hdr_data)); /* hdr_data: tag */ + if (ret != PTL_OK) { + orte_output(mca_btl_portals_component.portals_output, + "send: PtlPut failed with error %d", ret); + return OMPI_ERROR; + } + + return OMPI_SUCCESS; +} + + + +int mca_btl_portals_sendi(struct mca_btl_base_module_t* btl_base, + struct mca_btl_base_endpoint_t* endpoint, + struct ompi_convertor_t* convertor, + void* header, + size_t header_size, + size_t payload_size, + uint8_t order, + uint32_t flags, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t** des) +{ + mca_btl_portals_frag_t *frag = NULL; + struct iovec iov; + unsigned int iov_count; + unsigned char match_bits[8]; + unsigned char hdr_data[8]; + int ret; + size_t max_data; + + assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base); + *des = NULL; + if (OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, 1) > + mca_btl_portals_module.portals_max_outstanding_ops) { + /* no space - queue and continue */ + orte_output_verbose(50, mca_btl_portals_component.portals_output, + "no resources left for send inline"); + + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + *des = mca_btl_portals_alloc(btl_base, endpoint, order, + payload_size + header_size, flags); + return OMPI_ERR_RESOURCE_BUSY; + + } else if(14 < header_size) { + /* header is too big */ + *des = mca_btl_portals_alloc(btl_base, endpoint, order, + payload_size + header_size, flags); + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + return OMPI_ERR_RESOURCE_BUSY; + } + + assert (payload_size <= mca_btl_portals_module.super.btl_eager_limit); + + OMPI_BTL_PORTALS_FRAG_ALLOC_EAGER(&mca_btl_portals_module, frag, ret); + if (OMPI_SUCCESS != ret) { + OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_outstanding_ops, -1); + return OMPI_ERR_RESOURCE_BUSY; + } + frag->segments[0].seg_len = payload_size; + frag->base.des_src_cnt = 1; + frag->base.des_flags = flags; + frag->base.order = MCA_BTL_NO_ORDER; + frag->endpoint = endpoint; + + if(payload_size) { + /* pack the data into the supplied buffer */ + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segments[0].seg_addr.pval); + iov.iov_len = payload_size; + 
iov_count = 1; + + (void)ompi_convertor_pack( convertor, + &iov, &iov_count, &max_data); + + assert(max_data == payload_size); + } + + if(header_size) { + memcpy(match_bits, header, header_size > 8 ? 8 : header_size); + } + if(header_size > 8 ) { + memcpy(hdr_data, ((unsigned char*) header) + 8, header_size - 8); + } + hdr_data[6] = header_size; + hdr_data[7] = frag->hdr.tag = tag; + + + ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "PtlPut (send) fragment %lx", + (unsigned long) frag)); + + + + if (frag->md_h == PTL_INVALID_HANDLE) { + /* setup the send - always describe entire fragment */ + mca_btl_portals_module.md_send.start = frag->segments[0].seg_addr.pval; + mca_btl_portals_module.md_send.length = + 0 == frag->size ? frag->segments[0].seg_len : frag->size; +#if OMPI_ENABLE_DEBUG + mca_btl_portals_module.md_send.options = + PTL_MD_EVENT_START_DISABLE; +#else + /* optimized build, we can get rid of the END event */ + /* if we are using portals ACK's for completion */ + mca_btl_portals_module.md_send.options = + (PTL_MD_EVENT_START_DISABLE | + (mca_btl_portals_component.portals_need_ack ? PTL_MD_EVENT_END_DISABLE : 0)); +#endif + mca_btl_portals_module.md_send.user_ptr = frag; /* keep a pointer to ourselves */ + + /* make a free-floater */ + ret = PtlMDBind(mca_btl_portals_module.portals_ni_h, + mca_btl_portals_module.md_send, + PTL_UNLINK, + &frag->md_h); + if (ret != PTL_OK) { + orte_output(mca_btl_portals_component.portals_output, + "PtlMDBind failed with error %d", ret); + return OMPI_ERROR; + } + } + + ORTE_OUTPUT_VERBOSE((90, mca_btl_portals_component.portals_output, + "fragment info:\n" + "\tstart: 0x%lx\n" + "\tlen: %d", + (unsigned long) frag->segments[0].seg_addr.pval, + frag->segments[0].seg_len)); + + ret = PtlPutRegion(frag->md_h, /* memory descriptor */ + 0, /* fragment offset */ + frag->segments[0].seg_len, /* fragment length */ + (mca_btl_portals_component.portals_need_ack ? 
PTL_ACK_REQ : PTL_NO_ACK_REQ), + *((mca_btl_base_endpoint_t*) endpoint), + OMPI_BTL_PORTALS_SEND_TABLE_ID, + 0, /* ac_index - not used */ + *((ptl_match_bits_t*) match_bits), /* match bits */ + 0, /* remote offset - not used */ + *((ptl_hdr_data_t*) hdr_data)); /* hdr_data: tag */ + if (ret != PTL_OK) { orte_output(mca_btl_portals_component.portals_output, "send: PtlPut failed with error %d", ret); diff --git a/ompi/mca/btl/sctp/btl_sctp.c b/ompi/mca/btl/sctp/btl_sctp.c index ba10e7a928..61f8230eea 100644 --- a/ompi/mca/btl/sctp/btl_sctp.c +++ b/ompi/mca/btl/sctp/btl_sctp.c @@ -55,6 +55,7 @@ mca_btl_sctp_module_t mca_btl_sctp_module = { mca_btl_sctp_prepare_src, mca_btl_sctp_prepare_dst, mca_btl_sctp_send, + NULL, /* send immediate */ mca_btl_sctp_put, NULL, /* get */ mca_btl_base_dump, diff --git a/ompi/mca/btl/self/btl_self.c b/ompi/mca/btl/self/btl_self.c index 33af21db4d..175eeb4243 100644 --- a/ompi/mca/btl/self/btl_self.c +++ b/ompi/mca/btl/self/btl_self.c @@ -62,7 +62,8 @@ mca_btl_base_module_t mca_btl_self = { mca_btl_self_free, mca_btl_self_prepare_src, mca_btl_self_prepare_dst, - mca_btl_self_send, + mca_btl_self_send, + NULL, /* send immediate */ mca_btl_self_rdma, /* put */ mca_btl_self_rdma, /* get */ mca_btl_base_dump, @@ -305,11 +306,13 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl, des->des_dst = NULL; des->des_dst_cnt = 0; /* send completion */ - des->des_cbfunc( btl, endpoint, des, OMPI_SUCCESS ); + if( des->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ) { + des->des_cbfunc( btl, endpoint, des, OMPI_SUCCESS ); + } if( btl_ownership ) { mca_btl_self_free( btl, des ); } - return OMPI_SUCCESS; + return 1; } /** diff --git a/ompi/mca/btl/sm/btl_sm.c b/ompi/mca/btl/sm/btl_sm.c index d158595881..550bffec53 100644 --- a/ompi/mca/btl/sm/btl_sm.c +++ b/ompi/mca/btl/sm/btl_sm.c @@ -67,6 +67,7 @@ mca_btl_sm_t mca_btl_sm = { mca_btl_sm_prepare_src, NULL, mca_btl_sm_send, + mca_btl_sm_sendi, /* send immediate */ NULL, /* put */ NULL, /* get */ mca_btl_base_dump, @@ -252,7 +253,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) length = sizeof(mca_btl_sm_frag1_t); length_payload = sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.eager_limit; - ompi_free_list_init_new(&mca_btl_sm_component.sm_frags1, length, + ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_eager, length, CACHE_LINE_SIZE, OBJ_CLASS(mca_btl_sm_frag1_t), length_payload, CACHE_LINE_SIZE, mca_btl_sm_component.sm_free_list_num, @@ -263,7 +264,7 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) length = sizeof(mca_btl_sm_frag2_t); length_payload = sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.max_frag_size; - ompi_free_list_init_new(&mca_btl_sm_component.sm_frags2, length, + ompi_free_list_init_new(&mca_btl_sm_component.sm_frags_max, length, CACHE_LINE_SIZE, OBJ_CLASS(mca_btl_sm_frag2_t), length_payload, CACHE_LINE_SIZE, mca_btl_sm_component.sm_free_list_num, @@ -271,14 +272,6 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) mca_btl_sm_component.sm_free_list_inc, mca_btl_sm_component.sm_mpool); - ompi_free_list_init_new(&mca_btl_sm_component.sm_frags, - sizeof(mca_btl_sm_frag_t), CACHE_LINE_SIZE, - OBJ_CLASS(mca_btl_sm_frag_t), 0, CACHE_LINE_SIZE, - mca_btl_sm_component.sm_free_list_num, - -1, - mca_btl_sm_component.sm_free_list_inc, - NULL); - opal_free_list_init(&mca_btl_sm_component.pending_send_fl, sizeof(btl_sm_pending_send_item_t), OBJ_CLASS(opal_free_list_item_t), @@ -467,7 +460,7 @@ int mca_btl_sm_add_procs( mca_btl_sm_component.num_smp_procs += 
n_local_procs; /* make sure we have enough eager fragments for each process */ return_code = ompi_free_list_resize(&mca_btl_sm_component.sm_frags_eager, mca_btl_sm_component.num_smp_procs * 2); if (OMPI_SUCCESS != return_code) goto CLEANUP; @@ -610,6 +603,99 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( return &frag->base; } +#if 0 +#define MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag) \ + do { \ + char* _memory = (char*)(sm_frag)->segment.seg_addr.pval + \ + (sm_frag)->segment.seg_len; \ + int* _intmem; \ + size_t align = (intptr_t)_memory & 0xFUL; \ + switch( align & 0x3 ) { \ + case 3: *_memory = 0; _memory++; \ + case 2: *_memory = 0; _memory++; \ + case 1: *_memory = 0; _memory++; \ + } \ + align >>= 2; \ + _intmem = (int*)_memory; \ + switch( align ) { \ + case 3: *_intmem = 0; _intmem++; \ + case 2: *_intmem = 0; _intmem++; \ + case 1: *_intmem = 0; _intmem++; \ + } \ + } while(0) +#else +#define MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(sm_frag) +#endif + +#if 0 + if( OPAL_LIKELY(align > 0) ) { \ + align = 0xFUL - align; \ + memset( _memory, 0, align ); \ + } \ + +#endif + +/** + * Initiate an inline send to the peer. On failure, return a descriptor instead. + * + * @param btl (IN) BTL module + * @param peer (IN) BTL peer addressing + */ +int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct ompi_convertor_t* convertor, + void* header, + size_t header_size, + size_t payload_size, + uint8_t order, + uint32_t flags, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t** descriptor ) +{ + size_t max_data, length = (header_size + payload_size); + mca_btl_sm_frag_t* frag; + int rc; + + if( length < mca_btl_sm_component.eager_limit ) { + MCA_BTL_SM_FRAG_ALLOC1(frag, rc); + if( OPAL_UNLIKELY(NULL == frag) ) { + *descriptor = NULL; + return rc; + } + frag->segment.seg_len = length; + frag->hdr->len = length; + assert( 0 == (flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) ); + frag->base.des_flags = flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + frag->hdr->tag = tag; + frag->endpoint = endpoint; + + memcpy( frag->segment.seg_addr.pval, header, header_size ); + if( payload_size ) { + struct iovec iov; + unsigned int iov_count; + /* pack the data into the supplied buffer */ + iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)frag->segment.seg_addr.pval + header_size); + iov.iov_len = payload_size; + iov_count = 1; + + (void)ompi_convertor_pack( convertor, + &iov, &iov_count, &max_data); + + assert(max_data == payload_size); + } + MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag); + /* + * post the descriptor in the queue - post with the relative + * address + */ + MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank, + endpoint->peer_smp_rank, frag->hdr, false, rc); + return rc; + } + *descriptor = mca_btl_sm_alloc( btl, endpoint, order, + payload_size + header_size, flags); + return OMPI_ERR_RESOURCE_BUSY; +} /** * Initiate a send to the peer.
@@ -617,7 +703,6 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( * @param btl (IN) BTL module * @param peer (IN) BTL peer addressing */ - int mca_btl_sm_send( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, @@ -632,6 +717,8 @@ int mca_btl_sm_send( /* type of message, pt-2-pt, one-sided, etc */ frag->hdr->tag = tag; + MCA_BTL_SM_TOUCH_DATA_TILL_CACHELINE_BOUNDARY(frag); + frag->endpoint = endpoint; /* @@ -639,8 +726,8 @@ int mca_btl_sm_send( * address */ MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank, - endpoint->peer_smp_rank, frag->hdr, false, rc); - return rc; + endpoint->peer_smp_rank, frag->hdr, false, rc); + return (rc < 0 ? rc : 1); } int mca_btl_sm_ft_event(int state) { diff --git a/ompi/mca/btl/sm/btl_sm.h b/ompi/mca/btl/sm/btl_sm.h index a338ff6671..2bc9255e5c 100644 --- a/ompi/mca/btl/sm/btl_sm.h +++ b/ompi/mca/btl/sm/btl_sm.h @@ -80,24 +80,23 @@ struct mca_btl_sm_component_t { mca_common_sm_mmap_t *mmap_file; /**< description of mmap'ed file */ mca_common_sm_file_header_t *sm_ctl_header; /* control header in shared memory */ - ompi_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */ - char **shm_bases; /**< pointer to base pointers in shared memory */ - ompi_fifo_t **fifo; /**< cached copy of the pointer to the 2D - fifo array. The address in the shared - memory segment sm_ctl_header is a relative, - but this one, in process private memory, is - a real virtual address */ - size_t size_of_cb_queue; /**< size of each circular buffer queue array */ - size_t cb_lazy_free_freq; /**< frequency of lazy free */ - int cb_max_num; /**< max number of circular buffers for each peer */ - ptrdiff_t *sm_offset; /**< offset to be applied to shared memory - addresses, per local process value */ - int32_t num_smp_procs; /**< current number of smp procs on this host */ - int32_t my_smp_rank; /**< My SMP process rank. Used for accessing - * SMP specfic data structures. */ - ompi_free_list_t sm_frags1; /**< free list of sm first */ - ompi_free_list_t sm_frags2; /**< free list of sm second */ - ompi_free_list_t sm_frags; /**< free list of frags without data */ + ompi_fifo_t **shm_fifo; /**< pointer to fifo 2D array in shared memory */ + char **shm_bases; /**< pointer to base pointers in shared memory */ + ompi_fifo_t **fifo; /**< cached copy of the pointer to the 2D + fifo array. The address in the shared + memory segment sm_ctl_header is a relative, + but this one, in process private memory, is + a real virtual address */ + size_t size_of_cb_queue; /**< size of each circular buffer queue array */ + size_t cb_lazy_free_freq; /**< frequency of lazy free */ + int cb_max_num; /**< max number of circular buffers for each peer */ + ptrdiff_t *sm_offset; /**< offset to be applied to shared memory + addresses, per local process value */ + int32_t num_smp_procs; /**< current number of smp procs on this host */ + int32_t my_smp_rank; /**< My SMP process rank. Used for accessing + * SMP specific data structures. */ + ompi_free_list_t sm_frags_eager; /**< free list of eager-size fragments */ + ompi_free_list_t sm_frags_max; /**< free list of max-size fragments */ ompi_free_list_t sm_first_frags_to_progress; /**< list of first fragments that are awaiting resources */ @@ -270,6 +269,23 @@ struct mca_btl_base_descriptor_t* mca_btl_sm_prepare_src( );
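The sm send path above also shows the return convention this patch introduces for btl_send and btl_sendi, visible as well in the self, mx and tcp changes: negative means error, OMPI_SUCCESS (0) means the fragment was queued and a completion callback will follow, and 1 means it completed inline. A caller sketch; send_and_dispatch is a hypothetical helper, the bml wrappers are the real ones from bml.h:

    static int send_and_dispatch( mca_bml_base_btl_t* bml_btl,
                                  mca_btl_base_descriptor_t* des,
                                  mca_btl_base_tag_t tag )
    {
        int rc = mca_bml_base_send( bml_btl, des, tag );
        if( rc < 0 ) {
            /* genuine failure: recycle the descriptor and report */
            mca_bml_base_free( bml_btl, des );
            return rc;
        }
        if( 1 == rc ) {
            /* already on the wire: no callback will come unless
             * MCA_BTL_DES_SEND_ALWAYS_CALLBACK was set, and the descriptor
             * is already gone if the BTL was given ownership */
        }
        return OMPI_SUCCESS;   /* rc == 0: queued, callback to follow */
    }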
+/** + * Initiate an inlined send to the peer or return a descriptor. + * + * @param btl (IN) BTL module + * @param peer (IN) BTL peer addressing + */ +extern int mca_btl_sm_sendi( struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + struct ompi_convertor_t* convertor, + void* header, + size_t header_size, + size_t payload_size, + uint8_t order, + uint32_t flags, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t** descriptor ); + /** * Initiate a send to the peer. * diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index 73f8f18df8..8e63a65d20 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -177,9 +177,8 @@ int mca_btl_sm_component_open(void) /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_sm_component.sm_lock, opal_mutex_t); - OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags, ompi_free_list_t); - OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags1, ompi_free_list_t); - OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags2, ompi_free_list_t); + OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags_eager, ompi_free_list_t); + OBJ_CONSTRUCT(&mca_btl_sm_component.sm_frags_max, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_sm_component.pending_send_fl, opal_free_list_t); return OMPI_SUCCESS; } @@ -199,8 +198,8 @@ int mca_btl_sm_component_close(void) * directly into the mmapped file, they will auto-magically disappear * when the file gets unmapped. */ - /*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags1);*/ - /*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags2);*/ + /*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags_eager);*/ + /*OBJ_DESTRUCT(&mca_btl_sm_component.sm_frags_max);*/ /* unmap the shared memory control structure */ if(mca_btl_sm_component.mmap_file != NULL) { @@ -407,30 +406,9 @@ int mca_btl_sm_component_progress(void) continue; } + rc++; /* dispatch fragment by type */ switch(((uintptr_t)hdr) & MCA_BTL_SM_FRAG_TYPE_MASK) { - case MCA_BTL_SM_FRAG_ACK: - { - int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK; - struct mca_btl_base_endpoint_t* endpoint; - int btl_ownership; - - frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr & - (~(MCA_BTL_SM_FRAG_TYPE_MASK | - MCA_BTL_SM_FRAG_STATUS_MASK)))); - endpoint = frag->endpoint; - btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - /* completion callback */ - frag->base.des_cbfunc(&mca_btl_sm.super, frag->endpoint, - &frag->base, status?OMPI_ERROR:OMPI_SUCCESS); - if( btl_ownership ) { - MCA_BTL_SM_FRAG_RETURN(frag); - } - if(opal_list_get_size(&endpoint->pending_sends)) { - process_pending_send(endpoint); - } - break; - } case MCA_BTL_SM_FRAG_SEND: { mca_btl_active_message_callback_t* reg; @@ -452,6 +430,30 @@ int mca_btl_sm_component_progress(void) my_smp_rank, peer_smp_rank, hdr->frag, false, rc); break; } + case MCA_BTL_SM_FRAG_ACK: + { + int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK; + struct mca_btl_base_endpoint_t* endpoint; + int btl_ownership; + + frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr & + (~(MCA_BTL_SM_FRAG_TYPE_MASK | + MCA_BTL_SM_FRAG_STATUS_MASK)))); + endpoint = frag->endpoint; + btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) { + /* completion callback */ + frag->base.des_cbfunc(&mca_btl_sm.super, frag->endpoint, + &frag->base, status?OMPI_ERROR:OMPI_SUCCESS); + } + if( btl_ownership ) { + MCA_BTL_SM_FRAG_RETURN(frag); + } + if(opal_list_get_size(&endpoint->pending_sends)) { + process_pending_send(endpoint); + } + break; + } default: /* unknown */ hdr =
(mca_btl_sm_hdr_t*)((uintptr_t)hdr->frag | @@ -461,7 +463,6 @@ int mca_btl_sm_component_progress(void) my_smp_rank, peer_smp_rank, hdr, false, rc); break; } - rc++; } return rc; } diff --git a/ompi/mca/btl/sm/btl_sm_frag.c b/ompi/mca/btl/sm/btl_sm_frag.c index 21ec6a526f..956baaf530 100644 --- a/ompi/mca/btl/sm/btl_sm_frag.c +++ b/ompi/mca/btl/sm/btl_sm_frag.c @@ -36,24 +36,17 @@ static inline void mca_btl_sm_frag_common_constructor(mca_btl_sm_frag_t* frag) frag->base.des_flags = 0; } -static void mca_btl_sm_frag_constructor(mca_btl_sm_frag_t* frag) -{ - frag->size = 0; - frag->my_list = &mca_btl_sm_component.sm_frags; - mca_btl_sm_frag_common_constructor(frag); -} - static void mca_btl_sm_frag1_constructor(mca_btl_sm_frag_t* frag) { frag->size = mca_btl_sm_component.eager_limit; - frag->my_list = &mca_btl_sm_component.sm_frags1; + frag->my_list = &mca_btl_sm_component.sm_frags_eager; mca_btl_sm_frag_common_constructor(frag); } static void mca_btl_sm_frag2_constructor(mca_btl_sm_frag_t* frag) { frag->size = mca_btl_sm_component.max_frag_size; - frag->my_list = &mca_btl_sm_component.sm_frags2; + frag->my_list = &mca_btl_sm_component.sm_frags_max; mca_btl_sm_frag_common_constructor(frag); } @@ -62,12 +55,6 @@ static void mca_btl_sm_frag_destructor(mca_btl_sm_frag_t* frag) } -OBJ_CLASS_INSTANCE( - mca_btl_sm_frag_t, - mca_btl_base_descriptor_t, - mca_btl_sm_frag_constructor, - mca_btl_sm_frag_destructor); - OBJ_CLASS_INSTANCE( mca_btl_sm_frag1_t, mca_btl_base_descriptor_t, diff --git a/ompi/mca/btl/sm/btl_sm_frag.h b/ompi/mca/btl/sm/btl_sm_frag.h index 7820fc0a83..46aa1f5de4 100644 --- a/ompi/mca/btl/sm/btl_sm_frag.h +++ b/ompi/mca/btl/sm/btl_sm_frag.h @@ -69,24 +69,17 @@ OBJ_CLASS_DECLARATION(mca_btl_sm_frag_t); OBJ_CLASS_DECLARATION(mca_btl_sm_frag1_t); OBJ_CLASS_DECLARATION(mca_btl_sm_frag2_t); -#define MCA_BTL_SM_FRAG_ALLOC(frag, rc) \ -{ \ - ompi_free_list_item_t* item; \ - OMPI_FREE_LIST_WAIT(&mca_btl_sm_component.sm_frags, item, rc); \ - frag = (mca_btl_sm_frag_t*)item; \ -} - #define MCA_BTL_SM_FRAG_ALLOC1(frag, rc) \ { \ ompi_free_list_item_t* item; \ - OMPI_FREE_LIST_GET(&mca_btl_sm_component.sm_frags1, item, rc); \ + OMPI_FREE_LIST_GET(&mca_btl_sm_component.sm_frags_eager, item, rc); \ frag = (mca_btl_sm_frag_t*)item; \ } #define MCA_BTL_SM_FRAG_ALLOC2(frag, rc) \ { \ ompi_free_list_item_t* item; \ - OMPI_FREE_LIST_GET(&mca_btl_sm_component.sm_frags2, item, rc); \ + OMPI_FREE_LIST_GET(&mca_btl_sm_component.sm_frags_max, item, rc); \ frag = (mca_btl_sm_frag_t*)item; \ } diff --git a/ompi/mca/btl/tcp/btl_tcp.c b/ompi/mca/btl/tcp/btl_tcp.c index bbb73db9c8..4db854b00d 100644 --- a/ompi/mca/btl/tcp/btl_tcp.c +++ b/ompi/mca/btl/tcp/btl_tcp.c @@ -58,6 +58,7 @@ mca_btl_tcp_module_t mca_btl_tcp_module = { mca_btl_tcp_prepare_src, mca_btl_tcp_prepare_dst, mca_btl_tcp_send, + NULL, /* send immediate */ mca_btl_tcp_put, NULL, /* get */ mca_btl_base_dump, @@ -428,7 +429,7 @@ int mca_btl_tcp_put( mca_btl_base_module_t* btl, frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT; frag->hdr.count = frag->base.des_dst_cnt; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); - return mca_btl_tcp_endpoint_send(endpoint,frag); + return ((i = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? 
OMPI_SUCCESS : i); } @@ -448,6 +449,7 @@ int mca_btl_tcp_get( { mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*) btl; mca_btl_tcp_frag_t* frag = (mca_btl_tcp_frag_t*)descriptor; + int rc; frag->btl = tcp_btl; frag->endpoint = endpoint; @@ -464,7 +466,7 @@ int mca_btl_tcp_get( frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET; frag->hdr.count = frag->base.des_src_cnt; if (endpoint->endpoint_nbo) MCA_BTL_TCP_HDR_HTON(frag->hdr); - return mca_btl_tcp_endpoint_send(endpoint,frag); + return ((rc = mca_btl_tcp_endpoint_send(endpoint,frag)) >= 0 ? OMPI_SUCCESS : rc); } diff --git a/ompi/mca/btl/tcp/btl_tcp_endpoint.c b/ompi/mca/btl/tcp/btl_tcp_endpoint.c index 1083b0c190..19307fc663 100644 --- a/ompi/mca/btl/tcp/btl_tcp_endpoint.c +++ b/ompi/mca/btl/tcp/btl_tcp_endpoint.c @@ -245,6 +245,7 @@ int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tcp case MCA_BTL_TCP_CONNECT_ACK: case MCA_BTL_TCP_CLOSED: opal_list_append(&btl_endpoint->endpoint_frags, (opal_list_item_t*)frag); + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; if(btl_endpoint->endpoint_state == MCA_BTL_TCP_CLOSED) rc = mca_btl_tcp_endpoint_start_connect(btl_endpoint); break; @@ -258,16 +259,20 @@ int mca_btl_tcp_endpoint_send(mca_btl_base_endpoint_t* btl_endpoint, mca_btl_tcp int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); - frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc); + if( frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ) { + frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc); + } if( btl_ownership ) { MCA_BTL_TCP_FRAG_RETURN(frag); } - return OMPI_SUCCESS; + return 1; } else { btl_endpoint->endpoint_send_frag = frag; opal_event_add(&btl_endpoint->endpoint_send_event, 0); + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; } } else { + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; opal_list_append(&btl_endpoint->endpoint_frags, (opal_list_item_t*)frag); } break; @@ -762,6 +767,7 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user) /* if required - update request status and release fragment */ OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock); + assert( frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK ); frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, frag->rc); if( btl_ownership ) { MCA_BTL_TCP_FRAG_RETURN(frag); diff --git a/ompi/mca/btl/template/btl_template.c b/ompi/mca/btl/template/btl_template.c index f35737d635..afd5744e3e 100644 --- a/ompi/mca/btl/template/btl_template.c +++ b/ompi/mca/btl/template/btl_template.c @@ -54,6 +54,7 @@ mca_btl_template_module_t mca_btl_template_module = { mca_btl_template_prepare_src, mca_btl_template_prepare_dst, mca_btl_template_send, + NULL, /* send immediate */ mca_btl_template_put, NULL, /* get */ NULL, /*dump */ diff --git a/ompi/mca/btl/udapl/btl_udapl.c b/ompi/mca/btl/udapl/btl_udapl.c index fef75114ab..a77f397ed3 100644 --- a/ompi/mca/btl/udapl/btl_udapl.c +++ b/ompi/mca/btl/udapl/btl_udapl.c @@ -71,6 +71,7 @@ mca_btl_udapl_module_t mca_btl_udapl_module = { mca_btl_udapl_prepare_src, mca_btl_udapl_prepare_dst, mca_btl_udapl_send, + NULL, /* send immediate */ mca_btl_udapl_put, NULL, /* get */ mca_btl_base_dump, diff --git a/ompi/mca/io/romio/src/io_romio_component.c b/ompi/mca/io/romio/src/io_romio_component.c index 721114dd90..be74943b6a 100644 --- a/ompi/mca/io/romio/src/io_romio_component.c +++ 
b/ompi/mca/io/romio/src/io_romio_component.c @@ -285,7 +285,7 @@ static int progress() /* we're done, so remove us from the pending list */ opal_list_remove_item(&mca_io_romio_pending_requests, item); /* mark as complete (and make sure to wake up any waiters) */ - ompi_request_complete((ompi_request_t*) item); + ompi_request_complete((ompi_request_t*) item, true); mca_io_base_request_progress_del(); /* if the request has been freed already, the user isn't * going to call test or wait on us, so we need to do it diff --git a/ompi/mca/mtl/portals/mtl_portals.h b/ompi/mca/mtl/portals/mtl_portals.h index cc07ba337d..ddc803c1d2 100644 --- a/ompi/mca/mtl/portals/mtl_portals.h +++ b/ompi/mca/mtl/portals/mtl_portals.h @@ -92,6 +92,9 @@ struct mca_mtl_portals_module_t { /* use rendezvous for long messages */ bool ptl_use_rendezvous; + /* output channel for debugging */ + int portals_output; + }; typedef struct mca_mtl_portals_module_t mca_mtl_portals_module_t; diff --git a/ompi/mca/mtl/portals/mtl_portals_component.c b/ompi/mca/mtl/portals/mtl_portals_component.c index 744d4b8828..066a7143ad 100644 --- a/ompi/mca/mtl/portals/mtl_portals_component.c +++ b/ompi/mca/mtl/portals/mtl_portals_component.c @@ -20,6 +20,7 @@ #include "opal/event/event.h" #include "opal/mca/base/mca_base_param.h" +#include "orte/util/output.h" #include "ompi/datatype/convertor.h" #include "ompi/mca/common/portals/common_portals.h" @@ -61,6 +62,8 @@ mca_mtl_base_component_1_0_0_t mca_mtl_portals_component = { ompi_mtl_portals_component_init, /* component init */ }; +static opal_output_stream_t mtl_portals_output_stream; + static int ompi_mtl_portals_component_open(void) { @@ -98,6 +101,24 @@ ompi_mtl_portals_component_open(void) 15 * 1024 * 1024, &ompi_mtl_portals.ptl_recv_short_mds_size); + OBJ_CONSTRUCT(&mtl_portals_output_stream, opal_output_stream_t); + mtl_portals_output_stream.lds_is_debugging = true; + mtl_portals_output_stream.lds_want_stdout = true; + mtl_portals_output_stream.lds_file_suffix = "mtl-portals"; + mca_base_param_reg_int(&mca_mtl_portals_component.mtl_version, + "debug_level", + "Debugging verbosity (0 - 100)", + false, + false, + 0, + &(mtl_portals_output_stream.lds_verbose_level)); + asprintf(&(mtl_portals_output_stream.lds_prefix), + "mtl: portals (%s): ", ompi_common_portals_nodeid()); + ompi_mtl_portals.portals_output = + orte_output_open(&mtl_portals_output_stream, "MTL", "PORTALS", "DEBUG", NULL); + + + ompi_mtl_portals.ptl_ni_h = PTL_INVALID_HANDLE; mca_base_param_reg_int(&mca_mtl_portals_component.mtl_version, diff --git a/ompi/mca/mtl/portals/mtl_portals_send_short.c b/ompi/mca/mtl/portals/mtl_portals_send_short.c index 6ecf1743b6..1a6652cb52 100644 --- a/ompi/mca/mtl/portals/mtl_portals_send_short.c +++ b/ompi/mca/mtl/portals/mtl_portals_send_short.c @@ -18,6 +18,8 @@ #include "ompi_config.h" +#include "orte/util/output.h" + #include "mtl_portals.h" #include "mtl_portals_request.h" #include "mtl_portals_send_short.h" diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 6c729c20e6..b9107a5d8d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -1046,8 +1046,8 @@ rdma_send_info_send(ompi_osc_rdma_module_t *module, bml_btl = peer_send_info->bml_btl; mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, - sizeof(ompi_osc_rdma_rdma_info_header_t), - MCA_BTL_DES_FLAGS_PRIORITY); + sizeof(ompi_osc_rdma_rdma_info_header_t), + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if (NULL ==
descriptor) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; diff --git a/ompi/mca/osc/rdma/osc_rdma_data_move.c b/ompi/mca/osc/rdma/osc_rdma_data_move.c index 5f3716caf0..ffe13b9cad 100644 --- a/ompi/mca/osc/rdma/osc_rdma_data_move.c +++ b/ompi/mca/osc/rdma/osc_rdma_data_move.c @@ -448,7 +448,7 @@ ompi_osc_rdma_sendreq_send(ompi_osc_rdma_module_t *module, mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, module->m_use_buffers ? bml_btl->btl_eager_limit : needed_len < bml_btl->btl_eager_limit ? needed_len : - bml_btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY); + bml_btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if (NULL == descriptor) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; @@ -690,7 +690,7 @@ ompi_osc_rdma_replyreq_send(ompi_osc_rdma_module_t *module, endpoint = (mca_bml_base_endpoint_t*) replyreq->rep_origin_proc->proc_bml; bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, - bml_btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY); + bml_btl->btl_eager_limit, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if (NULL == descriptor) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; @@ -1295,7 +1295,7 @@ ompi_osc_rdma_control_send(ompi_osc_rdma_module_t *module, bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); mca_bml_base_alloc(bml_btl, &descriptor, MCA_BTL_NO_ORDER, sizeof(ompi_osc_rdma_control_header_t), - MCA_BTL_DES_FLAGS_PRIORITY); + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if (NULL == descriptor) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; @@ -1357,7 +1357,7 @@ ompi_osc_rdma_rdma_ack_send(ompi_osc_rdma_module_t *module, /* Get a BTL and a fragment to go with it */ mca_bml_base_alloc(bml_btl, &descriptor, rdma_btl->rdma_order, sizeof(ompi_osc_rdma_control_header_t), - MCA_BTL_DES_FLAGS_PRIORITY); + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_SEND_ALWAYS_CALLBACK); if (NULL == descriptor) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto cleanup; diff --git a/ompi/mca/pml/cm/pml_cm_recvreq.h b/ompi/mca/pml/cm/pml_cm_recvreq.h index bd7edac6a5..8feb32e4a9 100644 --- a/ompi/mca/pml/cm/pml_cm_recvreq.h +++ b/ompi/mca/pml/cm/pml_cm_recvreq.h @@ -214,7 +214,7 @@ do { \ */ #define MCA_PML_CM_THIN_RECV_REQUEST_MPI_COMPLETE( recvreq ) \ do { \ - ompi_request_complete( &(recvreq->req_base.req_ompi) ); \ + ompi_request_complete( &(recvreq->req_base.req_ompi), true ); \ } while (0) @@ -233,7 +233,7 @@ do { \ MCA_PML_CM_THIN_RECV_REQUEST_RETURN( recvreq ); \ } else { \ recvreq->req_base.req_pml_complete = true; \ - ompi_request_complete( &(recvreq->req_base.req_ompi) ); \ + ompi_request_complete( &(recvreq->req_base.req_ompi), true ); \ } \ OPAL_THREAD_UNLOCK(&ompi_request_lock); \ } while(0) @@ -262,7 +262,7 @@ do { \ ompi_convertor_set_position(&recvreq->req_base.req_convertor, &offset); \ } \ recvreq->req_base.req_pml_complete = true; \ - ompi_request_complete( &(recvreq->req_base.req_ompi) ); \ + ompi_request_complete( &(recvreq->req_base.req_ompi), true ); \ } \ OPAL_THREAD_UNLOCK(&ompi_request_lock); \ } while(0) diff --git a/ompi/mca/pml/cm/pml_cm_sendreq.h b/ompi/mca/pml/cm/pml_cm_sendreq.h index 72b13d1232..918b4aa7be 100644 --- a/ompi/mca/pml/cm/pml_cm_sendreq.h +++ b/ompi/mca/pml/cm/pml_cm_sendreq.h @@ -241,28 +241,28 @@ do { \ } while(0); -#define MCA_PML_CM_HVY_SEND_REQUEST_START(sendreq, ret) \ -do { \ - ret = OMPI_SUCCESS; \ - MCA_PML_CM_SEND_REQUEST_START_SETUP(&(sendreq)->req_send); \ - 
if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { \ - MCA_PML_CM_HVY_SEND_REQUEST_BSEND_ALLOC(sendreq, ret); \ - } \ - if (OMPI_SUCCESS == ret) { \ - ret = OMPI_MTL_CALL(isend(ompi_mtl, \ - sendreq->req_send.req_base.req_comm, \ - sendreq->req_peer, \ - sendreq->req_tag, \ - &sendreq->req_send.req_base.req_convertor, \ - sendreq->req_send.req_send_mode, \ - sendreq->req_blocking, \ - &sendreq->req_mtl)); \ - if(OMPI_SUCCESS == ret && \ - sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { \ - sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR = 0; \ - ompi_request_complete(&(sendreq)->req_send.req_base.req_ompi); \ - } \ - } \ +#define MCA_PML_CM_HVY_SEND_REQUEST_START(sendreq, ret) \ +do { \ + ret = OMPI_SUCCESS; \ + MCA_PML_CM_SEND_REQUEST_START_SETUP(&(sendreq)->req_send); \ + if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { \ + MCA_PML_CM_HVY_SEND_REQUEST_BSEND_ALLOC(sendreq, ret); \ + } \ + if (OMPI_SUCCESS == ret) { \ + ret = OMPI_MTL_CALL(isend(ompi_mtl, \ + sendreq->req_send.req_base.req_comm, \ + sendreq->req_peer, \ + sendreq->req_tag, \ + &sendreq->req_send.req_base.req_convertor, \ + sendreq->req_send.req_send_mode, \ + sendreq->req_blocking, \ + &sendreq->req_mtl)); \ + if(OMPI_SUCCESS == ret && \ + sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED) { \ + sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR = 0; \ + ompi_request_complete(&(sendreq)->req_send.req_base.req_ompi, true); \ + } \ + } \ } while (0) /* @@ -272,36 +272,36 @@ do { \ * This macro will never be called directly from the upper level, as it should * only be an internal call to the PML. */ -#define MCA_PML_CM_HVY_SEND_REQUEST_PML_COMPLETE(sendreq) \ -do { \ - assert( false == sendreq->req_send.req_base.req_pml_complete ); \ - \ - if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED && \ - sendreq->req_count > 0 ) { \ - mca_pml_base_bsend_request_free(sendreq->req_buff); \ - } \ - \ - OPAL_THREAD_LOCK(&ompi_request_lock); \ - if( false == sendreq->req_send.req_base.req_ompi.req_complete ) { \ - /* Should only be called for long messages (maybe synchronous) */ \ - ompi_request_complete(&(sendreq->req_send.req_base.req_ompi)); \ - } \ - sendreq->req_send.req_base.req_pml_complete = true; \ - \ - if( sendreq->req_send.req_base.req_free_called ) { \ - MCA_PML_CM_HVY_SEND_REQUEST_RETURN( sendreq ); \ - } else { \ - if(sendreq->req_send.req_base.req_ompi.req_persistent) { \ - /* rewind convertor */ \ - size_t offset = 0; \ +#define MCA_PML_CM_HVY_SEND_REQUEST_PML_COMPLETE(sendreq) \ +do { \ + assert( false == sendreq->req_send.req_base.req_pml_complete ); \ + \ + if (sendreq->req_send.req_send_mode == MCA_PML_BASE_SEND_BUFFERED && \ + sendreq->req_count > 0 ) { \ + mca_pml_base_bsend_request_free(sendreq->req_buff); \ + } \ + \ + OPAL_THREAD_LOCK(&ompi_request_lock); \ + if( false == sendreq->req_send.req_base.req_ompi.req_complete ) { \ + /* Should only be called for long messages (maybe synchronous) */ \ + ompi_request_complete(&(sendreq->req_send.req_base.req_ompi), true); \ + } \ + sendreq->req_send.req_base.req_pml_complete = true; \ + \ + if( sendreq->req_send.req_base.req_free_called ) { \ + MCA_PML_CM_HVY_SEND_REQUEST_RETURN( sendreq ); \ + } else { \ + if(sendreq->req_send.req_base.req_ompi.req_persistent) { \ + /* rewind convertor */ \ + size_t offset = 0; \ ompi_convertor_set_position(&sendreq->req_send.req_base.req_convertor, \ - &offset); \ - } \ - } \ - OPAL_THREAD_UNLOCK(&ompi_request_lock); \ + &offset); \ + } \ + 
} \ + OPAL_THREAD_UNLOCK(&ompi_request_lock); \ } while (0) - - + + /* * Release resources associated with a request */ @@ -323,21 +323,21 @@ do { \ * This macro will never be called directly from the upper level, as it should * only be an internal call to the PML. */ -#define MCA_PML_CM_THIN_SEND_REQUEST_PML_COMPLETE(sendreq) \ -do { \ - assert( false == sendreq->req_send.req_base.req_pml_complete ); \ - \ - OPAL_THREAD_LOCK(&ompi_request_lock); \ - if( false == sendreq->req_send.req_base.req_ompi.req_complete ) { \ - /* Should only be called for long messages (maybe synchronous) */ \ - ompi_request_complete(&(sendreq->req_send.req_base.req_ompi)); \ - } \ - sendreq->req_send.req_base.req_pml_complete = true; \ - \ - if( sendreq->req_send.req_base.req_free_called ) { \ - MCA_PML_CM_THIN_SEND_REQUEST_RETURN( sendreq ); \ - } \ - OPAL_THREAD_UNLOCK(&ompi_request_lock); \ +#define MCA_PML_CM_THIN_SEND_REQUEST_PML_COMPLETE(sendreq) \ +do { \ + assert( false == sendreq->req_send.req_base.req_pml_complete ); \ + \ + OPAL_THREAD_LOCK(&ompi_request_lock); \ + if( false == sendreq->req_send.req_base.req_ompi.req_complete ) { \ + /* Should only be called for long messages (maybe synchronous) */ \ + ompi_request_complete(&(sendreq->req_send.req_base.req_ompi), true); \ + } \ + sendreq->req_send.req_base.req_pml_complete = true; \ + \ + if( sendreq->req_send.req_base.req_free_called ) { \ + MCA_PML_CM_THIN_SEND_REQUEST_RETURN( sendreq ); \ + } \ + OPAL_THREAD_UNLOCK(&ompi_request_lock); \ } while (0) diff --git a/ompi/mca/pml/dr/pml_dr_recvreq.c b/ompi/mca/pml/dr/pml_dr_recvreq.c index 51e22ee659..d551ab09af 100644 --- a/ompi/mca/pml/dr/pml_dr_recvreq.c +++ b/ompi/mca/pml/dr/pml_dr_recvreq.c @@ -112,7 +112,7 @@ static int mca_pml_dr_recv_request_cancel(struct ompi_request_t* ompi_request, i * on this request will be able to complete. As the status is marked as * cancelled the cancel state will be detected. 
  */
-    ompi_request_complete(ompi_request);
+    ompi_request_complete(ompi_request, true);
     OPAL_THREAD_UNLOCK(&ompi_request_lock);
     return OMPI_SUCCESS;
 }
diff --git a/ompi/mca/pml/dr/pml_dr_recvreq.h b/ompi/mca/pml/dr/pml_dr_recvreq.h
index 122b75eb59..31da08c8ef 100644
--- a/ompi/mca/pml/dr/pml_dr_recvreq.h
+++ b/ompi/mca/pml/dr/pml_dr_recvreq.h
@@ -127,16 +127,16 @@ do {                                                      \
     /* initialize request status */                                        \
     recvreq->req_recv.req_base.req_pml_complete = true;                    \
-    if (recvreq->req_bytes_received > recvreq->req_bytes_delivered) {      \
-        recvreq->req_recv.req_base.req_ompi.req_status._count =            \
-            recvreq->req_bytes_delivered;                                  \
-        recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =         \
-            MPI_ERR_TRUNCATE;                                              \
-    } else {                                                               \
-        recvreq->req_recv.req_base.req_ompi.req_status._count =            \
-            recvreq->req_bytes_received;                                   \
-    }                                                                      \
-    ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi) );       \
+    if (recvreq->req_bytes_received > recvreq->req_bytes_delivered) {      \
+        recvreq->req_recv.req_base.req_ompi.req_status._count =            \
+            recvreq->req_bytes_delivered;                                  \
+        recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR =         \
+            MPI_ERR_TRUNCATE;                                              \
+    } else {                                                               \
+        recvreq->req_recv.req_base.req_ompi.req_status._count =            \
+            recvreq->req_bytes_received;                                   \
+    }                                                                      \
+    ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \
                                                                            \
     if( true == recvreq->req_recv.req_base.req_free_called ) {             \
         MCA_PML_DR_RECV_REQUEST_RETURN( recvreq );                         \
diff --git a/ompi/mca/pml/dr/pml_dr_sendreq.h b/ompi/mca/pml/dr/pml_dr_sendreq.h
index 9234016adc..fff1f71358 100644
--- a/ompi/mca/pml/dr/pml_dr_sendreq.h
+++ b/ompi/mca/pml/dr/pml_dr_sendreq.h
@@ -211,7 +211,7 @@ do {
  * Mark a send request as completed at the MPI level.
  */
-#define MCA_PML_DR_SEND_REQUEST_MPI_COMPLETE(sendreq)                       \
+#define MCA_PML_DR_SEND_REQUEST_MPI_COMPLETE(sendreq)                        \
 do {                                                                         \
     (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE =            \
         (sendreq)->req_send.req_base.req_comm->c_my_rank;                    \
@@ -220,7 +220,7 @@ do {
     (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \
     (sendreq)->req_send.req_base.req_ompi.req_status._count =                \
         (sendreq)->req_send.req_bytes_packed;                                \
-    ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi) );       \
+    ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), true ); \
 } while(0)
 
 /*
diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c
index ba33f3418a..9b5f1a1de5 100644
--- a/ompi/mca/pml/ob1/pml_ob1.c
+++ b/ompi/mca/pml/ob1/pml_ob1.c
@@ -365,7 +365,8 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
     int rc;
 
     mca_bml_base_alloc(bml_btl, &fin, order, sizeof(mca_pml_ob1_fin_hdr_t),
-                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
+                       MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
+                       MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
 
     if(NULL == fin) {
         MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
@@ -387,13 +388,15 @@ int mca_pml_ob1_send_fin( ompi_proc_t* proc,
 
     rc = mca_bml_base_send( bml_btl, fin, MCA_PML_OB1_HDR_TYPE_FIN );
-    if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
-        mca_bml_base_free(bml_btl, fin);
-        MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
-        return OMPI_ERR_OUT_OF_RESOURCE;
+    if( OPAL_LIKELY( rc >= 0 ) ) {
+        if( OPAL_LIKELY( 1 == rc ) ) {
+            MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
+        }
+        return OMPI_SUCCESS;
     }
-
-    return OMPI_SUCCESS;
+    mca_bml_base_free(bml_btl, fin);
+    MCA_PML_OB1_ADD_FIN_TO_PENDING(proc, hdr_des, bml_btl, order, status);
+    return OMPI_ERR_OUT_OF_RESOURCE;
 }
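The inverted error path above also introduces the return convention the rest of this patch relies on: call sites now treat any non-negative value from mca_bml_base_send() / mca_bml_base_send_status() as success, and a return of 1 appears to mean the BTL finished the send inside btl_send() and skipped the descriptor callback, so the caller performs the completion work itself. A minimal caller-side sketch of the idiom; example_send() and do_inline_completion() are hypothetical stand-ins, not functions from this patch:

    /* Sketch only: the caller-side idiom adopted throughout this patch.
     * do_inline_completion() stands for whatever the descriptor callback
     * would have done at this call site (often just
     * MCA_PML_OB1_PROGRESS_PENDING). */
    static int example_send( mca_bml_base_btl_t* bml_btl,
                             mca_btl_base_descriptor_t* des,
                             mca_btl_base_tag_t tag )
    {
        int rc = mca_bml_base_send(bml_btl, des, tag);
        if( OPAL_LIKELY( rc >= 0 ) ) {
            if( 1 == rc ) {
                /* completed inside btl_send(): the callback was skipped,
                 * so do its work here */
                do_inline_completion(bml_btl);
            }
            return OMPI_SUCCESS;          /* rc == 0: callback fires later */
        }
        mca_bml_base_free(bml_btl, des);  /* rc < 0: send failed */
        return rc;
    }

 void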
mca_pml_ob1_process_pending_packets(mca_bml_base_btl_t* bml_btl) diff --git a/ompi/mca/pml/ob1/pml_ob1_hdr.h b/ompi/mca/pml/ob1/pml_ob1_hdr.h index 5e878324a8..af3e3fb82d 100644 --- a/ompi/mca/pml/ob1/pml_ob1_hdr.h +++ b/ompi/mca/pml/ob1/pml_ob1_hdr.h @@ -75,6 +75,12 @@ struct mca_pml_ob1_match_hdr_t { uint8_t hdr_padding[2]; /**< explicitly pad to 16 bytes. Compilers seem to already prefer to do this, but make it explicit just in case */ #endif }; +#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT +#define OMPI_PML_OB1_MATCH_HDR_LEN 16 +#else +#define OMPI_PML_OB1_MATCH_HDR_LEN 14 +#endif + typedef struct mca_pml_ob1_match_hdr_t mca_pml_ob1_match_hdr_t; #define MCA_PML_OB1_MATCH_HDR_NTOH(h) \ diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c index 48f63dd1c0..f385a32c43 100644 --- a/ompi/mca/pml/ob1/pml_ob1_isend.c +++ b/ompi/mca/pml/ob1/pml_ob1_isend.c @@ -68,6 +68,7 @@ int mca_pml_ob1_isend(void *buf, { int rc; mca_pml_ob1_send_request_t *sendreq = NULL; + MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); if (rc != OMPI_SUCCESS) return rc; @@ -99,6 +100,7 @@ int mca_pml_ob1_send(void *buf, { int rc; mca_pml_ob1_send_request_t *sendreq; + MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq, rc); if (rc != OMPI_SUCCESS) return rc; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 0c540b6667..01c237bc8e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -89,8 +89,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, size_t num_segments = des->des_dst_cnt; size_t bytes_received = 0; - - if( OPAL_UNLIKELY(segment->seg_len < sizeof(mca_pml_ob1_match_hdr_t))) { + if( OPAL_UNLIKELY(segment->seg_len < OMPI_PML_OB1_MATCH_HDR_LEN) ) { return; } ob1_hdr_ntoh(((mca_pml_ob1_hdr_t*) hdr), MCA_PML_OB1_HDR_TYPE_MATCH); @@ -169,7 +168,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, OPAL_THREAD_UNLOCK(&comm->matching_lock); if(OPAL_LIKELY(match)) { - bytes_received = segment->seg_len - sizeof(mca_pml_ob1_match_hdr_t); + bytes_received = segment->seg_len - OMPI_PML_OB1_MATCH_HDR_LEN; match->req_recv.req_bytes_packed = bytes_received; MCA_PML_OB1_RECV_REQUEST_MATCHED(match, hdr); @@ -189,7 +188,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, iov[0].iov_len = bytes_received; iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + - sizeof(mca_pml_ob1_match_hdr_t)); + OMPI_PML_OB1_MATCH_HDR_LEN); if(num_segments > 1) { bytes_received += segment[1].seg_len; iov[1].iov_len = segment[1].seg_len; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index efebeba559..b25f37fe15 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -235,11 +235,14 @@ int mca_pml_ob1_recv_request_ack_send_btl( des->des_cbfunc = mca_pml_ob1_recv_ctl_completion; rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_ACK); - if( OPAL_UNLIKELY(rc != OMPI_SUCCESS) ) { - mca_bml_base_free(bml_btl, des); - return OMPI_ERR_OUT_OF_RESOURCE; + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + MCA_PML_OB1_PROGRESS_PENDING(bml_btl); + } + return OMPI_SUCCESS; } - return OMPI_SUCCESS; + mca_bml_base_free(bml_btl, des); + return OMPI_ERR_OUT_OF_RESOURCE; } static int mca_pml_ob1_recv_request_ack( @@ -614,7 +617,7 @@ void mca_pml_ob1_recv_request_progress_match( mca_pml_ob1_recv_request_t* recvre MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( segments, num_segments, 0, 
                                            bytes_received );
-        bytes_received -= sizeof(mca_pml_ob1_match_hdr_t);
+        bytes_received -= OMPI_PML_OB1_MATCH_HDR_LEN;
         recvreq->req_recv.req_bytes_packed = bytes_received;
 
         MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match);
@@ -630,7 +633,7 @@ void mca_pml_ob1_recv_request_progress_match( mca_pml_ob1_recv_request_t* recvre
             MCA_PML_OB1_RECV_REQUEST_UNPACK( recvreq,
                                              segments,
                                              num_segments,
-                                             sizeof(mca_pml_ob1_match_hdr_t),
+                                             OMPI_PML_OB1_MATCH_HDR_LEN,
                                              data_offset,
                                              bytes_received,
                                              bytes_delivered);
@@ -666,7 +669,7 @@ void mca_pml_ob1_recv_request_matched_probe( mca_pml_ob1_recv_request_t* recvreq
     case MCA_PML_OB1_HDR_TYPE_MATCH:
         MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( segments, num_segments,
-                                            sizeof(mca_pml_ob1_match_hdr_t),
+                                            OMPI_PML_OB1_MATCH_HDR_LEN,
                                             bytes_packed );
         break;
@@ -819,16 +822,19 @@
         /* send rdma request to peer */
         rc = mca_bml_base_send(bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
-        if(OPAL_LIKELY(OMPI_SUCCESS == rc)) {
+        if( OPAL_LIKELY( rc >= 0 ) ) {
             /* update request state */
             recvreq->req_rdma_offset += size;
             OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
             recvreq->req_rdma[rdma_idx].length -= size;
             bytes_remaining -= size;
+            if( OPAL_LIKELY( 1 == rc ) ) {
+                /* Send completed inline; progress any pending requests */
+                MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
+            }
         } else {
             mca_bml_base_free(bml_btl,ctl);
             mca_bml_base_free(bml_btl,dst);
-            continue;
         }
     }
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
index abbf7ae46a..eef7018ae6 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -119,7 +119,7 @@ do {                                                      \
 do {                                                                       \
     PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE,                     \
                              &(recvreq->req_recv.req_base), PERUSE_RECV ); \
-    ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi) );       \
+    ompi_request_complete( &(recvreq->req_recv.req_base.req_ompi), true ); \
 } while (0)
 
 /*
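The sizeof(mca_pml_ob1_match_hdr_t) to OMPI_PML_OB1_MATCH_HDR_LEN substitutions above (and in the recvfrag hunks earlier) make sender and receiver agree on the wire size of the match header rather than on a compiler-dependent struct size: the header fields occupy 14 bytes, and only heterogeneous builds append hdr_padding[2] to reach 16. A compile-time cross-check one could add; the field sizes below are my reading of pml_ob1_hdr.h, not something this patch asserts:

    /* Illustrative only.  14 = common header (2 bytes) + hdr_ctx (2)
     * + hdr_src (4) + hdr_tag (4) + hdr_seq (2); heterogeneous builds
     * add hdr_padding[2] for 16 total.  When the padding is present the
     * wire length and the struct size must coincide, which the classic
     * negative-array-size trick verifies at compile time. */
    #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
    typedef char ompi_pml_ob1_match_hdr_len_check
        [(sizeof(mca_pml_ob1_match_hdr_t) == OMPI_PML_OB1_MATCH_HDR_LEN) ? 1 : -1];
    #endif

diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
index 7cc6772548..d77b4c238c 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -146,25 +146,43 @@ OBJ_CLASS_INSTANCE( mca_pml_ob1_send_request_t,
  * Completion of a short message - nothing left to schedule.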
*/ -static void -mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor, - int status ) +static inline void +mca_pml_ob1_match_fast_completion_free( struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des ) { - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata; - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context; - + mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; + if( sendreq->req_send.req_bytes_packed > 0 ) { PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, &(sendreq->req_send.req_base), PERUSE_SEND ); } - /* check completion status */ - if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { - /* TSW - FIX */ - orte_output(0, "%s:%d FATAL", __FILE__, __LINE__); - orte_errmgr.abort(-1, NULL); + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_END, + &(sendreq->req_send.req_base), PERUSE_SEND); + } + + /* + * We are on the fast path, so there is no need to lock the request, as at + * this point there is only one reference to it. Moreover, there is no + * need to signal anything, as nobody is waiting on it. + */ + MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, false); + sendreq->req_send.req_base.req_pml_complete = true; + + /* check for pending requests */ + MCA_PML_OB1_PROGRESS_PENDING(bml_btl); +} + +static inline void +mca_pml_ob1_match_completion_free_request( mca_bml_base_btl_t* bml_btl, + mca_pml_ob1_send_request_t* sendreq ) +{ + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, + &(sendreq->req_send.req_base), PERUSE_SEND ); } /* signal request completion */ @@ -174,24 +192,14 @@ mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl, MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } -/* - * Completion of the first fragment of a long message that - * requires an acknowledgement - */ -static void -mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor, - int status ) +static inline void +mca_pml_ob1_match_completion_free( struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) { - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata; - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)descriptor->des_context; - size_t req_bytes_delivered = 0; - - if( sendreq->req_send.req_bytes_packed > 0 ) { - PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, - &(sendreq->req_send.req_base), PERUSE_SEND ); - } + mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; /* check completion status */ if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { @@ -199,15 +207,18 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl, orte_output(0, "%s:%d FATAL", __FILE__, __LINE__); orte_errmgr.abort(-1, NULL); } + mca_pml_ob1_match_completion_free_request( bml_btl, sendreq ); +} - /* count bytes of user data actually delivered. As the rndv completion only - * happens in one thread, the increase of the req_bytes_delivered does not - * have to be atomic. 
- */ - MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( descriptor->des_src, - descriptor->des_src_cnt, - sizeof(mca_pml_ob1_rendezvous_hdr_t), - req_bytes_delivered ); +static inline void +mca_pml_ob1_rndv_completion_request( mca_bml_base_btl_t* bml_btl, + mca_pml_ob1_send_request_t* sendreq, + size_t req_bytes_delivered ) +{ + if( sendreq->req_send.req_bytes_packed > 0 ) { + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_XFER_BEGIN, + &(sendreq->req_send.req_base), PERUSE_SEND ); + } OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered); @@ -220,12 +231,45 @@ mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl, MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } +/* + * Completion of the first fragment of a long message that + * requires an acknowledgement + */ +static inline void +mca_pml_ob1_rndv_completion( mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* ep, + struct mca_btl_base_descriptor_t* des, + int status ) +{ + mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*)des->des_context; + size_t req_bytes_delivered = 0; + + /* check completion status */ + if( OPAL_UNLIKELY(OMPI_SUCCESS != status) ) { + /* TSW - FIX */ + orte_output(0, "%s:%d FATAL", __FILE__, __LINE__); + orte_errmgr.abort(-1, NULL); + } + + /* count bytes of user data actually delivered. As the rndv completion only + * happens in one thread, the increase of the req_bytes_delivered does not + * have to be atomic. + */ + MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, + des->des_src_cnt, + sizeof(mca_pml_ob1_rendezvous_hdr_t), + req_bytes_delivered ); + + mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered ); +} + /** * Completion of a get request. */ -static void +static inline void mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, struct mca_btl_base_descriptor_t* des, @@ -251,13 +295,13 @@ mca_pml_ob1_rget_completion( mca_btl_base_module_t* btl, * Completion of a control message - return resources. */ -static void +static inline void mca_pml_ob1_send_ctl_completion( mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor, + struct mca_btl_base_descriptor_t* des, int status ) { - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; /* check for pending requests */ MCA_PML_OB1_PROGRESS_PENDING(bml_btl); @@ -268,14 +312,14 @@ mca_pml_ob1_send_ctl_completion( mca_btl_base_module_t* btl, * to schedule additional fragments. 
*/ -static void +static inline void mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* ep, - struct mca_btl_base_descriptor_t* descriptor, + struct mca_btl_base_descriptor_t* des, int status ) { - mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)descriptor->des_cbdata; - mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) descriptor->des_context; + mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)des->des_cbdata; + mca_bml_base_btl_t* bml_btl = (mca_bml_base_btl_t*) des->des_context; size_t req_bytes_delivered = 0; /* check completion status */ @@ -286,8 +330,8 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl, } /* count bytes of user data actually delivered */ - MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( descriptor->des_src, - descriptor->des_src_cnt, + MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, + des->des_src_cnt, sizeof(mca_pml_ob1_frag_hdr_t), req_bytes_delivered ); @@ -310,23 +354,23 @@ int mca_pml_ob1_send_request_start_buffered( mca_bml_base_btl_t* bml_btl, size_t size) { - mca_btl_base_descriptor_t* descriptor; + mca_btl_base_descriptor_t* des; mca_btl_base_segment_t* segment; mca_pml_ob1_hdr_t* hdr; struct iovec iov; unsigned int iov_count; - size_t max_data; + size_t max_data, req_bytes_delivered; int rc; /* allocate descriptor */ - mca_bml_base_alloc(bml_btl, &descriptor, + mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_rendezvous_hdr_t) + size, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if( OPAL_UNLIKELY(NULL == descriptor) ) { + if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } - segment = descriptor->des_src; + segment = des->des_src; /* pack the data into the BTL supplied buffer */ iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + @@ -338,9 +382,10 @@ int mca_pml_ob1_send_request_start_buffered( &iov, &iov_count, &max_data)) < 0) { - mca_bml_base_free(bml_btl, descriptor); + mca_bml_base_free(bml_btl, des); return rc; } + req_bytes_delivered = max_data; /* build rendezvous header */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; @@ -354,18 +399,18 @@ int mca_pml_ob1_send_request_start_buffered( hdr->hdr_rndv.hdr_src_req.pval = sendreq; ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_RNDV, - sendreq->req_send.req_base.req_proc); + sendreq->req_send.req_base.req_proc); /* update lengths */ segment->seg_len = sizeof(mca_pml_ob1_rendezvous_hdr_t) + max_data; - descriptor->des_cbfunc = mca_pml_ob1_rndv_completion; - descriptor->des_cbdata = sendreq; + des->des_cbfunc = mca_pml_ob1_rndv_completion; + des->des_cbdata = sendreq; /* buffer the remainder of the message */ rc = mca_pml_base_bsend_request_alloc((ompi_request_t*)sendreq); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_bml_base_free(bml_btl, descriptor); + mca_bml_base_free(bml_btl, des); return rc; } @@ -376,7 +421,7 @@ int mca_pml_ob1_send_request_start_buffered( &iov, &iov_count, &max_data)) < 0) { - mca_bml_base_free(bml_btl, descriptor); + mca_bml_base_free(bml_btl, des); return rc; } @@ -391,14 +436,18 @@ int mca_pml_ob1_send_request_start_buffered( /* request is complete at mpi level */ OPAL_THREAD_LOCK(&ompi_request_lock); - MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq); + MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, true); OPAL_THREAD_UNLOCK(&ompi_request_lock); /* send */ - rc = mca_bml_base_send(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_RNDV); - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_bml_base_free(bml_btl, descriptor ); + rc = 
mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, req_bytes_delivered); + } + return OMPI_SUCCESS; } + mca_bml_base_free(bml_btl, des ); return rc; } @@ -413,33 +462,75 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size ) { - mca_btl_base_descriptor_t* descriptor; + mca_btl_base_descriptor_t* des = NULL; mca_btl_base_segment_t* segment; mca_pml_ob1_hdr_t* hdr; struct iovec iov; unsigned int iov_count; size_t max_data = size; int rc; +#if 0 + if(NULL != bml_btl->btl_sendi) { + mca_pml_ob1_match_hdr_t match; + match.hdr_common.hdr_flags = 0; + match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH; + match.hdr_ctx = sendreq->req_send.req_base.req_comm->c_contextid; + match.hdr_src = sendreq->req_send.req_base.req_comm->c_my_rank; + match.hdr_tag = sendreq->req_send.req_base.req_tag; + match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; + + ob1_hdr_hton(&match, MCA_PML_OB1_HDR_TYPE_MATCH, + sendreq->req_send.req_base.req_proc); - /* allocate descriptor */ - mca_bml_base_alloc( bml_btl, &descriptor, - MCA_BTL_NO_ORDER, - sizeof(mca_pml_ob1_match_hdr_t) + size, - MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); - if( OPAL_UNLIKELY(NULL == descriptor) ) { - return OMPI_ERR_OUT_OF_RESOURCE; + /* try to send immediately */ + rc = mca_bml_base_sendi( bml_btl, &sendreq->req_send.req_base.req_convertor, + &match, OMPI_PML_OB1_MATCH_HDR_LEN, + size, MCA_BTL_NO_ORDER, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, + MCA_PML_OB1_HDR_TYPE_MATCH, + &des); + if( OMPI_SUCCESS == rc ) { + + /* signal request completion */ + send_request_pml_complete(sendreq); + + /* check for pending requests */ + MCA_PML_OB1_PROGRESS_PENDING(bml_btl); + return OMPI_SUCCESS; + } + switch(rc) { + case OMPI_ERR_RESOURCE_BUSY: + if(OPAL_UNLIKELY(NULL == des)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + break; + default: + return rc; + break; + } + } else +#endif + { + /* allocate descriptor */ + mca_bml_base_alloc( bml_btl, &des, + MCA_BTL_NO_ORDER, + OMPI_PML_OB1_MATCH_HDR_LEN + size, + MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); + if( OPAL_UNLIKELY(NULL == des) ) { + return OMPI_ERR_OUT_OF_RESOURCE; + } } - segment = descriptor->des_src; + segment = des->des_src; if(size > 0) { /* pack the data into the supplied buffer */ iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval + - sizeof(mca_pml_ob1_match_hdr_t)); + OMPI_PML_OB1_MATCH_HDR_LEN); iov.iov_len = size; iov_count = 1; /* * Before copy the user buffer, make the target part - * accessable. + * accessible. 
*/ MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_defined, @@ -471,31 +562,31 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq, hdr->hdr_match.hdr_seq = (uint16_t)sendreq->req_send.req_base.req_sequence; ob1_hdr_hton(hdr, MCA_PML_OB1_HDR_TYPE_MATCH, - sendreq->req_send.req_base.req_proc); + sendreq->req_send.req_base.req_proc); /* update lengths */ - segment->seg_len = sizeof(mca_pml_ob1_match_hdr_t) + max_data; + segment->seg_len = OMPI_PML_OB1_MATCH_HDR_LEN + max_data; /* short message */ - descriptor->des_cbdata = sendreq; - descriptor->des_cbfunc = mca_pml_ob1_match_completion_free; + des->des_cbdata = sendreq; + des->des_cbfunc = mca_pml_ob1_match_completion_free; /* send */ - rc = mca_bml_base_send_status(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_MATCH); + rc = mca_bml_base_send_status(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_ob1_match_completion_free_request( bml_btl, sendreq ); + } + return OMPI_SUCCESS; + } switch(rc) { - case OMPI_SUCCESS: - /* packet is on wire; signal request completion */ - OPAL_THREAD_LOCK(&ompi_request_lock); - MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq); - OPAL_THREAD_UNLOCK(&ompi_request_lock); - break; case OMPI_ERR_RESOURCE_BUSY: /* don't signal request completion; will be completed in wait() */ rc = OMPI_SUCCESS; break; default: - mca_bml_base_free(bml_btl, descriptor); + mca_bml_base_free(bml_btl, des); break; } return rc; @@ -510,7 +601,7 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size ) { - mca_btl_base_descriptor_t* descriptor; + mca_btl_base_descriptor_t* des; mca_btl_base_segment_t* segment; mca_pml_ob1_hdr_t* hdr; int rc; @@ -520,14 +611,14 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, NULL, &sendreq->req_send.req_base.req_convertor, MCA_BTL_NO_ORDER, - sizeof(mca_pml_ob1_match_hdr_t), + OMPI_PML_OB1_MATCH_HDR_LEN, &size, MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, - &descriptor ); - if( OPAL_UNLIKELY(NULL == descriptor) ) { + &des ); + if( OPAL_UNLIKELY(NULL == des) ) { return OMPI_ERR_OUT_OF_RESOURCE; } - segment = descriptor->des_src; + segment = des->des_src; /* build match header */ hdr = (mca_pml_ob1_hdr_t*)segment->seg_addr.pval; @@ -542,15 +633,18 @@ int mca_pml_ob1_send_request_start_prepare( mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_base.req_proc); /* short message */ - descriptor->des_cbfunc = mca_pml_ob1_match_completion_free; - - descriptor->des_cbdata = sendreq; + des->des_cbfunc = mca_pml_ob1_match_completion_free; + des->des_cbdata = sendreq; /* send */ - rc = mca_bml_base_send(bml_btl, descriptor, MCA_PML_OB1_HDR_TYPE_MATCH); - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_bml_base_free(bml_btl, descriptor ); + rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_ob1_match_completion_free_request( bml_btl, sendreq ); + } + return OMPI_SUCCESS; } + mca_bml_base_free(bml_btl, des ); return rc; } @@ -706,10 +800,14 @@ int mca_pml_ob1_send_request_start_rdma( des->des_cbdata = sendreq; /* send */ - rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV); - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_bml_base_free(bml_btl, des); + rc = mca_bml_base_send(bml_btl, des, hdr->hdr_common.hdr_type); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + 
mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, 0 ); + } + return OMPI_SUCCESS; } + mca_bml_base_free(bml_btl, des); return rc; } @@ -787,9 +885,13 @@ int mca_pml_ob1_send_request_start_rndv( mca_pml_ob1_send_request_t* sendreq, /* send */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_RNDV); - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_bml_base_free(bml_btl, des ); + if( OPAL_LIKELY( rc >= 0 ) ) { + if( OPAL_LIKELY( 1 == rc ) ) { + mca_pml_ob1_rndv_completion_request( bml_btl, sendreq, size ); + } + return OMPI_SUCCESS; } + mca_bml_base_free(bml_btl, des ); return rc; } @@ -958,7 +1060,7 @@ cannot_pack: &sendreq->req_send.req_base.req_convertor, MCA_BTL_NO_ORDER, sizeof(mca_pml_ob1_frag_hdr_t), - &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP, &des); + &size, MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK, &des); MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_noaccess, sendreq->req_send.req_base.req_addr, @@ -998,8 +1100,7 @@ cannot_pack: /* initiate send - note that this may complete before the call returns */ rc = mca_bml_base_send(bml_btl, des, MCA_PML_OB1_HDR_TYPE_FRAG); - - if( OPAL_LIKELY(rc == OMPI_SUCCESS) ) { + if( OPAL_LIKELY(rc >= 0) ) { /* update state */ range->range_btls[btl_idx].length -= size; range->range_send_length -= size; @@ -1011,7 +1112,6 @@ cannot_pack: } } else { mca_bml_base_free(bml_btl,des); - continue; } } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h index d06b69ed78..58df3a035f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h @@ -180,19 +180,19 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s * Mark a send request as completed at the MPI level. */ -#define MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq) \ -do { \ - (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \ - (sendreq)->req_send.req_base.req_comm->c_my_rank; \ - (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \ - (sendreq)->req_send.req_base.req_tag; \ - (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \ - (sendreq)->req_send.req_base.req_ompi.req_status._count = \ - (int)(sendreq)->req_send.req_bytes_packed; \ - ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi) ); \ - \ - PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \ - &(sendreq->req_send.req_base), PERUSE_SEND); \ +#define MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, with_signal) \ +do { \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_SOURCE = \ + (sendreq)->req_send.req_base.req_comm->c_my_rank; \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_TAG = \ + (sendreq)->req_send.req_base.req_tag; \ + (sendreq)->req_send.req_base.req_ompi.req_status.MPI_ERROR = OMPI_SUCCESS; \ + (sendreq)->req_send.req_base.req_ompi.req_status._count = \ + (int)(sendreq)->req_send.req_bytes_packed; \ + ompi_request_complete( &((sendreq)->req_send.req_base.req_ompi), (with_signal) ); \ + \ + PERUSE_TRACE_COMM_EVENT( PERUSE_COMM_REQ_COMPLETE, \ + &(sendreq->req_send.req_base), PERUSE_SEND); \ } while(0) /* @@ -237,7 +237,7 @@ send_request_pml_complete(mca_pml_ob1_send_request_t *sendreq) OPAL_THREAD_LOCK(&ompi_request_lock); if(false == sendreq->req_send.req_base.req_ompi.req_complete) { /* Should only be called for long messages (maybe synchronous) */ - MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq); + MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, true); } sendreq->req_send.req_base.req_pml_complete = true; @@ -459,5 +459,4 @@ 
void mca_pml_ob1_send_request_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
 
 END_C_DECLS
 
-#endif  /* !defined(OMPI_PML_OB1_SEND_REQUEST_H) */
-
+#endif  /* OMPI_PML_OB1_SEND_REQUEST_H */
diff --git a/ompi/request/grequest.c b/ompi/request/grequest.c
index 703a95ab94..7bd237f650 100644
--- a/ompi/request/grequest.c
+++ b/ompi/request/grequest.c
@@ -180,7 +180,7 @@ int ompi_grequest_complete(ompi_request_t *req)
     int rc;
 
     OPAL_THREAD_LOCK(&ompi_request_lock);
-    rc = ompi_request_complete(req);
+    rc = ompi_request_complete(req, true);
     OPAL_THREAD_UNLOCK(&ompi_request_lock);
     OBJ_RELEASE(req);
     return rc;
 }
diff --git a/ompi/request/request.h b/ompi/request/request.h
index 42f54fd1cb..4805118c9b 100644
--- a/ompi/request/request.h
+++ b/ompi/request/request.h
@@ -386,19 +386,22 @@ static inline void ompi_request_wait_completion(ompi_request_t *req)
 }
 
 /**
- * Signal a request as complete. Note this will
- * wake any thread pending on the request.
- * ompi_request_lock should be held while calling this function
+ * Signal or mark a request as complete. If with_signal is true this will
+ * wake any thread pending on the request, and ompi_request_lock must be
+ * held while calling this function. If with_signal is false, no signal is
+ * generated and no lock is required. That case is reserved for the
+ * critical path for small messages, where we know the current execution
+ * flow created the request and is still inside the _START macro, so no
+ * other thread can yet be waiting on it.
  */
-
-static inline int ompi_request_complete(ompi_request_t* request)
+static inline int ompi_request_complete(ompi_request_t* request, bool with_signal)
 {
     if( NULL != request->req_complete_cb ) {
         request->req_complete_cb( request );
     }
     ompi_request_completed++;
     request->req_complete = true;
-    if(ompi_request_waiting) {
+    if(with_signal && ompi_request_waiting) {
         /* Broadcast the condition, otherwise if there is already a thread
          * waiting on another request it can use all signals. */
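A sketch of the two call patterns the new with_signal argument is meant to support (illustrative only; req stands for any ompi_request_t the caller owns):

    /* General case: another thread may be blocked in a wait/test, so hold
     * the global request lock and let the condition be broadcast. */
    OPAL_THREAD_LOCK(&ompi_request_lock);
    ompi_request_complete(req, true);
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    /* Small-message fast path: the request was just created by the current
     * execution flow (still inside the _START macro), so no other thread
     * can be waiting on it; skip both the lock and the signal. */
    ompi_request_complete(req, false);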