diff --git a/ompi/mca/btl/usnic/btl_usnic_ack.c b/ompi/mca/btl/usnic/btl_usnic_ack.c index 43b81c38cb..307c76ca32 100644 --- a/ompi/mca/btl/usnic/btl_usnic_ack.c +++ b/ompi/mca/btl/usnic/btl_usnic_ack.c @@ -134,20 +134,22 @@ ompi_btl_usnic_handle_ack( (void*)sseg, (void*)frag, bytes_acked, frag->sf_ack_bytes_left); #endif - /* perform completion callback for PUT here */ + /* If all ACKs received, and this is a put or a regular send + * that needs a callback, perform the callback now + */ if (frag->sf_ack_bytes_left == 0 && - frag->sf_base.uf_dst_seg[0].seg_addr.pval != NULL) { -#if MSGDEBUG1 - opal_output(0, "Calling back %p for PUT completion, frag=%p\n", - (void*)(uintptr_t)frag->sf_base.uf_base.des_cbfunc, (void*)frag); + ((frag->sf_base.uf_dst_seg[0].seg_addr.pval != NULL) || + (frag->sf_base.uf_base.des_flags & + MCA_BTL_DES_SEND_ALWAYS_CALLBACK))) { +#if MSGDEBUG2 + opal_output(0, "completion callback for frag=%p, dest=%p\n", + (void*)frag, frag->sf_base.uf_dst_seg[0].seg_addr.pval); #endif frag->sf_base.uf_base.des_cbfunc(&module->super, frag->sf_endpoint, &frag->sf_base.uf_base, OMPI_SUCCESS); + frag->sf_base.uf_base.des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK; } - /* OK to return this fragment? */ - ompi_btl_usnic_send_frag_return_cond(module, frag); - /* free this segment */ sseg->ss_ack_pending = false; if (sseg->ss_base.us_type == OMPI_BTL_USNIC_SEG_CHUNK && @@ -155,6 +157,9 @@ ompi_btl_usnic_handle_ack( ompi_btl_usnic_chunk_segment_return(module, sseg); } + /* OK to return this fragment? 
*/ + ompi_btl_usnic_send_frag_return_cond(module, frag); + /* indicate this segment has been ACKed */ endpoint->endpoint_sent_segs[WINDOW_SIZE_MOD(is)] = NULL; } diff --git a/ompi/mca/btl/usnic/btl_usnic_frag.h b/ompi/mca/btl/usnic/btl_usnic_frag.h index 91d08e40b0..b8f39f15b4 100644 --- a/ompi/mca/btl/usnic/btl_usnic_frag.h +++ b/ompi/mca/btl/usnic/btl_usnic_frag.h @@ -57,6 +57,19 @@ typedef enum { OMPI_BTL_USNIC_FRAG_PUT_DEST } ompi_btl_usnic_frag_type_t; +#if MSGDEBUG2 +static inline char * +usnic_frag_type(ompi_btl_usnic_frag_type_t t) +{ + switch (t) { + case OMPI_BTL_USNIC_FRAG_LARGE_SEND: return "large"; + case OMPI_BTL_USNIC_FRAG_SMALL_SEND: return "small"; + case OMPI_BTL_USNIC_FRAG_PUT_DEST: return "put dest"; + default: return "unknown"; + } +} +#endif + typedef enum { OMPI_BTL_USNIC_SEG_ACK, OMPI_BTL_USNIC_SEG_FRAG, @@ -64,6 +77,20 @@ typedef enum { OMPI_BTL_USNIC_SEG_RECV } ompi_btl_usnic_seg_type_t; +#if MSGDEBUG2 +static inline char * +usnic_seg_type(ompi_btl_usnic_seg_type_t t) +{ + switch (t) { + case OMPI_BTL_USNIC_SEG_ACK: return "ACK"; + case OMPI_BTL_USNIC_SEG_FRAG: return "FRAG"; + case OMPI_BTL_USNIC_SEG_CHUNK: return "CHUNK"; + case OMPI_BTL_USNIC_SEG_RECV: return "RECV"; + default: return "unknown"; + } +} +#endif + typedef struct ompi_btl_usnic_reg_t { mca_mpool_base_registration_t base; @@ -94,6 +121,7 @@ typedef enum { * holes. 
*/ typedef struct { + /* Hashed RTE process name of the sender */ uint64_t sender; @@ -112,8 +140,9 @@ typedef struct { /* Type of BTL header (see enum, above) */ uint8_t payload_type; - /* Yuck */ - uint8_t padding; + + /* tag for PML, etc */ + mca_btl_base_tag_t tag; } ompi_btl_usnic_btl_header_t; /** @@ -146,15 +175,6 @@ typedef enum { FRAG_MAX = 0xff } ompi_btl_usnic_frag_state_flags_t; - -/* - * Convenience macros for states - */ -#define FRAG_STATE_SET(frag, state) (frag)->state_flags |= (state) -#define FRAG_STATE_CLR(frag, state) (frag)->state_flags &= ~(state) -#define FRAG_STATE_GET(frag, state) ((frag)->state_flags & (state)) -#define FRAG_STATE_ISSET(frag, state) (((frag)->state_flags & (state)) != 0) - /** * Descriptor for a common segment. This is exactly one packet and may * be send or receive @@ -177,7 +197,7 @@ typedef struct ompi_btl_usnic_segment_t { union { uint8_t *raw; - mca_btl_base_header_t *pml_header; + void *pml_header; } us_payload; } ompi_btl_usnic_segment_t; @@ -270,6 +290,7 @@ typedef struct ompi_btl_usnic_large_send_frag_t { ompi_btl_usnic_send_frag_t lsf_base; char lsf_pml_header[64]; /* space for PML header */ + mca_btl_base_tag_t lsf_tag; /* save tag */ uint32_t lsf_frag_id; /* fragment ID for reassembly */ size_t lsf_cur_offset; /* current offset into message */ @@ -424,6 +445,10 @@ ompi_btl_usnic_frag_return( struct ompi_btl_usnic_module_t *module, ompi_btl_usnic_frag_t *frag) { +#if MSGDEBUG2 + opal_output(0, "freeing frag %p, type %s\n", (void *)frag, + usnic_frag_type(frag->uf_type)); +#endif OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super)); } diff --git a/ompi/mca/btl/usnic/btl_usnic_module.c b/ompi/mca/btl/usnic/btl_usnic_module.c index 9f9ca75a34..ffe9e8a19a 100644 --- a/ompi/mca/btl/usnic/btl_usnic_module.c +++ b/ompi/mca/btl/usnic/btl_usnic_module.c @@ -219,35 +219,23 @@ usnic_alloc(struct mca_btl_base_module_t* btl, { ompi_btl_usnic_send_frag_t *frag; ompi_btl_usnic_module_t *module = 
(ompi_btl_usnic_module_t*) btl; + ompi_btl_usnic_small_send_frag_t *sfrag; mca_btl_base_descriptor_t *desc; - /* will this fit into a small send? */ - if (size <= module->max_frag_payload) { - ompi_btl_usnic_small_send_frag_t *sfrag; + if (size > module->max_frag_payload) + size = module->max_frag_payload; - sfrag = ompi_btl_usnic_small_send_frag_alloc(module); - if (NULL == sfrag) { - return NULL; - } - frag = &sfrag->ssf_base; - } else { - ompi_btl_usnic_large_send_frag_t *lfrag; - - lfrag = ompi_btl_usnic_large_send_frag_alloc(module); - if (NULL == lfrag) { - return NULL; - } - frag = &lfrag->lsf_base; - - BTL_ERROR(("large frag in usnic_alloc()\n")); - abort(); /* XXX - we don't ever want to see this... */ + sfrag = ompi_btl_usnic_small_send_frag_alloc(module); + if (NULL == sfrag) { + return NULL; } + frag = &sfrag->ssf_base; #if MSGDEBUG2 - opal_output(0, "usnic_alloc: %s frag=%p, size=%d\n", + opal_output(0, "usnic_alloc: %s frag=%p, size=%d, flags=0x%x\n", (size <= module->max_frag_payload)?"small":"large", - (void *)frag, (int)size); + (void *)frag, (int)size, flags); #endif /* set # of bytes remaining to be ACKed */ @@ -262,28 +250,34 @@ usnic_alloc(struct mca_btl_base_module_t* btl, /* set up descriptor */ desc = &frag->sf_base.uf_base; + desc->des_flags = flags; desc->des_src[0].seg_len = size; desc->des_src_cnt = 1; - desc->des_flags = flags; return desc; } /** - * Return a small send fragment + * Return an allocated fragment * * Return the send fragment to the appropriate list */ static int usnic_free(struct mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des) { - ompi_btl_usnic_frag_t* frag = (ompi_btl_usnic_frag_t*)des; + ompi_btl_usnic_send_frag_t* frag = (ompi_btl_usnic_send_frag_t*)des; -#if MSGDEBUG1 +#if MSGDEBUG2 opal_output(0, "usnic_free: %p\n", (void*)frag); #endif + +#if 1 /* seperate commit for seperate bug */ OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super)); +#else + ompi_btl_usnic_send_frag_return_cond((struct 
ompi_btl_usnic_module_t *)btl, + frag); +#endif return OMPI_SUCCESS; } @@ -338,6 +332,7 @@ usnic_prepare_src( uint32_t flags) { ompi_btl_usnic_module_t *module = (ompi_btl_usnic_module_t*) base_module; + mca_btl_base_descriptor_t *desc; ompi_btl_usnic_send_frag_t *frag; uint32_t payload_len; struct iovec iov; @@ -498,18 +493,22 @@ usnic_prepare_src( /* fragment accounting */ frag->sf_ack_bytes_left = payload_len; + desc = &frag->sf_base.uf_base; #if MSGDEBUG2 - opal_output(0, "prep_src: %s %s frag %p, size=%d+%d, src=%p\n", + opal_output(0, "prep_src: %s %s frag %p, size=%d+%d\n", module->device->name, payload_len <= module->max_frag_payload?"small":"large", - (void *)frag, (int)reserve, (int)*size, - frag->sf_base.uf_base.des_src[0].seg_addr.pval); - opal_output(0, " data_ptr = %p, conv=%p\n", - data_ptr, (void *)frag->sf_convertor); + (void *)frag, (int)reserve, (int)*size); + { int i; + for (i=0; i < desc->des_src_cnt; ++i) + opal_output(0, " %d: ptr:%p len:%d\n", i, + desc->des_src[i].seg_addr.pval, + desc->des_src[i].seg_len); + } #endif - return &frag->sf_base.uf_base; + return desc; } static mca_btl_base_descriptor_t* @@ -571,12 +570,20 @@ usnic_put( frag = (ompi_btl_usnic_send_frag_t *)des; #if MSGDEBUG2 - opal_output(0, "usnic_put: %"PRIu64" bytes to %p\n", - des->des_dst->seg_len, - des->des_dst->seg_addr.pval); - opal_output(0, " des_dst=%p, frag->uf_dst_seg=%p\n", - (void *)des->des_dst, - (void *)frag->sf_base.uf_dst_seg); + opal_output(0, "usnic_put, frag=%p, source=\n", frag); + { int i; + for (i=0; i < des->des_src_cnt; ++i) + opal_output(0, " %d: ptr:%p len:%d\n", i, + des->des_src[i].seg_addr.pval, + des->des_src[i].seg_len); + } + opal_output(0, "dest:\n"); + { int i; + for (i=0; i < des->des_dst_cnt; ++i) + opal_output(0, " %d: ptr:%p len:%d\n", i, + des->des_dst[i].seg_addr.pval, + des->des_dst[i].seg_len); + } #endif /* copy out address - why does he not use ours? 
silly PML */ @@ -1014,6 +1021,7 @@ usnic_handle_large_send( chp->ch_frag_id = lfrag->lsf_frag_id; chp->ch_frag_size = lfrag->lsf_base.sf_size; chp->ch_frag_offset = lfrag->lsf_cur_offset; + chp->ch_hdr.tag = lfrag->lsf_tag; /* set actual packet length for verbs */ sseg->ss_base.us_sg_entry[0].length = @@ -1036,24 +1044,29 @@ usnic_handle_large_send( /* done with fragment? */ if (lfrag->lsf_bytes_left == 0) { - /* only callback now if this was not a PUT, otherwise - * we need to wait until last byte is ACKed + /* remove this frag from sending list now because PML may + * decide to put it on some other list in the callback */ - if (frag->sf_base.uf_dst_seg[0].seg_addr.pval == NULL) { + opal_list_remove_item(&endpoint->endpoint_frag_send_queue, + &frag->sf_base.uf_base.super.super); -#if MSGDEBUG1 - opal_output(0, " calling back %p, len=%zd\n", - (void*)(uintptr_t)frag->sf_base.uf_base.des_cbfunc, - frag->sf_size); + /* only callback now if this was not a PUT and we own the fragment, + * otherwise we need to wait until last byte is ACKed + */ + if (frag->sf_base.uf_dst_seg[0].seg_addr.pval == NULL && + (frag->sf_base.uf_base.des_flags & + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { + +#if MSGDEBUG2 + opal_output(0, "callback for large frag %p, len=%zd\n", + (void *)frag->sf_base.uf_base.des_cbfunc, frag->sf_size); #endif frag->sf_base.uf_base.des_cbfunc(&module->super, frag->sf_endpoint, &frag->sf_base.uf_base, OMPI_SUCCESS); ++module->pml_send_callbacks; + frag->sf_base.uf_base.des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK; } - - opal_list_remove_item(&endpoint->endpoint_frag_send_queue, - &frag->sf_base.uf_base.super.super); } } @@ -1113,6 +1126,13 @@ ompi_btl_usnic_module_progress_sends( * Send ptr and length will be in uf_src_seg[0] */ if (frag->sf_base.uf_type == OMPI_BTL_USNIC_FRAG_SMALL_SEND) { + + /* remove this frag from sending list now because PML may + * decide to put it on some other list in the callback + */ + 
opal_list_remove_item(&endpoint->endpoint_frag_send_queue, + &frag->sf_base.uf_base.super.super); + sfrag = (ompi_btl_usnic_small_send_frag_t *)frag; sseg = &sfrag->ssf_segment; @@ -1121,9 +1141,12 @@ ompi_btl_usnic_module_progress_sends( sseg->ss_base.us_btl_header->payload_len = payload_len; #if MSGDEBUG1 - opal_output(0, "send small, ptr=%"PRIu64", payload=%zd, len=%"PRIu32"\n", - sseg->ss_base.us_sg_entry[0].addr, payload_len, - sseg->ss_base.us_sg_entry[0].length); + opal_output(0, "progress send small, frag=%p, ptr=%p, payload=%zd, len=%"PRIu32", ep=%p, tag=%d\n", + (void *)frag, + (void *)sseg->ss_base.us_sg_entry[0].addr, payload_len, + sseg->ss_base.us_sg_entry[0].length, + (void *)frag->sf_endpoint, + sseg->ss_base.us_btl_header->tag); #endif /* post the send */ @@ -1131,23 +1154,30 @@ ompi_btl_usnic_module_progress_sends( /* don't do callback yet if this is a put */ if (frag->sf_base.uf_dst_seg[0].seg_addr.pval == NULL) { -#if MSGDEBUG1 - opal_output(0, " calling back %p, len=%"PRIu64"\n", - (void*)(uintptr_t)frag->sf_base.uf_base.des_cbfunc, - frag->sf_base.uf_src_seg[0].seg_len); + /* we have copied the data, perform a callback if + * we own the fragment and callback is requested. + * If we don't own the fragment, we cannot callback yet + * because we are not done with the segment inside. + * (ACK not received yet) + */ + if ((frag->sf_base.uf_base.des_flags & + (MCA_BTL_DES_SEND_ALWAYS_CALLBACK | + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) == + (MCA_BTL_DES_SEND_ALWAYS_CALLBACK | + MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { +#if MSGDEBUG2 + opal_output(0, "callback frag small %p, len=%"PRIu64"\n", + (void*)frag, frag->sf_base.uf_src_seg[0].seg_len); #endif - /* we have copied the data, proceed with callback */ - /* could be done in usnic_send? 
XXX */ - frag->sf_base.uf_base.des_cbfunc(&module->super, - frag->sf_endpoint, &frag->sf_base.uf_base, - OMPI_SUCCESS); - ++module->pml_send_callbacks; + frag->sf_base.uf_base.des_cbfunc(&module->super, + frag->sf_endpoint, &frag->sf_base.uf_base, + OMPI_SUCCESS); + ++module->pml_send_callbacks; + frag->sf_base.uf_base.des_flags &= + ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + } } - /* remove frag from sending list */ - opal_list_remove_item(&endpoint->endpoint_frag_send_queue, - &sfrag->ssf_base.sf_base.uf_base.super.super); - /* Large sends... */ } else { usnic_handle_large_send(module, endpoint, frag); @@ -1189,6 +1219,21 @@ ompi_btl_usnic_module_progress_sends( /* * Initiate a send. + * + * Send completion callbacks can be done from a few different places. + * + * If this is a send from a fragment we do not own, we always have + * to wait for the last ACK of the fragment, because we cannot allow + * the fragment to be re-used until we know we have no more retransmits to do. + * + * If this is a send from a fragment we own, and we know we have copied the + * data from the user's buffer, we can perform the callback immediately + * (or possibly not at all, simply returning "1" to indicate completion. 
+ * + * If this is a send from a fragment we own and we have not yet copied out + * all the data (as is the case in a large send) then we defer the callback + * until the last of the data has been copied out by routines called + * from ompi_btl_usnic_progress_sends() */ static int usnic_send(struct mca_btl_base_module_t* base_module, struct mca_btl_base_endpoint_t* base_endpoint, @@ -1210,7 +1255,12 @@ static int usnic_send(struct mca_btl_base_module_t* base_module, opal_output(0, "usnic_send: frag=%p, endpoint=%p, tag=%d, sendreq=%p\n", (void *)frag, (void *)endpoint, tag, (void *)descriptor->des_cbdata); - opal_output(0, " data = %p\n", descriptor->des_src[0].seg_addr.pval); + { int i; + for (i=0; i < descriptor->des_src_cnt; ++i) + opal_output(0, " %d: ptr:%p len:%d\n", i, + descriptor->des_src[i].seg_addr.pval, + descriptor->des_src[i].seg_len); + } #endif assert(frag->sf_endpoint == endpoint); @@ -1257,6 +1307,7 @@ static int usnic_send(struct mca_btl_base_module_t* base_module, sseg->ss_send_desc.send_flags |= IBV_SEND_INLINE; sseg->ss_channel = USNIC_PRIORITY_CHANNEL; + sseg->ss_base.us_btl_header->tag = tag; #if MSGDEBUG2 opal_output(0, "conv = %p\n", frag->sf_convertor); opal_output(0, " inline frag %d segs %p(%d) + %p(%d)\n", @@ -1292,14 +1343,30 @@ static int usnic_send(struct mca_btl_base_module_t* base_module, sizeof(ompi_btl_usnic_btl_header_t) + frag->sf_size; } - /* If requested, callback now, else just return 1 to show completion */ - if (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { - frag->sf_base.uf_base.des_cbfunc(&module->super, - frag->sf_endpoint, &frag->sf_base.uf_base, - OMPI_SUCCESS); - rc = 0; + /* If we own the frag and callback was requested, callback now, + * else just return 1 to show completion. 
+ * If we don't own the frag, need to wait for ACK before + * performing callback on the frag + */ + if (descriptor->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) { + if (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { +#if MSGDEBUG2 + opal_output(0, "immediate callback for frag %p\n", (void *)frag); +#endif + frag->sf_base.uf_base.des_cbfunc(&module->super, + frag->sf_endpoint, &frag->sf_base.uf_base, + OMPI_SUCCESS); + rc = 0; + descriptor->des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + } else { +#if MSGDEBUG2 + opal_output(0, "skipping callback for frag %p\n", (void *)frag); +#endif + rc = 1; + } } else { - rc = 1; + descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + rc = 0; } ++module->pml_module_sends; ++module->pml_send_callbacks; /* returning "1" is an implicit CB */ diff --git a/ompi/mca/btl/usnic/btl_usnic_recv.c b/ompi/mca/btl/usnic/btl_usnic_recv.c index 40a7f9b4cf..1e2d477507 100644 --- a/ompi/mca/btl/usnic/btl_usnic_recv.c +++ b/ompi/mca/btl/usnic/btl_usnic_recv.c @@ -56,6 +56,7 @@ void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module, mca_btl_active_message_callback_t* reg; ompi_btl_usnic_endpoint_t *endpoint; ompi_btl_usnic_btl_chunk_header_t *chunk_hdr; + ompi_btl_usnic_btl_header_t *hdr; uint32_t window_index; int rc; #if MSGDEBUG1 @@ -110,11 +111,11 @@ void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module, return; } + hdr = seg->rs_base.us_btl_header; + #if MSGDEBUG1 opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ ", len=%d\n", - (void*) endpoint, - seg->rs_base.us_btl_header->seq, - seg->rs_base.us_btl_header->payload_len); + (void*) endpoint, hdr->seq, hdr->payload_len); #if 0 opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ " from %s to %s: GOOD! 
(rel seq %d, lowest seq %" UDSEQ ", highest seq: %" UDSEQ ", rwstart %d) seg %p, module %p\n", @@ -126,7 +127,7 @@ void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module, endpoint->endpoint_highest_seq_rcvd, endpoint->endpoint_rfstart, (void*) seg, (void*) module); - if (seg->rs_base.us_btl_header->put_addr != NULL) { + if (hdr->put_addr != NULL) { opal_output(0, " put_addr = %p\n", seg->rs_base.us_btl_header->put_addr); } @@ -139,12 +140,10 @@ void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module, * the frame length to meet minimum sizes, add protocol information, * etc. */ - if (seg->rs_base.us_btl_header->put_addr == NULL) { - reg = mca_btl_base_active_message_trigger + - bseg->us_payload.pml_header->tag; - seg->rs_segment.seg_len = bseg->us_btl_header->payload_len; - reg->cbfunc(&module->super, bseg->us_payload.pml_header->tag, - &seg->rs_desc, reg->cbdata); + if (hdr->put_addr == NULL) { + reg = mca_btl_base_active_message_trigger + hdr->tag; + seg->rs_segment.seg_len = hdr->payload_len; + reg->cbfunc(&module->super, hdr->tag, &seg->rs_desc, reg->cbdata); /* * If this is a PUT, need to copy it to user buffer @@ -269,15 +268,10 @@ opal_output(0, "Start PUT to %p\n", chunk_hdr->ch_hdr.put_addr); fip->rfi_bytes_left -= chunk_hdr->ch_hdr.payload_len; if (0 == fip->rfi_bytes_left) { - mca_btl_base_header_t *pml_header; mca_btl_base_descriptor_t desc; mca_btl_base_segment_t segment; - /* Get access to PML header in assembled fragment so we - * can pull out the tag - */ - pml_header = (mca_btl_base_header_t *)(fip->rfi_data); - segment.seg_addr.pval = pml_header; + segment.seg_addr.pval = fip->rfi_data; segment.seg_len = fip->rfi_frag_size; desc.des_dst = &segment; desc.des_dst_cnt = 1; @@ -289,12 +283,13 @@ opal_output(0, "Start PUT to %p\n", chunk_hdr->ch_hdr.put_addr); #if MSGDEBUG2 opal_output(0, " large FRAG complete, pass up %p, %"PRIu64" bytes, tag=%d\n", desc.des_dst->seg_addr.pval, desc.des_dst->seg_len, - pml_header->tag); + 
chunk_hdr->ch_hdr.tag); #endif - reg = mca_btl_base_active_message_trigger + pml_header->tag; + reg = mca_btl_base_active_message_trigger + + chunk_hdr->ch_hdr.tag; /* mca_pml_ob1_recv_frag_callback_frag() */ - reg->cbfunc(&module->super, pml_header->tag, + reg->cbfunc(&module->super, chunk_hdr->ch_hdr.tag, &desc, reg->cbdata); /* free temp buffer for non-put */ diff --git a/ompi/mca/btl/usnic/btl_usnic_recv.h b/ompi/mca/btl/usnic/btl_usnic_recv.h index 950fd19de6..c115955d17 100644 --- a/ompi/mca/btl/usnic/btl_usnic_recv.h +++ b/ompi/mca/btl/usnic/btl_usnic_recv.h @@ -283,10 +283,9 @@ ompi_btl_usnic_recv_fast(ompi_btl_usnic_module_t *module, * the frame length to meet minimum sizes, add protocol information, * etc. */ - reg = mca_btl_base_active_message_trigger + - bseg->us_payload.pml_header->tag; + reg = mca_btl_base_active_message_trigger + bseg->us_btl_header->tag; seg->rs_segment.seg_len = bseg->us_btl_header->payload_len; - reg->cbfunc(&module->super, bseg->us_payload.pml_header->tag, + reg->cbfunc(&module->super, bseg->us_btl_header->tag, &seg->rs_desc, reg->cbdata); drop: @@ -384,10 +383,9 @@ ompi_btl_usnic_recv(ompi_btl_usnic_module_t *module, * the frame length to meet minimum sizes, add protocol information, * etc. 
*/ - reg = mca_btl_base_active_message_trigger + - bseg->us_payload.pml_header->tag; + reg = mca_btl_base_active_message_trigger + bseg->us_btl_header->tag; seg->rs_segment.seg_len = bseg->us_btl_header->payload_len; - reg->cbfunc(&module->super, bseg->us_payload.pml_header->tag, + reg->cbfunc(&module->super, bseg->us_btl_header->tag, &seg->rs_desc, reg->cbdata); } else { diff --git a/ompi/mca/btl/usnic/btl_usnic_send.c b/ompi/mca/btl/usnic/btl_usnic_send.c index 8e51e24aee..9cc8c6141c 100644 --- a/ompi/mca/btl/usnic/btl_usnic_send.c +++ b/ompi/mca/btl/usnic/btl_usnic_send.c @@ -152,6 +152,7 @@ ompi_btl_usnic_send_slower( /* use standard channel */ sseg->ss_channel = USNIC_DATA_CHANNEL; + sseg->ss_base.us_btl_header->tag = tag; #if MSGDEBUG2 opal_output(0, " small frag %d segs %p(%d) + %p(%d)\n", (int)frag->sf_base.uf_base.des_src_cnt, @@ -166,6 +167,12 @@ ompi_btl_usnic_send_slower( (void *)sseg->ss_send_desc.sg_list[1].addr, sseg->ss_send_desc.sg_list[1].length); #endif + } else { + ompi_btl_usnic_large_send_frag_t *lfrag; + + lfrag = (ompi_btl_usnic_large_send_frag_t *)frag; + + lfrag->lsf_tag = tag; } /* queue this fragment into the send engine */ diff --git a/ompi/mca/btl/usnic/btl_usnic_send.h b/ompi/mca/btl/usnic/btl_usnic_send.h index 586305bf2a..a6b9fdff94 100644 --- a/ompi/mca/btl/usnic/btl_usnic_send.h +++ b/ompi/mca/btl/usnic/btl_usnic_send.h @@ -53,6 +53,21 @@ ompi_btl_usnic_check_rts( } } +#if MSGDEBUG2 +static inline +int sge_total(struct ibv_send_wr *wr) +{ + int i; + int len; + len=0; + for (i=0; i < wr->num_sge; ++i) { + len += wr->sg_list[i].length; + } + + return len; +} +#endif + /* * Common point for posting a segment to VERBS */ @@ -68,10 +83,11 @@ ompi_btl_usnic_post_segment( int ret; #if MSGDEBUG1 - opal_output(0, "post_send: type=%d, addr=%p, len=%d\n", - sseg->ss_base.us_type, + opal_output(0, "post_send: type=%s, addr=%p, len=%d, payload=%d\n", + usnic_seg_type(sseg->ss_base.us_type), (void*) sseg->ss_send_desc.sg_list->addr, - 
sseg->ss_send_desc.sg_list->length); + sge_total(&sseg->ss_send_desc), + sseg->ss_base.us_btl_header->payload_len); /*ompi_btl_usnic_dump_hex((void *)(sseg->ss_send_desc.sg_list->addr + sizeof(ompi_btl_usnic_btl_header_t)), 16); */ #endif @@ -215,8 +231,9 @@ ompi_btl_usnic_endpoint_enqueue_frag( ompi_btl_usnic_send_frag_t *frag) { #if MSGDEBUG1 - opal_output(0, "enq_frag: frag=%p, endpoint=%p, type=%d, len=%"PRIu64"\n", - (void*)frag, (void*)endpoint, frag->sf_base.uf_type, + opal_output(0, "enq_frag: frag=%p, endpoint=%p, %s, len=%"PRIu64"\n", + (void*)frag, (void*)endpoint, + usnic_frag_type(frag->sf_base.uf_type), frag->sf_base.uf_base.des_src->seg_len); if (frag->sf_base.uf_type == OMPI_BTL_USNIC_FRAG_LARGE_SEND) { ompi_btl_usnic_large_send_frag_t *lfrag;