1
1

Cisco CSCuj12520: various problems running c_fence_put_1

- tag needs to be sent in *our* header, not the PML header
- usnic_alloc() should return smaller value if too much data requested
- be careful about callbacks vs removing items from lists
  (we need to remove from our lists *before* the callback)
- improve send callback handling
- add some more MSGDEBUG2 logging and cleanup

This commit was SVN r29181.
Этот коммит содержится в:
Reese Faucette 2013-09-17 07:20:44 +00:00
родитель 2245ac0e7e
Коммит 89b5f0899b
7 изменённых файлов: 236 добавлений и 122 удалений

Просмотреть файл

@ -134,20 +134,22 @@ ompi_btl_usnic_handle_ack(
(void*)sseg, (void*)frag, bytes_acked, frag->sf_ack_bytes_left); (void*)sseg, (void*)frag, bytes_acked, frag->sf_ack_bytes_left);
#endif #endif
/* perform completion callback for PUT here */ /* If all ACKs received, and this is a put or a regular send
* that needs a callback, perform the callback now
*/
if (frag->sf_ack_bytes_left == 0 && if (frag->sf_ack_bytes_left == 0 &&
frag->sf_base.uf_dst_seg[0].seg_addr.pval != NULL) { ((frag->sf_base.uf_dst_seg[0].seg_addr.pval != NULL) ||
#if MSGDEBUG1 (frag->sf_base.uf_base.des_flags &
opal_output(0, "Calling back %p for PUT completion, frag=%p\n", MCA_BTL_DES_SEND_ALWAYS_CALLBACK))) {
(void*)(uintptr_t)frag->sf_base.uf_base.des_cbfunc, (void*)frag); #if MSGDEBUG2
opal_output(0, "completion callback for frag=%p, dest=%p\n",
(void*)frag, frag->sf_base.uf_dst_seg[0].seg_addr.pval);
#endif #endif
frag->sf_base.uf_base.des_cbfunc(&module->super, frag->sf_endpoint, frag->sf_base.uf_base.des_cbfunc(&module->super, frag->sf_endpoint,
&frag->sf_base.uf_base, OMPI_SUCCESS); &frag->sf_base.uf_base, OMPI_SUCCESS);
frag->sf_base.uf_base.des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
} }
/* OK to return this fragment? */
ompi_btl_usnic_send_frag_return_cond(module, frag);
/* free this segment */ /* free this segment */
sseg->ss_ack_pending = false; sseg->ss_ack_pending = false;
if (sseg->ss_base.us_type == OMPI_BTL_USNIC_SEG_CHUNK && if (sseg->ss_base.us_type == OMPI_BTL_USNIC_SEG_CHUNK &&
@ -155,6 +157,9 @@ ompi_btl_usnic_handle_ack(
ompi_btl_usnic_chunk_segment_return(module, sseg); ompi_btl_usnic_chunk_segment_return(module, sseg);
} }
/* OK to return this fragment? */
ompi_btl_usnic_send_frag_return_cond(module, frag);
/* indicate this segment has been ACKed */ /* indicate this segment has been ACKed */
endpoint->endpoint_sent_segs[WINDOW_SIZE_MOD(is)] = NULL; endpoint->endpoint_sent_segs[WINDOW_SIZE_MOD(is)] = NULL;
} }

Просмотреть файл

@ -57,6 +57,19 @@ typedef enum {
OMPI_BTL_USNIC_FRAG_PUT_DEST OMPI_BTL_USNIC_FRAG_PUT_DEST
} ompi_btl_usnic_frag_type_t; } ompi_btl_usnic_frag_type_t;
#if MSGDEBUG2
/*
 * Debug helper: translate a fragment type into a short, printable name
 * for log messages.  Any value that is not one of the fragment types
 * handled below yields "unknown".
 */
static inline char *
usnic_frag_type(ompi_btl_usnic_frag_type_t t)
{
    char *name;

    if (OMPI_BTL_USNIC_FRAG_LARGE_SEND == t) {
        name = "large";
    } else if (OMPI_BTL_USNIC_FRAG_SMALL_SEND == t) {
        name = "small";
    } else if (OMPI_BTL_USNIC_FRAG_PUT_DEST == t) {
        name = "put dest";
    } else {
        /* not a fragment type this helper knows about */
        name = "unknown";
    }

    return name;
}
#endif
typedef enum { typedef enum {
OMPI_BTL_USNIC_SEG_ACK, OMPI_BTL_USNIC_SEG_ACK,
OMPI_BTL_USNIC_SEG_FRAG, OMPI_BTL_USNIC_SEG_FRAG,
@ -64,6 +77,20 @@ typedef enum {
OMPI_BTL_USNIC_SEG_RECV OMPI_BTL_USNIC_SEG_RECV
} ompi_btl_usnic_seg_type_t; } ompi_btl_usnic_seg_type_t;
#if MSGDEBUG2
/*
 * Debug helper: translate a segment type into a short, printable name
 * for log messages.  Any value that is not one of the segment types
 * handled below yields "unknown".
 */
static inline char *
usnic_seg_type(ompi_btl_usnic_seg_type_t t)
{
    char *name;

    if (OMPI_BTL_USNIC_SEG_ACK == t) {
        name = "ACK";
    } else if (OMPI_BTL_USNIC_SEG_FRAG == t) {
        name = "FRAG";
    } else if (OMPI_BTL_USNIC_SEG_CHUNK == t) {
        name = "CHUNK";
    } else if (OMPI_BTL_USNIC_SEG_RECV == t) {
        name = "RECV";
    } else {
        /* not a segment type this helper knows about */
        name = "unknown";
    }

    return name;
}
#endif
typedef struct ompi_btl_usnic_reg_t { typedef struct ompi_btl_usnic_reg_t {
mca_mpool_base_registration_t base; mca_mpool_base_registration_t base;
@ -94,6 +121,7 @@ typedef enum {
* holes. * holes.
*/ */
typedef struct { typedef struct {
/* Hashed RTE process name of the sender */ /* Hashed RTE process name of the sender */
uint64_t sender; uint64_t sender;
@ -112,8 +140,9 @@ typedef struct {
/* Type of BTL header (see enum, above) */ /* Type of BTL header (see enum, above) */
uint8_t payload_type; uint8_t payload_type;
/* Yuck */
uint8_t padding; /* tag for PML, etc */
mca_btl_base_tag_t tag;
} ompi_btl_usnic_btl_header_t; } ompi_btl_usnic_btl_header_t;
/** /**
@ -146,15 +175,6 @@ typedef enum {
FRAG_MAX = 0xff FRAG_MAX = 0xff
} ompi_btl_usnic_frag_state_flags_t; } ompi_btl_usnic_frag_state_flags_t;
/*
* Convenience macros for states
*/
#define FRAG_STATE_SET(frag, state) (frag)->state_flags |= (state)
#define FRAG_STATE_CLR(frag, state) (frag)->state_flags &= ~(state)
#define FRAG_STATE_GET(frag, state) ((frag)->state_flags & (state))
#define FRAG_STATE_ISSET(frag, state) (((frag)->state_flags & (state)) != 0)
/** /**
* Descriptor for a common segment. This is exactly one packet and may * Descriptor for a common segment. This is exactly one packet and may
* be send or receive * be send or receive
@ -177,7 +197,7 @@ typedef struct ompi_btl_usnic_segment_t {
union { union {
uint8_t *raw; uint8_t *raw;
mca_btl_base_header_t *pml_header; void *pml_header;
} us_payload; } us_payload;
} ompi_btl_usnic_segment_t; } ompi_btl_usnic_segment_t;
@ -270,6 +290,7 @@ typedef struct ompi_btl_usnic_large_send_frag_t {
ompi_btl_usnic_send_frag_t lsf_base; ompi_btl_usnic_send_frag_t lsf_base;
char lsf_pml_header[64]; /* space for PML header */ char lsf_pml_header[64]; /* space for PML header */
mca_btl_base_tag_t lsf_tag; /* save tag */
uint32_t lsf_frag_id; /* fragment ID for reassembly */ uint32_t lsf_frag_id; /* fragment ID for reassembly */
size_t lsf_cur_offset; /* current offset into message */ size_t lsf_cur_offset; /* current offset into message */
@ -424,6 +445,10 @@ ompi_btl_usnic_frag_return(
struct ompi_btl_usnic_module_t *module, struct ompi_btl_usnic_module_t *module,
ompi_btl_usnic_frag_t *frag) ompi_btl_usnic_frag_t *frag)
{ {
#if MSGDEBUG2
opal_output(0, "freeing frag %p, type %s\n", (void *)frag,
usnic_frag_type(frag->uf_type));
#endif
OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super)); OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super));
} }

Просмотреть файл

@ -219,35 +219,23 @@ usnic_alloc(struct mca_btl_base_module_t* btl,
{ {
ompi_btl_usnic_send_frag_t *frag; ompi_btl_usnic_send_frag_t *frag;
ompi_btl_usnic_module_t *module = (ompi_btl_usnic_module_t*) btl; ompi_btl_usnic_module_t *module = (ompi_btl_usnic_module_t*) btl;
ompi_btl_usnic_small_send_frag_t *sfrag;
mca_btl_base_descriptor_t *desc; mca_btl_base_descriptor_t *desc;
/* will this fit into a small send? */ if (size > module->max_frag_payload)
if (size <= module->max_frag_payload) { size = module->max_frag_payload;
ompi_btl_usnic_small_send_frag_t *sfrag;
sfrag = ompi_btl_usnic_small_send_frag_alloc(module);
if (NULL == sfrag) {
return NULL;
}
frag = &sfrag->ssf_base;
} else { sfrag = ompi_btl_usnic_small_send_frag_alloc(module);
ompi_btl_usnic_large_send_frag_t *lfrag; if (NULL == sfrag) {
return NULL;
lfrag = ompi_btl_usnic_large_send_frag_alloc(module);
if (NULL == lfrag) {
return NULL;
}
frag = &lfrag->lsf_base;
BTL_ERROR(("large frag in usnic_alloc()\n"));
abort(); /* XXX - we don't ever want to see this... */
} }
frag = &sfrag->ssf_base;
#if MSGDEBUG2 #if MSGDEBUG2
opal_output(0, "usnic_alloc: %s frag=%p, size=%d\n", opal_output(0, "usnic_alloc: %s frag=%p, size=%d, flags=0x%x\n",
(size <= module->max_frag_payload)?"small":"large", (size <= module->max_frag_payload)?"small":"large",
(void *)frag, (int)size); (void *)frag, (int)size, flags);
#endif #endif
/* set # of bytes remaining to be ACKed */ /* set # of bytes remaining to be ACKed */
@ -262,28 +250,34 @@ usnic_alloc(struct mca_btl_base_module_t* btl,
/* set up descriptor */ /* set up descriptor */
desc = &frag->sf_base.uf_base; desc = &frag->sf_base.uf_base;
desc->des_flags = flags;
desc->des_src[0].seg_len = size; desc->des_src[0].seg_len = size;
desc->des_src_cnt = 1; desc->des_src_cnt = 1;
desc->des_flags = flags;
return desc; return desc;
} }
/** /**
* Return a small send fragment * Return an allocated fragment
* *
* Return the send fragment to the appropriate list * Return the send fragment to the appropriate list
*/ */
static int usnic_free(struct mca_btl_base_module_t* btl, static int usnic_free(struct mca_btl_base_module_t* btl,
mca_btl_base_descriptor_t* des) mca_btl_base_descriptor_t* des)
{ {
ompi_btl_usnic_frag_t* frag = (ompi_btl_usnic_frag_t*)des; ompi_btl_usnic_send_frag_t* frag = (ompi_btl_usnic_send_frag_t*)des;
#if MSGDEBUG1 #if MSGDEBUG2
opal_output(0, "usnic_free: %p\n", (void*)frag); opal_output(0, "usnic_free: %p\n", (void*)frag);
#endif #endif
#if 1 /* seperate commit for seperate bug */
OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super)); OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super));
#else
ompi_btl_usnic_send_frag_return_cond((struct ompi_btl_usnic_module_t *)btl,
frag);
#endif
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -338,6 +332,7 @@ usnic_prepare_src(
uint32_t flags) uint32_t flags)
{ {
ompi_btl_usnic_module_t *module = (ompi_btl_usnic_module_t*) base_module; ompi_btl_usnic_module_t *module = (ompi_btl_usnic_module_t*) base_module;
mca_btl_base_descriptor_t *desc;
ompi_btl_usnic_send_frag_t *frag; ompi_btl_usnic_send_frag_t *frag;
uint32_t payload_len; uint32_t payload_len;
struct iovec iov; struct iovec iov;
@ -498,18 +493,22 @@ usnic_prepare_src(
/* fragment accounting */ /* fragment accounting */
frag->sf_ack_bytes_left = payload_len; frag->sf_ack_bytes_left = payload_len;
desc = &frag->sf_base.uf_base;
#if MSGDEBUG2 #if MSGDEBUG2
opal_output(0, "prep_src: %s %s frag %p, size=%d+%d, src=%p\n", opal_output(0, "prep_src: %s %s frag %p, size=%d+%d\n",
module->device->name, module->device->name,
payload_len <= module->max_frag_payload?"small":"large", payload_len <= module->max_frag_payload?"small":"large",
(void *)frag, (int)reserve, (int)*size, (void *)frag, (int)reserve, (int)*size);
frag->sf_base.uf_base.des_src[0].seg_addr.pval); { int i;
opal_output(0, " data_ptr = %p, conv=%p\n", for (i=0; i<desc->des_src_cnt; ++i)
data_ptr, (void *)frag->sf_convertor); opal_output(0, " %d: ptr:%p len:%d\n", i,
desc->des_src[i].seg_addr.pval,
desc->des_src[i].seg_len);
}
#endif #endif
return &frag->sf_base.uf_base; return desc;
} }
static mca_btl_base_descriptor_t* static mca_btl_base_descriptor_t*
@ -571,12 +570,20 @@ usnic_put(
frag = (ompi_btl_usnic_send_frag_t *)des; frag = (ompi_btl_usnic_send_frag_t *)des;
#if MSGDEBUG2 #if MSGDEBUG2
opal_output(0, "usnic_put: %"PRIu64" bytes to %p\n", opal_output(0, "usnic_put, frag=%p, source=\n", frag);
des->des_dst->seg_len, { int i;
des->des_dst->seg_addr.pval); for (i=0; i<des->des_src_cnt; ++i)
opal_output(0, " des_dst=%p, frag->uf_dst_seg=%p\n", opal_output(0, " %d: ptr:%p len:%d\n", i,
(void *)des->des_dst, des->des_src[i].seg_addr.pval,
(void *)frag->sf_base.uf_dst_seg); des->des_src[i].seg_len);
}
opal_output(0, "dest:\n");
{ int i;
for (i=0; i<des->des_dst_cnt; ++i)
opal_output(0, " %d: ptr:%p len:%d\n", i,
des->des_dst[i].seg_addr.pval,
des->des_dst[i].seg_len);
}
#endif #endif
/* copy out address - why does he not use ours? silly PML */ /* copy out address - why does he not use ours? silly PML */
@ -1014,6 +1021,7 @@ usnic_handle_large_send(
chp->ch_frag_id = lfrag->lsf_frag_id; chp->ch_frag_id = lfrag->lsf_frag_id;
chp->ch_frag_size = lfrag->lsf_base.sf_size; chp->ch_frag_size = lfrag->lsf_base.sf_size;
chp->ch_frag_offset = lfrag->lsf_cur_offset; chp->ch_frag_offset = lfrag->lsf_cur_offset;
chp->ch_hdr.tag = lfrag->lsf_tag;
/* set actual packet length for verbs */ /* set actual packet length for verbs */
sseg->ss_base.us_sg_entry[0].length = sseg->ss_base.us_sg_entry[0].length =
@ -1036,24 +1044,29 @@ usnic_handle_large_send(
/* done with fragment? */ /* done with fragment? */
if (lfrag->lsf_bytes_left == 0) { if (lfrag->lsf_bytes_left == 0) {
/* only callback now if this was not a PUT, otherwise /* remove this frag from sending list now because PML may
* we need to wait until last byte is ACKed * decide to put it on some other list in the callback
*/ */
if (frag->sf_base.uf_dst_seg[0].seg_addr.pval == NULL) { opal_list_remove_item(&endpoint->endpoint_frag_send_queue,
&frag->sf_base.uf_base.super.super);
#if MSGDEBUG1 /* only callback now if this was not a PUT and we own the fragment,
opal_output(0, " calling back %p, len=%zd\n", * otherwise we need to wait until last byte is ACKed
(void*)(uintptr_t)frag->sf_base.uf_base.des_cbfunc, */
frag->sf_size); if (frag->sf_base.uf_dst_seg[0].seg_addr.pval == NULL &&
(frag->sf_base.uf_base.des_flags &
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
#if MSGDEBUG2
opal_output(0, "callback for large frag %p, len=%zd\n",
(void *)frag->sf_base.uf_base.des_cbfunc, frag->sf_size);
#endif #endif
frag->sf_base.uf_base.des_cbfunc(&module->super, frag->sf_base.uf_base.des_cbfunc(&module->super,
frag->sf_endpoint, &frag->sf_base.uf_base, frag->sf_endpoint, &frag->sf_base.uf_base,
OMPI_SUCCESS); OMPI_SUCCESS);
++module->pml_send_callbacks; ++module->pml_send_callbacks;
frag->sf_base.uf_base.des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
} }
opal_list_remove_item(&endpoint->endpoint_frag_send_queue,
&frag->sf_base.uf_base.super.super);
} }
} }
@ -1113,6 +1126,13 @@ ompi_btl_usnic_module_progress_sends(
* Send ptr and length will be in uf_src_seg[0] * Send ptr and length will be in uf_src_seg[0]
*/ */
if (frag->sf_base.uf_type == OMPI_BTL_USNIC_FRAG_SMALL_SEND) { if (frag->sf_base.uf_type == OMPI_BTL_USNIC_FRAG_SMALL_SEND) {
/* remove this frag from sending list now because PML may
* decide to put it on some other list in the callback
*/
opal_list_remove_item(&endpoint->endpoint_frag_send_queue,
&frag->sf_base.uf_base.super.super);
sfrag = (ompi_btl_usnic_small_send_frag_t *)frag; sfrag = (ompi_btl_usnic_small_send_frag_t *)frag;
sseg = &sfrag->ssf_segment; sseg = &sfrag->ssf_segment;
@ -1121,9 +1141,12 @@ ompi_btl_usnic_module_progress_sends(
sseg->ss_base.us_btl_header->payload_len = payload_len; sseg->ss_base.us_btl_header->payload_len = payload_len;
#if MSGDEBUG1 #if MSGDEBUG1
opal_output(0, "send small, ptr=%"PRIu64", payload=%zd, len=%"PRIu32"\n", opal_output(0, "progress send small, frag=%p, ptr=%p, payload=%zd, len=%"PRIu32", ep=%p, tag=%d\n",
sseg->ss_base.us_sg_entry[0].addr, payload_len, (void *)frag,
sseg->ss_base.us_sg_entry[0].length); (void *)sseg->ss_base.us_sg_entry[0].addr, payload_len,
sseg->ss_base.us_sg_entry[0].length,
(void *)frag->sf_endpoint,
sseg->ss_base.us_btl_header->tag);
#endif #endif
/* post the send */ /* post the send */
@ -1131,23 +1154,30 @@ ompi_btl_usnic_module_progress_sends(
/* don't do callback yet if this is a put */ /* don't do callback yet if this is a put */
if (frag->sf_base.uf_dst_seg[0].seg_addr.pval == NULL) { if (frag->sf_base.uf_dst_seg[0].seg_addr.pval == NULL) {
#if MSGDEBUG1 /* we have copied the data, perform a callback if
opal_output(0, " calling back %p, len=%"PRIu64"\n", * we own the fragment and callback is requested.
(void*)(uintptr_t)frag->sf_base.uf_base.des_cbfunc, * If we don't own the fragment, we cannot callback yet
frag->sf_base.uf_src_seg[0].seg_len); * because we are not done with the segment inside.
* (ACK not received yet)
*/
if ((frag->sf_base.uf_base.des_flags &
(MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) ==
(MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
#if MSGDEBUG2
opal_output(0, "callback frag small %p, len=%"PRIu64"\n",
(void*)frag, frag->sf_base.uf_src_seg[0].seg_len);
#endif #endif
/* we have copied the data, proceed with callback */ frag->sf_base.uf_base.des_cbfunc(&module->super,
/* could be done in usnic_send? XXX */ frag->sf_endpoint, &frag->sf_base.uf_base,
frag->sf_base.uf_base.des_cbfunc(&module->super, OMPI_SUCCESS);
frag->sf_endpoint, &frag->sf_base.uf_base, ++module->pml_send_callbacks;
OMPI_SUCCESS); frag->sf_base.uf_base.des_flags &=
++module->pml_send_callbacks; ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
}
} }
/* remove frag from sending list */
opal_list_remove_item(&endpoint->endpoint_frag_send_queue,
&sfrag->ssf_base.sf_base.uf_base.super.super);
/* Large sends... */ /* Large sends... */
} else { } else {
usnic_handle_large_send(module, endpoint, frag); usnic_handle_large_send(module, endpoint, frag);
@ -1189,6 +1219,21 @@ ompi_btl_usnic_module_progress_sends(
/* /*
* Initiate a send. * Initiate a send.
*
* Send completion callbacks can be done from a few different places.
*
* If this is a send from a fragment we do not own, we always have
* to wait for the last ACK of the fragment, because we cannot allow
* the fragment to be re-used until we know we have no more retransmits to do.
*
* If this is a send from a fragment we own, and we know we have copied the
* data from the user's buffer, we can perform the callback immediately
* (or possibly not at all, simply returning "1" to indicate completion).
*
* If this is a send from a fragment we own and we have not yet copied out
* all the data (as is the case in a large send) then we defer the callback
* until the last of the data has been copied out by routines called
* from ompi_btl_usnic_progress_sends()
*/ */
static int usnic_send(struct mca_btl_base_module_t* base_module, static int usnic_send(struct mca_btl_base_module_t* base_module,
struct mca_btl_base_endpoint_t* base_endpoint, struct mca_btl_base_endpoint_t* base_endpoint,
@ -1210,7 +1255,12 @@ static int usnic_send(struct mca_btl_base_module_t* base_module,
opal_output(0, "usnic_send: frag=%p, endpoint=%p, tag=%d, sendreq=%p\n", opal_output(0, "usnic_send: frag=%p, endpoint=%p, tag=%d, sendreq=%p\n",
(void *)frag, (void *)endpoint, (void *)frag, (void *)endpoint,
tag, (void *)descriptor->des_cbdata); tag, (void *)descriptor->des_cbdata);
opal_output(0, " data = %p\n", descriptor->des_src[0].seg_addr.pval); { int i;
for (i=0; i<descriptor->des_src_cnt; ++i)
opal_output(0, " %d: ptr:%p len:%d\n", i,
descriptor->des_src[i].seg_addr.pval,
descriptor->des_src[i].seg_len);
}
#endif #endif
assert(frag->sf_endpoint == endpoint); assert(frag->sf_endpoint == endpoint);
@ -1257,6 +1307,7 @@ static int usnic_send(struct mca_btl_base_module_t* base_module,
sseg->ss_send_desc.send_flags |= IBV_SEND_INLINE; sseg->ss_send_desc.send_flags |= IBV_SEND_INLINE;
sseg->ss_channel = USNIC_PRIORITY_CHANNEL; sseg->ss_channel = USNIC_PRIORITY_CHANNEL;
sseg->ss_base.us_btl_header->tag = tag;
#if MSGDEBUG2 #if MSGDEBUG2
opal_output(0, "conv = %p\n", frag->sf_convertor); opal_output(0, "conv = %p\n", frag->sf_convertor);
opal_output(0, " inline frag %d segs %p(%d) + %p(%d)\n", opal_output(0, " inline frag %d segs %p(%d) + %p(%d)\n",
@ -1292,14 +1343,30 @@ static int usnic_send(struct mca_btl_base_module_t* base_module,
sizeof(ompi_btl_usnic_btl_header_t) + frag->sf_size; sizeof(ompi_btl_usnic_btl_header_t) + frag->sf_size;
} }
/* If requested, callback now, else just return 1 to show completion */ /* If we own the frag and callback was requested, callback now,
if (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { * else just return 1 to show completion.
frag->sf_base.uf_base.des_cbfunc(&module->super, * If we don't own the frag, need to wait for ACK before
frag->sf_endpoint, &frag->sf_base.uf_base, * performing callback on the frag
OMPI_SUCCESS); */
rc = 0; if (descriptor->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP) {
if (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) {
#if MSGDEBUG2
opal_output(0, "immediate callback for frag %p\n", (void *)frag);
#endif
frag->sf_base.uf_base.des_cbfunc(&module->super,
frag->sf_endpoint, &frag->sf_base.uf_base,
OMPI_SUCCESS);
rc = 0;
descriptor->des_flags &= ~MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
} else {
#if MSGDEBUG2
opal_output(0, "skipping callback for frag %p\n", (void *)frag);
#endif
rc = 1;
}
} else { } else {
rc = 1; descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
rc = 0;
} }
++module->pml_module_sends; ++module->pml_module_sends;
++module->pml_send_callbacks; /* returning "1" is an implicit CB */ ++module->pml_send_callbacks; /* returning "1" is an implicit CB */

Просмотреть файл

@ -56,6 +56,7 @@ void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module,
mca_btl_active_message_callback_t* reg; mca_btl_active_message_callback_t* reg;
ompi_btl_usnic_endpoint_t *endpoint; ompi_btl_usnic_endpoint_t *endpoint;
ompi_btl_usnic_btl_chunk_header_t *chunk_hdr; ompi_btl_usnic_btl_chunk_header_t *chunk_hdr;
ompi_btl_usnic_btl_header_t *hdr;
uint32_t window_index; uint32_t window_index;
int rc; int rc;
#if MSGDEBUG1 #if MSGDEBUG1
@ -110,11 +111,11 @@ void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module,
return; return;
} }
hdr = seg->rs_base.us_btl_header;
#if MSGDEBUG1 #if MSGDEBUG1
opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ ", len=%d\n", opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ ", len=%d\n",
(void*) endpoint, (void*) endpoint, hdr->seq, hdr->payload_len);
seg->rs_base.us_btl_header->seq,
seg->rs_base.us_btl_header->payload_len);
#if 0 #if 0
opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ " from %s to %s: GOOD! (rel seq %d, lowest seq %" UDSEQ ", highest seq: %" UDSEQ ", rwstart %d) seg %p, module %p\n", opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ " from %s to %s: GOOD! (rel seq %d, lowest seq %" UDSEQ ", highest seq: %" UDSEQ ", rwstart %d) seg %p, module %p\n",
@ -126,7 +127,7 @@ void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module,
endpoint->endpoint_highest_seq_rcvd, endpoint->endpoint_highest_seq_rcvd,
endpoint->endpoint_rfstart, endpoint->endpoint_rfstart,
(void*) seg, (void*) module); (void*) seg, (void*) module);
if (seg->rs_base.us_btl_header->put_addr != NULL) { if (hdr->put_addr != NULL) {
opal_output(0, " put_addr = %p\n", opal_output(0, " put_addr = %p\n",
seg->rs_base.us_btl_header->put_addr); seg->rs_base.us_btl_header->put_addr);
} }
@ -139,12 +140,10 @@ void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module,
* the frame length to meet minimum sizes, add protocol information, * the frame length to meet minimum sizes, add protocol information,
* etc. * etc.
*/ */
if (seg->rs_base.us_btl_header->put_addr == NULL) { if (hdr->put_addr == NULL) {
reg = mca_btl_base_active_message_trigger + reg = mca_btl_base_active_message_trigger + hdr->tag;
bseg->us_payload.pml_header->tag; seg->rs_segment.seg_len = hdr->payload_len;
seg->rs_segment.seg_len = bseg->us_btl_header->payload_len; reg->cbfunc(&module->super, hdr->tag, &seg->rs_desc, reg->cbdata);
reg->cbfunc(&module->super, bseg->us_payload.pml_header->tag,
&seg->rs_desc, reg->cbdata);
/* /*
* If this is a PUT, need to copy it to user buffer * If this is a PUT, need to copy it to user buffer
@ -269,15 +268,10 @@ opal_output(0, "Start PUT to %p\n", chunk_hdr->ch_hdr.put_addr);
fip->rfi_bytes_left -= chunk_hdr->ch_hdr.payload_len; fip->rfi_bytes_left -= chunk_hdr->ch_hdr.payload_len;
if (0 == fip->rfi_bytes_left) { if (0 == fip->rfi_bytes_left) {
mca_btl_base_header_t *pml_header;
mca_btl_base_descriptor_t desc; mca_btl_base_descriptor_t desc;
mca_btl_base_segment_t segment; mca_btl_base_segment_t segment;
/* Get access to PML header in assembled fragment so we segment.seg_addr.pval = fip->rfi_data;
* can pull out the tag
*/
pml_header = (mca_btl_base_header_t *)(fip->rfi_data);
segment.seg_addr.pval = pml_header;
segment.seg_len = fip->rfi_frag_size; segment.seg_len = fip->rfi_frag_size;
desc.des_dst = &segment; desc.des_dst = &segment;
desc.des_dst_cnt = 1; desc.des_dst_cnt = 1;
@ -289,12 +283,13 @@ opal_output(0, "Start PUT to %p\n", chunk_hdr->ch_hdr.put_addr);
#if MSGDEBUG2 #if MSGDEBUG2
opal_output(0, " large FRAG complete, pass up %p, %"PRIu64" bytes, tag=%d\n", opal_output(0, " large FRAG complete, pass up %p, %"PRIu64" bytes, tag=%d\n",
desc.des_dst->seg_addr.pval, desc.des_dst->seg_len, desc.des_dst->seg_addr.pval, desc.des_dst->seg_len,
pml_header->tag); chunk_hdr->ch_hdr.tag);
#endif #endif
reg = mca_btl_base_active_message_trigger + pml_header->tag; reg = mca_btl_base_active_message_trigger +
chunk_hdr->ch_hdr.tag;
/* mca_pml_ob1_recv_frag_callback_frag() */ /* mca_pml_ob1_recv_frag_callback_frag() */
reg->cbfunc(&module->super, pml_header->tag, reg->cbfunc(&module->super, chunk_hdr->ch_hdr.tag,
&desc, reg->cbdata); &desc, reg->cbdata);
/* free temp buffer for non-put */ /* free temp buffer for non-put */

Просмотреть файл

@ -283,10 +283,9 @@ ompi_btl_usnic_recv_fast(ompi_btl_usnic_module_t *module,
* the frame length to meet minimum sizes, add protocol information, * the frame length to meet minimum sizes, add protocol information,
* etc. * etc.
*/ */
reg = mca_btl_base_active_message_trigger + reg = mca_btl_base_active_message_trigger + bseg->us_btl_header->tag;
bseg->us_payload.pml_header->tag;
seg->rs_segment.seg_len = bseg->us_btl_header->payload_len; seg->rs_segment.seg_len = bseg->us_btl_header->payload_len;
reg->cbfunc(&module->super, bseg->us_payload.pml_header->tag, reg->cbfunc(&module->super, bseg->us_btl_header->tag,
&seg->rs_desc, reg->cbdata); &seg->rs_desc, reg->cbdata);
drop: drop:
@ -384,10 +383,9 @@ ompi_btl_usnic_recv(ompi_btl_usnic_module_t *module,
* the frame length to meet minimum sizes, add protocol information, * the frame length to meet minimum sizes, add protocol information,
* etc. * etc.
*/ */
reg = mca_btl_base_active_message_trigger + reg = mca_btl_base_active_message_trigger + bseg->us_btl_header->tag;
bseg->us_payload.pml_header->tag;
seg->rs_segment.seg_len = bseg->us_btl_header->payload_len; seg->rs_segment.seg_len = bseg->us_btl_header->payload_len;
reg->cbfunc(&module->super, bseg->us_payload.pml_header->tag, reg->cbfunc(&module->super, bseg->us_btl_header->tag,
&seg->rs_desc, reg->cbdata); &seg->rs_desc, reg->cbdata);
} else { } else {

Просмотреть файл

@ -152,6 +152,7 @@ ompi_btl_usnic_send_slower(
/* use standard channel */ /* use standard channel */
sseg->ss_channel = USNIC_DATA_CHANNEL; sseg->ss_channel = USNIC_DATA_CHANNEL;
sseg->ss_base.us_btl_header->tag = tag;
#if MSGDEBUG2 #if MSGDEBUG2
opal_output(0, " small frag %d segs %p(%d) + %p(%d)\n", opal_output(0, " small frag %d segs %p(%d) + %p(%d)\n",
(int)frag->sf_base.uf_base.des_src_cnt, (int)frag->sf_base.uf_base.des_src_cnt,
@ -166,6 +167,12 @@ ompi_btl_usnic_send_slower(
(void *)sseg->ss_send_desc.sg_list[1].addr, (void *)sseg->ss_send_desc.sg_list[1].addr,
sseg->ss_send_desc.sg_list[1].length); sseg->ss_send_desc.sg_list[1].length);
#endif #endif
} else {
ompi_btl_usnic_large_send_frag_t *lfrag;
lfrag = (ompi_btl_usnic_large_send_frag_t *)frag;
lfrag->lsf_tag = tag;
} }
/* queue this fragment into the send engine */ /* queue this fragment into the send engine */

Просмотреть файл

@ -53,6 +53,21 @@ ompi_btl_usnic_check_rts(
} }
} }
#if MSGDEBUG2
/*
 * Debug helper: add up the length fields of the first num_sge entries
 * in the work request's sg_list and return the sum.
 */
static inline
int sge_total(struct ibv_send_wr *wr)
{
    int total = 0;
    int idx = 0;

    /* walk each scatter/gather entry, accumulating its length */
    while (idx < wr->num_sge) {
        total += wr->sg_list[idx].length;
        ++idx;
    }

    return total;
}
#endif
/* /*
* Common point for posting a segment to VERBS * Common point for posting a segment to VERBS
*/ */
@ -68,10 +83,11 @@ ompi_btl_usnic_post_segment(
int ret; int ret;
#if MSGDEBUG1 #if MSGDEBUG1
opal_output(0, "post_send: type=%d, addr=%p, len=%d\n", opal_output(0, "post_send: type=%s, addr=%p, len=%d, payload=%d\n",
sseg->ss_base.us_type, usnic_seg_type(sseg->ss_base.us_type),
(void*) sseg->ss_send_desc.sg_list->addr, (void*) sseg->ss_send_desc.sg_list->addr,
sseg->ss_send_desc.sg_list->length); sge_total(&sseg->ss_send_desc),
sseg->ss_base.us_btl_header->payload_len);
/*ompi_btl_usnic_dump_hex((void *)(sseg->ss_send_desc.sg_list->addr + sizeof(ompi_btl_usnic_btl_header_t)), 16); */ /*ompi_btl_usnic_dump_hex((void *)(sseg->ss_send_desc.sg_list->addr + sizeof(ompi_btl_usnic_btl_header_t)), 16); */
#endif #endif
@ -215,8 +231,9 @@ ompi_btl_usnic_endpoint_enqueue_frag(
ompi_btl_usnic_send_frag_t *frag) ompi_btl_usnic_send_frag_t *frag)
{ {
#if MSGDEBUG1 #if MSGDEBUG1
opal_output(0, "enq_frag: frag=%p, endpoint=%p, type=%d, len=%"PRIu64"\n", opal_output(0, "enq_frag: frag=%p, endpoint=%p, %s, len=%"PRIu64"\n",
(void*)frag, (void*)endpoint, frag->sf_base.uf_type, (void*)frag, (void*)endpoint,
usnic_frag_type(frag->sf_base.uf_type),
frag->sf_base.uf_base.des_src->seg_len); frag->sf_base.uf_base.des_src->seg_len);
if (frag->sf_base.uf_type == OMPI_BTL_USNIC_FRAG_LARGE_SEND) { if (frag->sf_base.uf_type == OMPI_BTL_USNIC_FRAG_LARGE_SEND) {
ompi_btl_usnic_large_send_frag_t *lfrag; ompi_btl_usnic_large_send_frag_t *lfrag;