usnic: pack via convertor on the fly
If we need to use a convertor, go back to stashing that convertor in the frag and populating segments "on the fly" (in ompi_btl_usnic_module_progress_sends). Previously we would pack into a chain of chunk segments at prepare_src time, unnecessarily consuming additional memory. Reviewed-by: Jeff Squyres <jsquyres@cisco.com> Reviewed-by: Reese Faucette <rfaucett@cisco.com> This commit was SVN r29592.
Этот коммит содержится в:
родитель
71d0d73575
Коммит
73a943492c
@ -164,6 +164,9 @@ typedef struct ompi_btl_usnic_component_t {
|
|||||||
|
|
||||||
/** retrans characteristics */
|
/** retrans characteristics */
|
||||||
int retrans_timeout;
|
int retrans_timeout;
|
||||||
|
|
||||||
|
/** convertor packing threshold */
|
||||||
|
int pack_lazy_threshold;
|
||||||
} ompi_btl_usnic_component_t;
|
} ompi_btl_usnic_component_t;
|
||||||
|
|
||||||
OMPI_MODULE_DECLSPEC extern ompi_btl_usnic_component_t mca_btl_usnic_component;
|
OMPI_MODULE_DECLSPEC extern ompi_btl_usnic_component_t mca_btl_usnic_component;
|
||||||
|
@ -128,9 +128,11 @@ ompi_btl_usnic_handle_ack(
|
|||||||
frag = sseg->ss_parent_frag;
|
frag = sseg->ss_parent_frag;
|
||||||
|
|
||||||
#if MSGDEBUG1
|
#if MSGDEBUG1
|
||||||
opal_output(0, " ACKED seg %p, frag %p, ack_bytes=%"PRIu32", left=%zd\n",
|
opal_output(0, " ACKED seg %p frag %p ack_bytes=%"PRIu32" left=%zd dst_seg[0].seg_addr=%p des_flags=0x%x\n",
|
||||||
(void*)sseg, (void*)frag, bytes_acked,
|
(void*)sseg, (void*)frag, bytes_acked,
|
||||||
frag->sf_ack_bytes_left-bytes_acked);
|
frag->sf_ack_bytes_left-bytes_acked,
|
||||||
|
frag->sf_base.uf_dst_seg[0].seg_addr.pval,
|
||||||
|
frag->sf_base.uf_base.des_flags);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* If all ACKs received, and this is a put or a regular send
|
/* If all ACKs received, and this is a put or a regular send
|
||||||
|
@ -169,6 +169,7 @@ send_frag_constructor(ompi_btl_usnic_send_frag_t *frag)
|
|||||||
desc->order = MCA_BTL_NO_ORDER;
|
desc->order = MCA_BTL_NO_ORDER;
|
||||||
desc->des_flags = 0;
|
desc->des_flags = 0;
|
||||||
|
|
||||||
|
OBJ_CONSTRUCT(&frag->sf_convertor, opal_convertor_t);
|
||||||
frag->sf_seg_post_cnt = 0;
|
frag->sf_seg_post_cnt = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -183,6 +184,8 @@ send_frag_destructor(ompi_btl_usnic_send_frag_t *frag)
|
|||||||
assert(0 == frag->sf_base.uf_src_seg[0].seg_len);
|
assert(0 == frag->sf_base.uf_src_seg[0].seg_len);
|
||||||
/* PML may change desc->des_dst to point elsewhere, cannot assert that it
|
/* PML may change desc->des_dst to point elsewhere, cannot assert that it
|
||||||
* still points to our embedded segment */
|
* still points to our embedded segment */
|
||||||
|
|
||||||
|
OBJ_DESTRUCT(&frag->sf_convertor);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -230,6 +233,7 @@ large_send_frag_constructor(ompi_btl_usnic_large_send_frag_t *lfrag)
|
|||||||
|
|
||||||
lfrag->lsf_buffer = NULL;
|
lfrag->lsf_buffer = NULL;
|
||||||
OBJ_CONSTRUCT(&lfrag->lsf_seg_chain, opal_list_t);
|
OBJ_CONSTRUCT(&lfrag->lsf_seg_chain, opal_list_t);
|
||||||
|
lfrag->lsf_pack_on_the_fly = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -251,8 +251,8 @@ typedef struct ompi_btl_usnic_send_frag_t {
|
|||||||
|
|
||||||
size_t sf_size; /* total_fragment size (upper + user payload) */
|
size_t sf_size; /* total_fragment size (upper + user payload) */
|
||||||
|
|
||||||
/* original message data if convertor required */
|
struct opal_convertor_t sf_convertor; /* copy of original message data if
|
||||||
struct opal_convertor_t* sf_convertor;
|
convertor required */
|
||||||
|
|
||||||
uint32_t sf_seg_post_cnt; /* total segs currently posted for this frag */
|
uint32_t sf_seg_post_cnt; /* total segs currently posted for this frag */
|
||||||
size_t sf_ack_bytes_left; /* bytes remaining to be ACKed */
|
size_t sf_ack_bytes_left; /* bytes remaining to be ACKed */
|
||||||
@ -272,19 +272,29 @@ typedef struct ompi_btl_usnic_large_send_frag_t {
|
|||||||
mca_btl_base_tag_t lsf_tag; /* save tag */
|
mca_btl_base_tag_t lsf_tag; /* save tag */
|
||||||
|
|
||||||
uint32_t lsf_frag_id; /* fragment ID for reassembly */
|
uint32_t lsf_frag_id; /* fragment ID for reassembly */
|
||||||
size_t lsf_cur_offset; /* current offset into message */
|
|
||||||
size_t lsf_bytes_left; /* bytes remaining to send */
|
size_t lsf_cur_offset; /* next byte offset to be enqueued on the
|
||||||
uint8_t *lsf_cur_ptr; /* current send pointer */
|
endpoint (incl. any convertor payload) */
|
||||||
|
size_t lsf_bytes_left; /* bytes remaining to be enqueued on the
|
||||||
|
endpoint (incl. any convertor payload) */
|
||||||
|
size_t lsf_pack_bytes_left; /* bytes remaining to be packed into chunk
|
||||||
|
segments (incl. any convertor payload) */
|
||||||
|
uint8_t *lsf_cur_ptr; /* current packing pointer */
|
||||||
int lsf_cur_sge;
|
int lsf_cur_sge;
|
||||||
size_t lsf_bytes_left_in_sge;
|
size_t lsf_bytes_left_in_sge;
|
||||||
|
|
||||||
uint8_t *lsf_buffer; /* attached storage for usnic_alloc() */
|
uint8_t *lsf_buffer; /* attached storage for usnic_alloc() */
|
||||||
|
|
||||||
/* this will go away when we update convertor approach */
|
|
||||||
opal_list_t lsf_seg_chain; /* chain of segments for converted data */
|
opal_list_t lsf_seg_chain; /* chain of segments for converted data */
|
||||||
|
|
||||||
|
bool lsf_pack_on_the_fly; /* true if we are packing on the fly */
|
||||||
} ompi_btl_usnic_large_send_frag_t;
|
} ompi_btl_usnic_large_send_frag_t;
|
||||||
|
|
||||||
|
/* Shortcut member macros. Access uf_src_seg array instead of the descriptor's
|
||||||
|
* des_src ptr to save a deref. */
|
||||||
|
#define lsf_des_src lsf_base.sf_base.uf_src_seg
|
||||||
|
#define lsf_des_src_cnt lsf_base.sf_base.uf_base.des_src_cnt
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* small send fragment
|
* small send fragment
|
||||||
* Small send will optimistically use 2 SG entries in hopes of performing
|
* Small send will optimistically use 2 SG entries in hopes of performing
|
||||||
@ -444,6 +454,12 @@ ompi_btl_usnic_frag_return(
|
|||||||
free(lfrag->lsf_buffer);
|
free(lfrag->lsf_buffer);
|
||||||
lfrag->lsf_buffer = NULL;
|
lfrag->lsf_buffer = NULL;
|
||||||
}
|
}
|
||||||
|
lfrag->lsf_pack_on_the_fly = false;
|
||||||
|
|
||||||
|
if (2 == lfrag->lsf_des_src_cnt &&
|
||||||
|
NULL == lfrag->lsf_des_src[1].seg_addr.pval) {
|
||||||
|
opal_convertor_cleanup(&lfrag->lsf_base.sf_convertor);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super));
|
OMPI_FREE_LIST_RETURN_MT(frag->uf_freelist, &(frag->uf_base.super));
|
||||||
|
@ -141,6 +141,7 @@ int ompi_btl_usnic_component_register(void)
|
|||||||
static int max_tiny_payload;
|
static int max_tiny_payload;
|
||||||
static int eager_limit;
|
static int eager_limit;
|
||||||
static int rndv_eager_limit;
|
static int rndv_eager_limit;
|
||||||
|
static int pack_lazy_threshold;
|
||||||
static char *vendor_part_ids;
|
static char *vendor_part_ids;
|
||||||
|
|
||||||
#define CHECK(expr) do {\
|
#define CHECK(expr) do {\
|
||||||
@ -244,6 +245,10 @@ int ompi_btl_usnic_component_register(void)
|
|||||||
ompi_btl_usnic_module_template.super.btl_rndv_eager_limit =
|
ompi_btl_usnic_module_template.super.btl_rndv_eager_limit =
|
||||||
rndv_eager_limit;
|
rndv_eager_limit;
|
||||||
|
|
||||||
|
CHECK(reg_int("pack_lazy_threshold", "Convertor packing on-the-fly threshold (-1 = always pack eagerly, 0 = always pack lazily, otherwise will pack on the fly if fragment size is > limit)",
|
||||||
|
USNIC_DFLT_PACK_LAZY_THRESHOLD, &pack_lazy_threshold, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
|
||||||
|
mca_btl_usnic_component.pack_lazy_threshold = pack_lazy_threshold;
|
||||||
|
|
||||||
/* Default to bandwidth auto-detection */
|
/* Default to bandwidth auto-detection */
|
||||||
ompi_btl_usnic_module_template.super.btl_bandwidth = 0;
|
ompi_btl_usnic_module_template.super.btl_bandwidth = 0;
|
||||||
ompi_btl_usnic_module_template.super.btl_latency = 4;
|
ompi_btl_usnic_module_template.super.btl_latency = 4;
|
||||||
|
@ -56,6 +56,20 @@ ompi_btl_usnic_channel_finalize(
|
|||||||
ompi_btl_usnic_module_t *module,
|
ompi_btl_usnic_module_t *module,
|
||||||
struct ompi_btl_usnic_channel_t *channel);
|
struct ompi_btl_usnic_channel_t *channel);
|
||||||
|
|
||||||
|
/* Compute and set the proper value for sfrag->sf_size. This must not be used
|
||||||
|
* during usnic_alloc, since the PML might change the segment size after
|
||||||
|
* usnic_alloc returns. */
|
||||||
|
static inline void compute_sf_size(ompi_btl_usnic_send_frag_t *sfrag)
|
||||||
|
{
|
||||||
|
ompi_btl_usnic_frag_t *frag;
|
||||||
|
|
||||||
|
frag = &sfrag->sf_base;
|
||||||
|
assert(frag->uf_base.des_src_cnt <= 2);
|
||||||
|
sfrag->sf_size = 0;
|
||||||
|
sfrag->sf_size += frag->uf_src_seg[0].seg_len;
|
||||||
|
sfrag->sf_size += frag->uf_src_seg[1].seg_len;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Add procs to this BTL module, receiving endpoint information from
|
* Add procs to this BTL module, receiving endpoint information from
|
||||||
* the modex.
|
* the modex.
|
||||||
@ -251,6 +265,7 @@ usnic_alloc(struct mca_btl_base_module_t* btl,
|
|||||||
}
|
}
|
||||||
frag = &lfrag->lsf_base;
|
frag = &lfrag->lsf_base;
|
||||||
|
|
||||||
|
assert(size > 0);
|
||||||
lfrag->lsf_buffer = malloc(size);
|
lfrag->lsf_buffer = malloc(size);
|
||||||
if (OPAL_UNLIKELY(NULL == lfrag->lsf_buffer)) {
|
if (OPAL_UNLIKELY(NULL == lfrag->lsf_buffer)) {
|
||||||
ompi_btl_usnic_frag_return(module, &lfrag->lsf_base.sf_base);
|
ompi_btl_usnic_frag_return(module, &lfrag->lsf_base.sf_base);
|
||||||
@ -259,6 +274,9 @@ usnic_alloc(struct mca_btl_base_module_t* btl,
|
|||||||
|
|
||||||
/* pointer to buffer for caller */
|
/* pointer to buffer for caller */
|
||||||
frag->sf_base.uf_base.des_src[0].seg_addr.pval = lfrag->lsf_buffer;
|
frag->sf_base.uf_base.des_src[0].seg_addr.pval = lfrag->lsf_buffer;
|
||||||
|
|
||||||
|
MSGDEBUG1_OUT("usnic_alloc: packing frag %p on the fly", (void *)frag);
|
||||||
|
lfrag->lsf_pack_on_the_fly = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if MSGDEBUG2
|
#if MSGDEBUG2
|
||||||
@ -270,9 +288,6 @@ usnic_alloc(struct mca_btl_base_module_t* btl,
|
|||||||
/* set endpoint */
|
/* set endpoint */
|
||||||
frag->sf_endpoint = endpoint;
|
frag->sf_endpoint = endpoint;
|
||||||
|
|
||||||
/* no convertor */
|
|
||||||
frag->sf_convertor = NULL;
|
|
||||||
|
|
||||||
/* set up descriptor */
|
/* set up descriptor */
|
||||||
desc = &frag->sf_base.uf_base;
|
desc = &frag->sf_base.uf_base;
|
||||||
desc->des_flags = flags;
|
desc->des_flags = flags;
|
||||||
@ -325,23 +340,359 @@ static int usnic_free(struct mca_btl_base_module_t* btl,
|
|||||||
* - BTL SEND: will be used after ALLOC / PREPARE
|
* - BTL SEND: will be used after ALLOC / PREPARE
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* Responsible for handling "small" frags (reserve + *size <= max_frag_payload)
|
||||||
|
* in the same manner as btl_prepare_src. Must return a smaller amount than
|
||||||
|
* requested if the given convertor cannot process the entire (*size).
|
||||||
|
*/
|
||||||
|
static inline
|
||||||
|
ompi_btl_usnic_send_frag_t *
|
||||||
|
prepare_src_small(
|
||||||
|
struct ompi_btl_usnic_module_t* module,
|
||||||
|
struct mca_btl_base_endpoint_t* endpoint,
|
||||||
|
struct mca_mpool_base_registration_t* registration,
|
||||||
|
struct opal_convertor_t* convertor,
|
||||||
|
uint8_t order,
|
||||||
|
size_t reserve,
|
||||||
|
size_t* size,
|
||||||
|
uint32_t flags)
|
||||||
|
{
|
||||||
|
ompi_btl_usnic_send_frag_t *frag;
|
||||||
|
ompi_btl_usnic_small_send_frag_t *sfrag;
|
||||||
|
size_t payload_len;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
payload_len = *size + reserve;
|
||||||
|
assert(payload_len <= module->max_frag_payload); /* precondition */
|
||||||
|
|
||||||
|
sfrag = ompi_btl_usnic_small_send_frag_alloc(module);
|
||||||
|
if (OPAL_UNLIKELY(NULL == sfrag)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
frag = &sfrag->ssf_base;
|
||||||
|
|
||||||
|
/* In the case of a convertor, we will copy the data in now, since that is
|
||||||
|
* the cheapest way to discover how much we can actually send (since we know
|
||||||
|
* we will pack it anyway later). The alternative is to do all of the
|
||||||
|
* following:
|
||||||
|
* 1) clone_with_position(convertor) and see where the new position ends up
|
||||||
|
* actually being (see ompi_btl_usnic_convertor_pack_peek). Otherwise we
|
||||||
|
* aren't fulfilling our contract w.r.t. (*size).
|
||||||
|
* 2) Add a bunch of branches checking for different cases, both here and in
|
||||||
|
* progress_sends
|
||||||
|
* 3) If we choose to defer the packing, we must clone the convertor because
|
||||||
|
* the PML owns it and might reuse it for another prepare_src call.
|
||||||
|
*
|
||||||
|
* Two convertor clones is likely to be at least as slow as just copying the
|
||||||
|
* data and might consume a similar amount of memory. Plus we still have to
|
||||||
|
* pack it later to send it.
|
||||||
|
*
|
||||||
|
* The reason we do not copy non-convertor buffer at this point is because
|
||||||
|
* we might still use INLINE for the send, and in that case we do not want
|
||||||
|
* to copy the data at all.
|
||||||
|
*/
|
||||||
|
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
|
||||||
|
/* put user data just after end of 1st seg (upper layer header) */
|
||||||
|
assert(payload_len <= module->max_frag_payload);
|
||||||
|
rc = usnic_convertor_pack_simple(
|
||||||
|
convertor,
|
||||||
|
(IOVBASE_TYPE*)(frag->sf_base.uf_src_seg[0].seg_addr.lval + reserve),
|
||||||
|
*size,
|
||||||
|
size);
|
||||||
|
payload_len = reserve + *size;
|
||||||
|
frag->sf_base.uf_base.des_src_cnt = 1;
|
||||||
|
/* PML will copy header into beginning of segment */
|
||||||
|
frag->sf_base.uf_src_seg[0].seg_len = payload_len;
|
||||||
|
} else {
|
||||||
|
opal_convertor_get_current_pointer(convertor,
|
||||||
|
&sfrag->ssf_base.sf_base.uf_src_seg[1].seg_addr.pval);
|
||||||
|
frag->sf_base.uf_base.des_src_cnt = 2;
|
||||||
|
frag->sf_base.uf_src_seg[0].seg_len = reserve;
|
||||||
|
frag->sf_base.uf_src_seg[1].seg_len = *size;
|
||||||
|
}
|
||||||
|
|
||||||
|
frag->sf_base.uf_base.des_flags = flags;
|
||||||
|
frag->sf_endpoint = endpoint;
|
||||||
|
|
||||||
|
return frag;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Packs data from the given large send frag into single new segment and
|
||||||
|
* returns a pointer to it. The packed data comes first from SG[0] (PML
|
||||||
|
* header) and then second from either SG[1] (if seg_addr is non-NULL) or from
|
||||||
|
* the convertor contained in the frag.
|
||||||
|
*
|
||||||
|
* The frag's bookkeeping data will be updated appropriately. */
|
||||||
|
static
|
||||||
|
ompi_btl_usnic_chunk_segment_t *
|
||||||
|
pack_chunk_seg_from_frag(
|
||||||
|
struct ompi_btl_usnic_module_t* module,
|
||||||
|
ompi_btl_usnic_large_send_frag_t *lfrag)
|
||||||
|
{
|
||||||
|
ompi_btl_usnic_chunk_segment_t *seg;
|
||||||
|
uint8_t *copyptr;
|
||||||
|
size_t copylen;
|
||||||
|
size_t seg_space;
|
||||||
|
size_t max_data;
|
||||||
|
mca_btl_base_descriptor_t *desc;
|
||||||
|
|
||||||
|
assert(NULL != lfrag);
|
||||||
|
/* never should be attempting to pack if we've already packed everything */
|
||||||
|
assert(lfrag->lsf_pack_bytes_left > 0);
|
||||||
|
|
||||||
|
desc = &lfrag->lsf_base.sf_base.uf_base;
|
||||||
|
|
||||||
|
seg = ompi_btl_usnic_chunk_segment_alloc(module);
|
||||||
|
if (OPAL_UNLIKELY(NULL == seg)) {
|
||||||
|
/* TODO look at ways to deal with this case more gracefully, possibly as
|
||||||
|
* part of capping the overall BTL memory consumption. Watch out for
|
||||||
|
* possible MPI-layer deadlock. */
|
||||||
|
BTL_ERROR(("chunk segment allocation error"));
|
||||||
|
abort(); /* XXX */
|
||||||
|
}
|
||||||
|
|
||||||
|
seg_space = module->max_chunk_payload;
|
||||||
|
copyptr = seg->ss_base.us_payload.raw;
|
||||||
|
|
||||||
|
/* Keep copying in as long as we have space, there is data to be copied, and
|
||||||
|
* we aren't using a convertor (SG[1] will be NULL if we have a convertor).
|
||||||
|
*/
|
||||||
|
while (seg_space > 0 &&
|
||||||
|
lfrag->lsf_pack_bytes_left > 0 &&
|
||||||
|
NULL != lfrag->lsf_cur_ptr) {
|
||||||
|
if (seg_space > lfrag->lsf_bytes_left_in_sge) {
|
||||||
|
copylen = lfrag->lsf_bytes_left_in_sge;
|
||||||
|
} else {
|
||||||
|
copylen = seg_space;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(copyptr, lfrag->lsf_cur_ptr, copylen);
|
||||||
|
seg_space -= copylen;
|
||||||
|
copyptr += copylen;
|
||||||
|
lfrag->lsf_bytes_left_in_sge -= copylen;
|
||||||
|
lfrag->lsf_pack_bytes_left -= copylen;
|
||||||
|
if (lfrag->lsf_bytes_left_in_sge > 0) {
|
||||||
|
lfrag->lsf_cur_ptr += copylen;
|
||||||
|
} else {
|
||||||
|
++lfrag->lsf_cur_sge;
|
||||||
|
lfrag->lsf_cur_ptr =
|
||||||
|
lfrag->lsf_des_src[lfrag->lsf_cur_sge].seg_addr.pval;
|
||||||
|
lfrag->lsf_bytes_left_in_sge =
|
||||||
|
lfrag->lsf_des_src[lfrag->lsf_cur_sge].seg_len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (seg_space > 0 && lfrag->lsf_pack_bytes_left > 0) {
|
||||||
|
/* the remaining bytes come from a convertor; pack using it */
|
||||||
|
assert(NULL == lfrag->lsf_cur_ptr);
|
||||||
|
assert(1 == lfrag->lsf_cur_sge);
|
||||||
|
|
||||||
|
copylen = lfrag->lsf_pack_bytes_left;
|
||||||
|
if (copylen > seg_space) {
|
||||||
|
copylen = seg_space;
|
||||||
|
}
|
||||||
|
usnic_convertor_pack_simple(&lfrag->lsf_base.sf_convertor, copyptr,
|
||||||
|
copylen, &max_data);
|
||||||
|
seg_space -= max_data;
|
||||||
|
lfrag->lsf_bytes_left_in_sge -= max_data;
|
||||||
|
lfrag->lsf_pack_bytes_left -= max_data;
|
||||||
|
}
|
||||||
|
|
||||||
|
MSGDEBUG1_OUT("%s: packed seg=%p, frag=%p, payload=%zd\n",
|
||||||
|
__func__, (void *)seg, (void *)lfrag,
|
||||||
|
(module->max_chunk_payload - seg_space));
|
||||||
|
|
||||||
|
assert(lfrag->lsf_cur_sge <= 2);
|
||||||
|
assert(seg_space < module->max_chunk_payload); /* must make progress */
|
||||||
|
|
||||||
|
seg->ss_parent_frag = &lfrag->lsf_base;
|
||||||
|
seg->ss_base.us_sg_entry[0].length = module->max_chunk_payload - seg_space;
|
||||||
|
|
||||||
|
return seg;
|
||||||
|
}
|
||||||
|
|
||||||
|
static
|
||||||
|
void *
|
||||||
|
pack_chunk_seg_chain_with_reserve(
|
||||||
|
struct ompi_btl_usnic_module_t* module,
|
||||||
|
ompi_btl_usnic_large_send_frag_t *lfrag,
|
||||||
|
size_t reserve_len,
|
||||||
|
opal_convertor_t *convertor,
|
||||||
|
size_t max_convertor_bytes,
|
||||||
|
size_t *convertor_bytes_packed)
|
||||||
|
{
|
||||||
|
ompi_btl_usnic_chunk_segment_t *seg;
|
||||||
|
void *ret_ptr = NULL;
|
||||||
|
int n_segs;
|
||||||
|
uint8_t *copyptr;
|
||||||
|
size_t copylen;
|
||||||
|
size_t seg_space;
|
||||||
|
size_t max_data;
|
||||||
|
bool first_pass;
|
||||||
|
|
||||||
|
assert(NULL != lfrag);
|
||||||
|
assert(NULL != convertor_bytes_packed);
|
||||||
|
|
||||||
|
n_segs = 0;
|
||||||
|
*convertor_bytes_packed = 0;
|
||||||
|
|
||||||
|
first_pass = true;
|
||||||
|
while (*convertor_bytes_packed < max_convertor_bytes ||
|
||||||
|
first_pass) {
|
||||||
|
seg = ompi_btl_usnic_chunk_segment_alloc(module);
|
||||||
|
if (OPAL_UNLIKELY(NULL == seg)) {
|
||||||
|
BTL_ERROR(("chunk segment allocation error"));
|
||||||
|
abort(); /* XXX */
|
||||||
|
}
|
||||||
|
++n_segs;
|
||||||
|
|
||||||
|
seg_space = module->max_chunk_payload;
|
||||||
|
copyptr = seg->ss_base.us_payload.raw;
|
||||||
|
|
||||||
|
if (first_pass && reserve_len > 0) {
|
||||||
|
/* logic could accommodate >max, but currently doesn't */
|
||||||
|
assert(reserve_len <= module->max_chunk_payload);
|
||||||
|
ret_ptr = copyptr;
|
||||||
|
seg_space -= reserve_len;
|
||||||
|
copyptr += reserve_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* now pack any convertor data */
|
||||||
|
if (*convertor_bytes_packed < max_convertor_bytes && seg_space > 0) {
|
||||||
|
copylen = max_convertor_bytes - *convertor_bytes_packed;
|
||||||
|
if (copylen > seg_space) {
|
||||||
|
copylen = seg_space;
|
||||||
|
}
|
||||||
|
usnic_convertor_pack_simple(convertor, copyptr, copylen, &max_data);
|
||||||
|
seg_space -= max_data;
|
||||||
|
*convertor_bytes_packed += max_data;
|
||||||
|
|
||||||
|
/* If unable to pack any of the remaining bytes, release the
|
||||||
|
* most recently allocated segment and finish processing.
|
||||||
|
*/
|
||||||
|
if (seg_space == module->max_chunk_payload) {
|
||||||
|
assert(max_data == 0); /* only way this can happen */
|
||||||
|
ompi_btl_usnic_chunk_segment_return(module, seg);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* bozo checks */
|
||||||
|
assert(seg_space >= 0);
|
||||||
|
assert(seg_space < module->max_chunk_payload);
|
||||||
|
|
||||||
|
/* append segment of data to chain to send */
|
||||||
|
seg->ss_parent_frag = &lfrag->lsf_base;
|
||||||
|
seg->ss_base.us_sg_entry[0].length = module->max_chunk_payload - seg_space;
|
||||||
|
opal_list_append(&lfrag->lsf_seg_chain, &seg->ss_base.us_list.super);
|
||||||
|
|
||||||
|
#if MSGDEBUG1
|
||||||
|
opal_output(0, "%s: appending seg=%p, frag=%p, payload=%zd\n",
|
||||||
|
__func__, (void *)seg, (void *)lfrag,
|
||||||
|
(module->max_chunk_payload - seg_space));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
first_pass = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret_ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Responsible for handling "large" frags (reserve + *size > max_frag_payload)
|
||||||
|
* in the same manner as btl_prepare_src. Must return a smaller amount than
|
||||||
|
* requested if the given convertor cannot process the entire (*size).
|
||||||
|
*/
|
||||||
|
static
|
||||||
|
ompi_btl_usnic_send_frag_t *
|
||||||
|
prepare_src_large(
|
||||||
|
struct ompi_btl_usnic_module_t* module,
|
||||||
|
struct mca_btl_base_endpoint_t* endpoint,
|
||||||
|
struct mca_mpool_base_registration_t* registration,
|
||||||
|
struct opal_convertor_t* convertor,
|
||||||
|
uint8_t order,
|
||||||
|
size_t reserve,
|
||||||
|
size_t* size,
|
||||||
|
uint32_t flags)
|
||||||
|
{
|
||||||
|
ompi_btl_usnic_send_frag_t *frag;
|
||||||
|
ompi_btl_usnic_large_send_frag_t *lfrag;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
/* Get holder for the msg */
|
||||||
|
lfrag = ompi_btl_usnic_large_send_frag_alloc(module);
|
||||||
|
if (OPAL_UNLIKELY(NULL == lfrag)) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
frag = &lfrag->lsf_base;
|
||||||
|
|
||||||
|
/* The header location goes in SG[0], payload in SG[1]. If we are using a
|
||||||
|
* convertor then SG[1].seg_len is accurate but seg_addr is NULL. */
|
||||||
|
frag->sf_base.uf_base.des_src_cnt = 2;
|
||||||
|
|
||||||
|
/* stash header location, PML will write here */
|
||||||
|
frag->sf_base.uf_src_seg[0].seg_addr.pval = &lfrag->lsf_ompi_header;
|
||||||
|
frag->sf_base.uf_src_seg[0].seg_len = reserve;
|
||||||
|
/* make sure upper header small enough */
|
||||||
|
assert(reserve <= sizeof(lfrag->lsf_ompi_header));
|
||||||
|
|
||||||
|
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
|
||||||
|
/* threshold == -1 means always pack eagerly */
|
||||||
|
if (mca_btl_usnic_component.pack_lazy_threshold >= 0 &&
|
||||||
|
*size >= (size_t)mca_btl_usnic_component.pack_lazy_threshold) {
|
||||||
|
MSGDEBUG1_OUT("packing frag %p on the fly", (void *)frag);
|
||||||
|
lfrag->lsf_pack_on_the_fly = true;
|
||||||
|
|
||||||
|
/* tell the PML we will absorb as much as possible while still
|
||||||
|
* respecting indivisible element boundaries in the convertor */
|
||||||
|
*size = ompi_btl_usnic_convertor_pack_peek(convertor, *size);
|
||||||
|
|
||||||
|
/* Clone the convertor b/c we (the BTL) don't own it and the PML
|
||||||
|
* might mutate it after we return from this function. */
|
||||||
|
rc = opal_convertor_clone(convertor, &frag->sf_convertor,
|
||||||
|
/*copy_stack=*/true);
|
||||||
|
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
|
||||||
|
BTL_ERROR(("unexpected convertor clone error"));
|
||||||
|
abort(); /* XXX */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* pack everything in the convertor into a chain of segments now,
|
||||||
|
* leaving space for the PML header in the first segment */
|
||||||
|
lfrag->lsf_base.sf_base.uf_src_seg[0].seg_addr.pval =
|
||||||
|
pack_chunk_seg_chain_with_reserve(module, lfrag, reserve,
|
||||||
|
convertor, *size, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* We set SG[1] to {NULL,bytes_packed} so that various calculations
|
||||||
|
* by both PML and this BTL will be correct. For example, the PML adds
|
||||||
|
* up the bytes in the descriptor segments to determine if an MPI-level
|
||||||
|
* request is complete or not. */
|
||||||
|
frag->sf_base.uf_src_seg[1].seg_addr.pval = NULL;
|
||||||
|
frag->sf_base.uf_src_seg[1].seg_len = *size;
|
||||||
|
} else {
|
||||||
|
/* convertor not needed, just save the payload pointer in SG[1] */
|
||||||
|
lfrag->lsf_pack_on_the_fly = true;
|
||||||
|
opal_convertor_get_current_pointer(convertor,
|
||||||
|
&frag->sf_base.uf_src_seg[1].seg_addr.pval);
|
||||||
|
frag->sf_base.uf_src_seg[1].seg_len = *size;
|
||||||
|
}
|
||||||
|
|
||||||
|
frag->sf_base.uf_base.des_flags = flags;
|
||||||
|
frag->sf_endpoint = endpoint;
|
||||||
|
|
||||||
|
return frag;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Pack data and return a descriptor that can be used for send (or
|
* Note the "user" data the PML wishes to communicate and return a descriptor
|
||||||
* put, but we don't do that here in usnic).
|
* that can be used for send or put. We create a frag (which is also a
|
||||||
* Four different cases to handle:
|
* descriptor by virtue of its base class) and populate it with enough
|
||||||
* large vs small, small means fits into a single segment
|
* source information to complete a future send/put.
|
||||||
* convertor or not, if convertor we need to copy the data, non-convertor
|
|
||||||
* we will leave data in place
|
|
||||||
*
|
*
|
||||||
* small,convertor: copy the data into the segment associated with small frag,
|
* We will create a small send frag if it fits within an MTU, otherwise a large
|
||||||
* caller will put header in this seg, single entry in desc SG
|
* send frag. The convertor will be saved for deferred packing if the user
|
||||||
* small,no convertor: caller will put header in attached segment SG[0],
|
* buffer is noncontiguous. Otherwise the buffer address will be saved in one of the
|
||||||
* save pointer to user data in SG[1], 2 SG entries
|
* descriptor's SGEs.
|
||||||
* large,convertor: copy data into chain of segments, leaving room for
|
|
||||||
* caller header at start of 1st segment, 2 SG entries
|
|
||||||
* large,not convertor: caller will put header in buffer in the large frag itself,
|
|
||||||
* save pointer to user data in SG[1]. 2 SG entries
|
|
||||||
*
|
*
|
||||||
* NOTE that the *only* reason this routine is allowed to return a size smaller
|
* NOTE that the *only* reason this routine is allowed to return a size smaller
|
||||||
* than was requested is if the convertor cannot process the entire amount.
|
* than was requested is if the convertor cannot process the entire amount.
|
||||||
@ -358,13 +709,8 @@ usnic_prepare_src(
|
|||||||
uint32_t flags)
|
uint32_t flags)
|
||||||
{
|
{
|
||||||
ompi_btl_usnic_module_t *module = (ompi_btl_usnic_module_t*) base_module;
|
ompi_btl_usnic_module_t *module = (ompi_btl_usnic_module_t*) base_module;
|
||||||
mca_btl_base_descriptor_t *desc;
|
|
||||||
ompi_btl_usnic_send_frag_t *frag;
|
ompi_btl_usnic_send_frag_t *frag;
|
||||||
uint32_t payload_len;
|
uint32_t payload_len;
|
||||||
struct iovec iov;
|
|
||||||
uint32_t iov_count;
|
|
||||||
size_t max_data;
|
|
||||||
int rc;
|
|
||||||
#if MSGDEBUG2
|
#if MSGDEBUG2
|
||||||
size_t osize = *size;
|
size_t osize = *size;
|
||||||
#endif
|
#endif
|
||||||
@ -374,153 +720,23 @@ usnic_prepare_src(
|
|||||||
*/
|
*/
|
||||||
payload_len = *size + reserve;
|
payload_len = *size + reserve;
|
||||||
if (payload_len <= module->max_frag_payload) {
|
if (payload_len <= module->max_frag_payload) {
|
||||||
ompi_btl_usnic_small_send_frag_t *sfrag;
|
frag = prepare_src_small(module, endpoint, registration, convertor,
|
||||||
|
order, reserve, size, flags);
|
||||||
/* Get holder for the msg */
|
|
||||||
sfrag = ompi_btl_usnic_small_send_frag_alloc(module);
|
|
||||||
if (OPAL_UNLIKELY(NULL == sfrag)) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
frag = &sfrag->ssf_base;
|
|
||||||
|
|
||||||
/* In the case of a convertor, we will copy the data in now, since
|
|
||||||
* that is the only way to discover how much we can actually send
|
|
||||||
* The reason we do not copy non-convertor pointer at this point is
|
|
||||||
* because we might still use INLINE for the send, and in that case
|
|
||||||
* we do not want to copy the data at all.
|
|
||||||
*/
|
|
||||||
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
|
|
||||||
|
|
||||||
/* put user data just after end of 1st seg (upper layer header) */
|
|
||||||
if (payload_len > module->max_frag_payload) {
|
|
||||||
payload_len = module->max_frag_payload;
|
|
||||||
}
|
|
||||||
iov.iov_len = payload_len - reserve;
|
|
||||||
iov.iov_base = (IOVBASE_TYPE*)
|
|
||||||
(frag->sf_base.uf_src_seg[0].seg_addr.lval + reserve);
|
|
||||||
iov_count = 1;
|
|
||||||
max_data = iov.iov_len;
|
|
||||||
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
|
||||||
if (OPAL_UNLIKELY(rc < 0)) {
|
|
||||||
ompi_btl_usnic_send_frag_return_cond(module, frag);
|
|
||||||
BTL_ERROR(("small convertor error"));
|
|
||||||
abort(); /* XXX */
|
|
||||||
}
|
|
||||||
*size = max_data;
|
|
||||||
payload_len = max_data + reserve;
|
|
||||||
sfrag->ssf_base.sf_convertor = convertor;
|
|
||||||
frag->sf_base.uf_base.des_src_cnt = 1;
|
|
||||||
frag->sf_base.uf_src_seg[0].seg_len = payload_len;
|
|
||||||
} else {
|
} else {
|
||||||
opal_convertor_get_current_pointer(convertor,
|
frag = prepare_src_large(module, endpoint, registration, convertor,
|
||||||
&sfrag->ssf_base.sf_base.uf_src_seg[1].seg_addr.pval);
|
order, reserve, size, flags);
|
||||||
sfrag->ssf_base.sf_convertor = NULL;
|
|
||||||
frag->sf_base.uf_base.des_src_cnt = 2;
|
|
||||||
frag->sf_base.uf_src_seg[0].seg_len = reserve;
|
|
||||||
frag->sf_base.uf_src_seg[1].seg_len = *size;
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
ompi_btl_usnic_large_send_frag_t *lfrag;
|
|
||||||
|
|
||||||
/* Get holder for the msg */
|
|
||||||
lfrag = ompi_btl_usnic_large_send_frag_alloc(module);
|
|
||||||
if (OPAL_UNLIKELY(NULL == lfrag)) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
frag = &lfrag->lsf_base;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If a convertor is required, pack the data into a chain of segments.
|
|
||||||
* We will later send from the segments one at a time. This allows
|
|
||||||
* us to absorb a large convertor-based send and still give an accurate
|
|
||||||
* data count back to the upper layer
|
|
||||||
*/
|
|
||||||
if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) {
|
|
||||||
ompi_btl_usnic_chunk_segment_t *seg;
|
|
||||||
unsigned ompi_hdr_len;
|
|
||||||
unsigned bytes_to_pack;
|
|
||||||
|
|
||||||
ompi_hdr_len = reserve;
|
|
||||||
bytes_to_pack = *size;
|
|
||||||
while (bytes_to_pack > 0) {
|
|
||||||
seg = ompi_btl_usnic_chunk_segment_alloc(module);
|
|
||||||
if (OPAL_UNLIKELY(NULL == seg)) {
|
|
||||||
BTL_ERROR(("large convertor segment allocation error"));
|
|
||||||
abort(); /* XXX */
|
|
||||||
}
|
|
||||||
|
|
||||||
/* put user data just after end of 1st seg (upper header) */
|
|
||||||
payload_len = ompi_hdr_len + bytes_to_pack;
|
|
||||||
if (payload_len > module->max_chunk_payload) {
|
|
||||||
payload_len = module->max_chunk_payload;
|
|
||||||
}
|
|
||||||
iov.iov_len = payload_len - ompi_hdr_len;
|
|
||||||
iov.iov_base = (IOVBASE_TYPE*)
|
|
||||||
(seg->ss_base.us_payload.raw + ompi_hdr_len);
|
|
||||||
iov_count = 1;
|
|
||||||
max_data = iov.iov_len;
|
|
||||||
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data);
|
|
||||||
if (OPAL_UNLIKELY(rc < 0)) {
|
|
||||||
ompi_btl_usnic_send_frag_return_cond(module, frag);
|
|
||||||
BTL_ERROR(("large convertor error"));
|
|
||||||
abort(); /* XXX */
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If unable to pack any of the remaining bytes, release the
|
|
||||||
* most recently allocated segment and finish processing.
|
|
||||||
*/
|
|
||||||
if (max_data == 0) {
|
|
||||||
ompi_btl_usnic_chunk_segment_return(module, seg);
|
|
||||||
*size -= bytes_to_pack;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* append segment of data to chain to send */
|
|
||||||
opal_list_append(&lfrag->lsf_seg_chain,
|
|
||||||
&seg->ss_base.us_list.super);
|
|
||||||
seg->ss_parent_frag = &lfrag->lsf_base;
|
|
||||||
seg->ss_base.us_sg_entry[0].length = max_data + ompi_hdr_len;
|
|
||||||
|
|
||||||
ompi_hdr_len = 0;
|
|
||||||
bytes_to_pack -= max_data;
|
|
||||||
}
|
|
||||||
payload_len = *size + reserve;
|
|
||||||
|
|
||||||
seg = (ompi_btl_usnic_chunk_segment_t *)
|
|
||||||
opal_list_get_first(&lfrag->lsf_seg_chain);
|
|
||||||
lfrag->lsf_base.sf_base.uf_src_seg[0].seg_addr.pval =
|
|
||||||
seg->ss_base.us_payload.raw;
|
|
||||||
|
|
||||||
lfrag->lsf_base.sf_convertor = convertor;
|
|
||||||
} else {
|
|
||||||
opal_convertor_get_current_pointer(convertor,
|
|
||||||
&lfrag->lsf_base.sf_base.uf_src_seg[1].seg_addr.pval);
|
|
||||||
lfrag->lsf_base.sf_convertor = NULL;
|
|
||||||
lfrag->lsf_base.sf_base.uf_src_seg[0].seg_addr.pval =
|
|
||||||
&lfrag->lsf_ompi_header;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* make sure upper header small enough */
|
|
||||||
assert(reserve < sizeof(lfrag->lsf_ompi_header));
|
|
||||||
|
|
||||||
frag->sf_base.uf_base.des_src_cnt = 2;
|
|
||||||
frag->sf_base.uf_src_seg[0].seg_len = reserve;
|
|
||||||
frag->sf_base.uf_src_seg[1].seg_len = *size;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* set up common parts of frag */
|
|
||||||
frag->sf_base.uf_base.des_flags = flags;
|
|
||||||
frag->sf_endpoint = endpoint;
|
|
||||||
|
|
||||||
desc = &frag->sf_base.uf_base;
|
|
||||||
|
|
||||||
#if MSGDEBUG2
|
#if MSGDEBUG2
|
||||||
opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u)\n",
|
opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n",
|
||||||
module->device->name,
|
module->device->name,
|
||||||
payload_len <= module->max_frag_payload?"small":"large",
|
(reserve + *size) <= module->max_frag_payload?"small":"large",
|
||||||
(void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize);
|
(void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize,
|
||||||
|
(void *)convertor);
|
||||||
#if MSGDEBUG1
|
#if MSGDEBUG1
|
||||||
{ unsigned i;
|
{
|
||||||
|
unsigned i;
|
||||||
|
mca_btl_base_descriptor_t *desc = &frag->sf_base.uf_base;
|
||||||
for (i=0; i<desc->des_src_cnt; ++i) {
|
for (i=0; i<desc->des_src_cnt; ++i) {
|
||||||
opal_output(0, " %d: ptr:%p len:%d\n", i,
|
opal_output(0, " %d: ptr:%p len:%d\n", i,
|
||||||
(void *)desc->des_src[i].seg_addr.pval,
|
(void *)desc->des_src[i].seg_addr.pval,
|
||||||
@ -530,7 +746,7 @@ usnic_prepare_src(
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return desc;
|
return &frag->sf_base.uf_base;
|
||||||
}
|
}
|
||||||
|
|
||||||
static mca_btl_base_descriptor_t*
|
static mca_btl_base_descriptor_t*
|
||||||
@ -586,19 +802,12 @@ usnic_put(
|
|||||||
struct mca_btl_base_endpoint_t *endpoint,
|
struct mca_btl_base_endpoint_t *endpoint,
|
||||||
struct mca_btl_base_descriptor_t *desc)
|
struct mca_btl_base_descriptor_t *desc)
|
||||||
{
|
{
|
||||||
|
int rc;
|
||||||
ompi_btl_usnic_send_frag_t *frag;
|
ompi_btl_usnic_send_frag_t *frag;
|
||||||
ompi_btl_usnic_send_segment_t *sseg;
|
|
||||||
|
|
||||||
frag = (ompi_btl_usnic_send_frag_t *)desc;
|
frag = (ompi_btl_usnic_send_frag_t *)desc;
|
||||||
|
|
||||||
/*
|
compute_sf_size(frag);
|
||||||
* Our descriptors are always either 1 or 2 segments.
|
|
||||||
* We always clear these lengths when the fragment is freed
|
|
||||||
* and only fill in what's needed in either prepare_src or usnic_alloc,
|
|
||||||
* so the total fragment length is always the sum of the 2 lengths.
|
|
||||||
*/
|
|
||||||
frag->sf_size = frag->sf_base.uf_src_seg[0].seg_len +
|
|
||||||
frag->sf_base.uf_src_seg[1].seg_len;
|
|
||||||
frag->sf_ack_bytes_left = frag->sf_size;
|
frag->sf_ack_bytes_left = frag->sf_size;
|
||||||
|
|
||||||
#if MSGDEBUG2
|
#if MSGDEBUG2
|
||||||
@ -625,64 +834,12 @@ usnic_put(
|
|||||||
/* copy out address - why does he not use our provided holder? */
|
/* copy out address - why does he not use our provided holder? */
|
||||||
frag->sf_base.uf_dst_seg[0].seg_addr.pval = desc->des_dst->seg_addr.pval;
|
frag->sf_base.uf_dst_seg[0].seg_addr.pval = desc->des_dst->seg_addr.pval;
|
||||||
|
|
||||||
/*
|
rc = ompi_btl_usnic_finish_put_or_send((ompi_btl_usnic_module_t *)btl,
|
||||||
* If this is small, need to do the copyin now.
|
(ompi_btl_usnic_endpoint_t *)endpoint,
|
||||||
* We don't do this earlier in case we got lucky and were
|
frag,
|
||||||
* able to do an inline send. We did not, so here we are...
|
/*tag=*/MCA_BTL_NO_ORDER);
|
||||||
*/
|
|
||||||
if (OMPI_BTL_USNIC_FRAG_SMALL_SEND == frag->sf_base.uf_type) {
|
|
||||||
ompi_btl_usnic_small_send_frag_t *sfrag;
|
|
||||||
|
|
||||||
sfrag = (ompi_btl_usnic_small_send_frag_t *)frag;
|
return rc;
|
||||||
sseg = &sfrag->ssf_segment;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* copy in user data if there is any, collapsing 2 segments into 1
|
|
||||||
*/
|
|
||||||
if (frag->sf_base.uf_base.des_src_cnt > 1) {
|
|
||||||
|
|
||||||
/* If not convertor, copy now. Already copied in convertor case */
|
|
||||||
if (frag->sf_convertor == NULL) {
|
|
||||||
memcpy(((char *)frag->sf_base.uf_src_seg[0].seg_addr.lval +
|
|
||||||
frag->sf_base.uf_src_seg[0].seg_len),
|
|
||||||
frag->sf_base.uf_src_seg[1].seg_addr.pval,
|
|
||||||
frag->sf_base.uf_src_seg[1].seg_len);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/* update 1st segment length */
|
|
||||||
frag->sf_base.uf_base.des_src_cnt = 1;
|
|
||||||
frag->sf_base.uf_src_seg[0].seg_len +=
|
|
||||||
frag->sf_base.uf_src_seg[1].seg_len;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* set up VERBS SG list */
|
|
||||||
sseg->ss_base.us_sg_entry[0].length =
|
|
||||||
sizeof(ompi_btl_usnic_btl_header_t) +
|
|
||||||
frag->sf_base.uf_base.des_src[0].seg_len;
|
|
||||||
|
|
||||||
/* use standard channel */
|
|
||||||
sseg->ss_channel = USNIC_DATA_CHANNEL;
|
|
||||||
} else {
|
|
||||||
ompi_btl_usnic_large_send_frag_t *lfrag;
|
|
||||||
unsigned i;
|
|
||||||
|
|
||||||
lfrag = (ompi_btl_usnic_large_send_frag_t *)frag;
|
|
||||||
assert(OMPI_BTL_USNIC_FRAG_LARGE_SEND == frag->sf_base.uf_type);
|
|
||||||
|
|
||||||
/* Save info about the frag */
|
|
||||||
lfrag->lsf_cur_offset = 0;
|
|
||||||
lfrag->lsf_cur_ptr = desc->des_src[0].seg_addr.pval;
|
|
||||||
lfrag->lsf_cur_sge = 0;
|
|
||||||
lfrag->lsf_bytes_left_in_sge = desc->des_src[0].seg_len;
|
|
||||||
lfrag->lsf_bytes_left = desc->des_src[0].seg_len;
|
|
||||||
for (i=1; i<desc->des_src_cnt; ++i) {
|
|
||||||
lfrag->lsf_bytes_left += desc->des_src[i].seg_len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ompi_btl_usnic_endpoint_enqueue_frag(endpoint, frag);
|
|
||||||
return OMPI_SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int usnic_finalize(struct mca_btl_base_module_t* btl)
|
static int usnic_finalize(struct mca_btl_base_module_t* btl)
|
||||||
@ -841,6 +998,12 @@ usnic_do_resends(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Given a large send frag (which is at the head of the given endpoint's send
|
||||||
|
* queue), generate a new segment, fill it with data, and
|
||||||
|
* endpoint_send_segment() it. Takes care of subsequent frag
|
||||||
|
* cleanup/bookkeeping (dequeue, descriptor callback, etc.) if this frag was
|
||||||
|
* completed by this segment.
|
||||||
|
*/
|
||||||
static void
|
static void
|
||||||
usnic_handle_large_send(
|
usnic_handle_large_send(
|
||||||
ompi_btl_usnic_module_t *module,
|
ompi_btl_usnic_module_t *module,
|
||||||
@ -850,83 +1013,45 @@ usnic_handle_large_send(
|
|||||||
ompi_btl_usnic_large_send_frag_t *lfrag;
|
ompi_btl_usnic_large_send_frag_t *lfrag;
|
||||||
ompi_btl_usnic_btl_chunk_header_t *chp;
|
ompi_btl_usnic_btl_chunk_header_t *chp;
|
||||||
ompi_btl_usnic_send_segment_t *sseg;
|
ompi_btl_usnic_send_segment_t *sseg;
|
||||||
size_t space;
|
mca_btl_base_descriptor_t *desc;
|
||||||
size_t copylen;
|
|
||||||
uint8_t *copyptr;
|
|
||||||
size_t payload_len;
|
size_t payload_len;
|
||||||
|
|
||||||
|
desc = &frag->sf_base.uf_base;
|
||||||
|
|
||||||
|
assert(frag->sf_base.uf_type == OMPI_BTL_USNIC_FRAG_LARGE_SEND);
|
||||||
lfrag = (ompi_btl_usnic_large_send_frag_t *)frag;
|
lfrag = (ompi_btl_usnic_large_send_frag_t *)frag;
|
||||||
if (lfrag->lsf_cur_offset == 0) {
|
if (lfrag->lsf_cur_offset == 0) {
|
||||||
|
|
||||||
/* assign a fragment ID */
|
/* assign a fragment ID */
|
||||||
do {
|
do {
|
||||||
lfrag->lsf_frag_id = endpoint->endpoint_next_frag_id++;
|
lfrag->lsf_frag_id = endpoint->endpoint_next_frag_id++;
|
||||||
} while (lfrag->lsf_frag_id == 0);
|
} while (lfrag->lsf_frag_id == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (OPAL_LIKELY(lfrag->lsf_base.sf_convertor == NULL)) {
|
if (lfrag->lsf_pack_on_the_fly) {
|
||||||
|
assert(opal_list_is_empty(&lfrag->lsf_seg_chain));
|
||||||
|
|
||||||
sseg = ompi_btl_usnic_chunk_segment_alloc(module);
|
/* just pack a single chunk segment and put it on the list */
|
||||||
if (OPAL_UNLIKELY(NULL == sseg)) {
|
sseg = pack_chunk_seg_from_frag(module, lfrag);
|
||||||
/* XXX do something better here */
|
|
||||||
BTL_ERROR(("error alloc seg for large send\n"));
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* save back pointer to fragment */
|
|
||||||
sseg->ss_parent_frag = frag;
|
|
||||||
|
|
||||||
/* keep copying in as long as we have space and there is data
|
|
||||||
* to be copied.
|
|
||||||
*/
|
|
||||||
space = module->max_chunk_payload;
|
|
||||||
copyptr = sseg->ss_base.us_payload.raw;
|
|
||||||
payload_len = 0;
|
|
||||||
while (space > 0 && lfrag->lsf_bytes_left > 0) {
|
|
||||||
if (space > lfrag->lsf_bytes_left_in_sge) {
|
|
||||||
copylen = lfrag->lsf_bytes_left_in_sge;
|
|
||||||
} else {
|
|
||||||
copylen = space;
|
|
||||||
}
|
|
||||||
|
|
||||||
memcpy(copyptr, lfrag->lsf_cur_ptr, copylen);
|
|
||||||
space -= copylen;
|
|
||||||
copyptr += copylen;
|
|
||||||
lfrag->lsf_bytes_left_in_sge -= copylen;
|
|
||||||
lfrag->lsf_bytes_left -= copylen;
|
|
||||||
if (lfrag->lsf_bytes_left_in_sge > 0) {
|
|
||||||
lfrag->lsf_cur_ptr += copylen;
|
|
||||||
} else {
|
|
||||||
++lfrag->lsf_cur_sge;
|
|
||||||
lfrag->lsf_cur_ptr =
|
|
||||||
lfrag->lsf_base.sf_base.uf_base.des_src[lfrag->lsf_cur_sge].seg_addr.pval;
|
|
||||||
lfrag->lsf_bytes_left_in_sge =
|
|
||||||
lfrag->lsf_base.sf_base.uf_base.des_src[lfrag->lsf_cur_sge].seg_len;
|
|
||||||
}
|
|
||||||
payload_len += copylen;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* set actual packet length for verbs */
|
|
||||||
assert(1 == sseg->ss_send_desc.num_sge); /* chunk invariant */
|
|
||||||
sseg->ss_base.us_sg_entry[0].length =
|
|
||||||
sizeof(ompi_btl_usnic_btl_chunk_header_t) + payload_len;
|
|
||||||
|
|
||||||
/* We are sending converted data, which means we have a list of segments
|
|
||||||
* containing the data. upper layer header is already in first segment
|
|
||||||
*/
|
|
||||||
} else {
|
} else {
|
||||||
|
/* data was pre-packed in prepare_src */
|
||||||
sseg = (ompi_btl_usnic_send_segment_t *)
|
sseg = (ompi_btl_usnic_send_segment_t *)
|
||||||
opal_list_remove_first(&lfrag->lsf_seg_chain);
|
opal_list_remove_first(&lfrag->lsf_seg_chain);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(NULL != sseg);
|
||||||
payload_len = sseg->ss_base.us_sg_entry[0].length;
|
payload_len = sseg->ss_base.us_sg_entry[0].length;
|
||||||
|
|
||||||
|
assert(payload_len > 0); /* must have made progress */
|
||||||
|
assert(payload_len <= module->max_chunk_payload);
|
||||||
|
assert(lfrag->lsf_bytes_left >= payload_len);
|
||||||
|
|
||||||
/* set actual packet length for verbs */
|
/* set actual packet length for verbs */
|
||||||
assert(1 == sseg->ss_send_desc.num_sge); /* chunk invariant */
|
assert(1 == sseg->ss_send_desc.num_sge); /* chunk invariant */
|
||||||
sseg->ss_base.us_sg_entry[0].length =
|
sseg->ss_base.us_sg_entry[0].length =
|
||||||
sizeof(ompi_btl_usnic_btl_chunk_header_t) + payload_len;
|
sizeof(ompi_btl_usnic_btl_chunk_header_t) + payload_len;
|
||||||
lfrag->lsf_bytes_left -= payload_len;
|
lfrag->lsf_bytes_left -= payload_len;
|
||||||
}
|
|
||||||
|
|
||||||
/* fill in BTL header with frag info */
|
/* fill in the chunk's BTL header with frag info */
|
||||||
chp = sseg->ss_base.us_btl_chunk_header;
|
chp = sseg->ss_base.us_btl_chunk_header;
|
||||||
chp->ch_frag_id = lfrag->lsf_frag_id;
|
chp->ch_frag_id = lfrag->lsf_frag_id;
|
||||||
chp->ch_frag_size = lfrag->lsf_base.sf_size;
|
chp->ch_frag_size = lfrag->lsf_base.sf_size;
|
||||||
@ -943,8 +1068,9 @@ usnic_handle_large_send(
|
|||||||
lfrag->lsf_cur_offset += payload_len;
|
lfrag->lsf_cur_offset += payload_len;
|
||||||
|
|
||||||
#if MSGDEBUG1
|
#if MSGDEBUG1
|
||||||
opal_output(0, "payload_len = %zd, bytes_left=%zd\n",
|
opal_output(0, "%s: payload_len=%zd, bytes_left=%zd on_the_fly=%s\n",
|
||||||
payload_len, lfrag->lsf_bytes_left);
|
__func__, payload_len, lfrag->lsf_bytes_left,
|
||||||
|
lfrag->lsf_pack_on_the_fly?"true":"false");
|
||||||
#endif
|
#endif
|
||||||
/* done with fragment? */
|
/* done with fragment? */
|
||||||
if (lfrag->lsf_bytes_left == 0) {
|
if (lfrag->lsf_bytes_left == 0) {
|
||||||
@ -1116,7 +1242,7 @@ ompi_btl_usnic_module_progress_sends(
|
|||||||
*
|
*
|
||||||
* If this is a send from a fragment we own, and we know we have copied the
|
* If this is a send from a fragment we own, and we know we have copied the
|
||||||
* data from the user's buffer, we can perform the callback immediately
|
* data from the user's buffer, we can perform the callback immediately
|
||||||
* (or possibly not at all, simply returning "1" to indicate completion.
|
* (or possibly not at all, simply returning "1" to indicate completion).
|
||||||
*
|
*
|
||||||
* If this is a send from a fragment we own and we have not yet copied out
|
* If this is a send from a fragment we own and we have not yet copied out
|
||||||
* all the data (as is the case in a large send) then we defer the callback
|
* all the data (as is the case in a large send) then we defer the callback
|
||||||
@ -1144,14 +1270,7 @@ usnic_send(
|
|||||||
assert(frag->sf_endpoint == endpoint);
|
assert(frag->sf_endpoint == endpoint);
|
||||||
frag->sf_base.uf_dst_seg[0].seg_addr.pval = NULL; /* not a PUT */
|
frag->sf_base.uf_dst_seg[0].seg_addr.pval = NULL; /* not a PUT */
|
||||||
|
|
||||||
/*
|
compute_sf_size(frag);
|
||||||
* Our descriptors are always either 1 or 2 segments.
|
|
||||||
* We always clear these lengths when the fragment is freed
|
|
||||||
* and only fill in what's needed in either prepare_src or usnic_alloc,
|
|
||||||
* so the total fragment length is always the sum of the 2 lengths.
|
|
||||||
*/
|
|
||||||
frag->sf_size = frag->sf_base.uf_src_seg[0].seg_len +
|
|
||||||
frag->sf_base.uf_src_seg[1].seg_len;
|
|
||||||
frag->sf_ack_bytes_left = frag->sf_size;
|
frag->sf_ack_bytes_left = frag->sf_size;
|
||||||
|
|
||||||
#if MSGDEBUG2
|
#if MSGDEBUG2
|
||||||
@ -1160,6 +1279,7 @@ usnic_send(
|
|||||||
tag, (int)frag->sf_size);
|
tag, (int)frag->sf_size);
|
||||||
#if MSGDEBUG1
|
#if MSGDEBUG1
|
||||||
{ unsigned i;
|
{ unsigned i;
|
||||||
|
opal_output(0, " descriptor->des_flags=0x%x\n", descriptor->des_flags);
|
||||||
for (i=0; i<descriptor->des_src_cnt; ++i) {
|
for (i=0; i<descriptor->des_src_cnt; ++i) {
|
||||||
opal_output(0, " %d: ptr:%p len:%d\n", i,
|
opal_output(0, " %d: ptr:%p len:%d\n", i,
|
||||||
descriptor->des_src[i].seg_addr.pval,
|
descriptor->des_src[i].seg_addr.pval,
|
||||||
@ -1208,7 +1328,7 @@ usnic_send(
|
|||||||
sseg->ss_channel = USNIC_PRIORITY_CHANNEL;
|
sseg->ss_channel = USNIC_PRIORITY_CHANNEL;
|
||||||
sseg->ss_base.us_btl_header->tag = tag;
|
sseg->ss_base.us_btl_header->tag = tag;
|
||||||
#if MSGDEBUG1
|
#if MSGDEBUG1
|
||||||
opal_output(0, "INLINE send, conv=%p", (void *)frag->sf_convertor);
|
opal_output(0, "INLINE send, sseg=%p", (void *)sseg);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* post the segment now */
|
/* post the segment now */
|
||||||
@ -1242,17 +1362,18 @@ usnic_send(
|
|||||||
rc = 0;
|
rc = 0;
|
||||||
} else {
|
} else {
|
||||||
#if MSGDEBUG1
|
#if MSGDEBUG1
|
||||||
opal_output(0, "skipping callback for frag %p\n", (void *)frag);
|
opal_output(0, "skipping callback for frag %p, returning 1\n", (void *)frag);
|
||||||
#endif
|
#endif
|
||||||
rc = 1;
|
rc = 1;
|
||||||
++module->stats.pml_send_callbacks; /* returning "1" is an implicit CB */
|
++module->stats.pml_send_callbacks; /* returning "1" is an implicit CB */
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
#if MSGDEBUG1
|
||||||
|
opal_output(0, "don't own descriptor, defer callback for frag %p\n", (void *)frag);
|
||||||
|
#endif
|
||||||
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
descriptor->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||||
rc = 0;
|
rc = 0;
|
||||||
}
|
}
|
||||||
++module->stats.pml_module_sends;
|
|
||||||
return rc;
|
|
||||||
} else {
|
} else {
|
||||||
/*
|
/*
|
||||||
* We move this off to another function because having it inside
|
* We move this off to another function because having it inside
|
||||||
@ -1261,8 +1382,14 @@ usnic_send(
|
|||||||
* another file entirely, else the compiler tried to be helpful
|
* another file entirely, else the compiler tried to be helpful
|
||||||
* and inline all by itself.
|
* and inline all by itself.
|
||||||
*/
|
*/
|
||||||
return ompi_btl_usnic_send_slower(module, endpoint, frag, tag);
|
rc = ompi_btl_usnic_finish_put_or_send(module, endpoint, frag, tag);
|
||||||
|
/* FIXME can we clarify flag set/clear ordering? */
|
||||||
|
frag->sf_base.uf_base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
++module->stats.pml_module_sends;
|
||||||
|
|
||||||
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
@ -40,6 +40,7 @@
|
|||||||
#define USNIC_DFLT_EAGER_LIMIT_1DEVICE (150 * 1024)
|
#define USNIC_DFLT_EAGER_LIMIT_1DEVICE (150 * 1024)
|
||||||
#define USNIC_DFLT_EAGER_LIMIT_NDEVICES (25 * 1024)
|
#define USNIC_DFLT_EAGER_LIMIT_NDEVICES (25 * 1024)
|
||||||
#define USNIC_DFLT_RNDV_EAGER_LIMIT 500
|
#define USNIC_DFLT_RNDV_EAGER_LIMIT 500
|
||||||
|
#define USNIC_DFLT_PACK_LAZY_THRESHOLD (16 * 1024)
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
|
@ -98,13 +98,16 @@ ompi_btl_usnic_chunk_send_complete(ompi_btl_usnic_module_t *module,
|
|||||||
ompi_btl_usnic_check_rts(frag->sf_endpoint);
|
ompi_btl_usnic_check_rts(frag->sf_endpoint);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* Responsible for completing non-fastpath parts of a put or send operation,
|
||||||
* This routine handles the non-fastpath part of usnic_send().
|
* including initializing any large frag bookkeeping fields and enqueuing the
|
||||||
* The reason it is here is to prevent it getting inlined with
|
* frag on the endpoint.
|
||||||
* the rest of the function.
|
*
|
||||||
*/
|
* This routine lives in this file to help prevent automatic inlining by the
|
||||||
|
* compiler.
|
||||||
|
*
|
||||||
|
* The "tag" only applies to sends. */
|
||||||
int
|
int
|
||||||
ompi_btl_usnic_send_slower(
|
ompi_btl_usnic_finish_put_or_send(
|
||||||
ompi_btl_usnic_module_t *module,
|
ompi_btl_usnic_module_t *module,
|
||||||
ompi_btl_usnic_endpoint_t *endpoint,
|
ompi_btl_usnic_endpoint_t *endpoint,
|
||||||
ompi_btl_usnic_send_frag_t *frag,
|
ompi_btl_usnic_send_frag_t *frag,
|
||||||
@ -124,27 +127,25 @@ ompi_btl_usnic_send_slower(
|
|||||||
sfrag = (ompi_btl_usnic_small_send_frag_t *)frag;
|
sfrag = (ompi_btl_usnic_small_send_frag_t *)frag;
|
||||||
sseg = &sfrag->ssf_segment;
|
sseg = &sfrag->ssf_segment;
|
||||||
|
|
||||||
/*
|
/* Copy in user data if there is any, collapsing 2 segments into 1.
|
||||||
* copy in user data if there is any, collapsing 2 segments into 1
|
* We already packed via the convertor if necessary, so we only need to
|
||||||
|
* handle the simple memcpy case here.
|
||||||
*/
|
*/
|
||||||
if (frag->sf_base.uf_base.des_src_cnt > 1) {
|
if (frag->sf_base.uf_base.des_src_cnt > 1) {
|
||||||
|
/* no convertor */
|
||||||
|
assert(NULL != frag->sf_base.uf_src_seg[1].seg_addr.pval);
|
||||||
|
|
||||||
/* If not convertor, copy now. Already copied in convertor case */
|
|
||||||
if (frag->sf_convertor == NULL) {
|
|
||||||
memcpy(((char *)frag->sf_base.uf_src_seg[0].seg_addr.lval +
|
memcpy(((char *)frag->sf_base.uf_src_seg[0].seg_addr.lval +
|
||||||
frag->sf_base.uf_src_seg[0].seg_len),
|
frag->sf_base.uf_src_seg[0].seg_len),
|
||||||
frag->sf_base.uf_src_seg[1].seg_addr.pval,
|
frag->sf_base.uf_src_seg[1].seg_addr.pval,
|
||||||
frag->sf_base.uf_src_seg[1].seg_len);
|
frag->sf_base.uf_src_seg[1].seg_len);
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/* update 1st segment length */
|
/* update 1st segment length */
|
||||||
frag->sf_base.uf_base.des_src_cnt = 1;
|
frag->sf_base.uf_base.des_src_cnt = 1;
|
||||||
frag->sf_base.uf_src_seg[0].seg_len +=
|
frag->sf_base.uf_src_seg[0].seg_len +=
|
||||||
frag->sf_base.uf_src_seg[1].seg_len;
|
frag->sf_base.uf_src_seg[1].seg_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* set up VERBS SG list */
|
|
||||||
sseg->ss_base.us_sg_entry[0].length =
|
sseg->ss_base.us_sg_entry[0].length =
|
||||||
sizeof(ompi_btl_usnic_btl_header_t) + frag->sf_size;
|
sizeof(ompi_btl_usnic_btl_header_t) + frag->sf_size;
|
||||||
|
|
||||||
@ -153,30 +154,26 @@ ompi_btl_usnic_send_slower(
|
|||||||
sseg->ss_base.us_btl_header->tag = tag;
|
sseg->ss_base.us_btl_header->tag = tag;
|
||||||
} else {
|
} else {
|
||||||
ompi_btl_usnic_large_send_frag_t *lfrag;
|
ompi_btl_usnic_large_send_frag_t *lfrag;
|
||||||
mca_btl_base_descriptor_t *desc;
|
|
||||||
unsigned i;
|
|
||||||
|
|
||||||
|
/* Save info about the frag so that future invocations of
|
||||||
|
* usnic_handle_large_send can generate segments to put on the wire. */
|
||||||
lfrag = (ompi_btl_usnic_large_send_frag_t *)frag;
|
lfrag = (ompi_btl_usnic_large_send_frag_t *)frag;
|
||||||
desc = &frag->sf_base.uf_base;
|
|
||||||
|
|
||||||
/* Save info about the frag */
|
|
||||||
lfrag->lsf_tag = tag;
|
lfrag->lsf_tag = tag;
|
||||||
lfrag->lsf_cur_offset = 0;
|
lfrag->lsf_cur_offset = 0;
|
||||||
lfrag->lsf_cur_ptr = desc->des_src[0].seg_addr.pval;
|
lfrag->lsf_cur_ptr = lfrag->lsf_des_src[0].seg_addr.pval;
|
||||||
lfrag->lsf_cur_sge = 0;
|
lfrag->lsf_cur_sge = 0;
|
||||||
lfrag->lsf_bytes_left_in_sge = desc->des_src[0].seg_len;
|
lfrag->lsf_bytes_left_in_sge = lfrag->lsf_des_src[0].seg_len;
|
||||||
lfrag->lsf_bytes_left = desc->des_src[0].seg_len;
|
lfrag->lsf_bytes_left = frag->sf_size;
|
||||||
for (i=1; i<desc->des_src_cnt; ++i) {
|
|
||||||
lfrag->lsf_bytes_left += desc->des_src[i].seg_len;
|
if (lfrag->lsf_pack_on_the_fly) {
|
||||||
|
lfrag->lsf_pack_bytes_left = frag->sf_size;
|
||||||
|
} else {
|
||||||
|
/* we pre-packed the convertor into a chain in prepare_src */
|
||||||
|
lfrag->lsf_pack_bytes_left = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* queue this fragment into the send engine */
|
/* queue this fragment into the send engine */
|
||||||
rc = ompi_btl_usnic_endpoint_enqueue_frag(endpoint, frag);
|
rc = ompi_btl_usnic_endpoint_enqueue_frag(endpoint, frag);
|
||||||
frag->sf_base.uf_base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
|
||||||
|
|
||||||
/* Stats */
|
|
||||||
++(((ompi_btl_usnic_module_t*)module)->stats.pml_module_sends);
|
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
@ -274,9 +274,12 @@ void ompi_btl_usnic_frag_send_complete(ompi_btl_usnic_module_t *module,
|
|||||||
void ompi_btl_usnic_chunk_send_complete(ompi_btl_usnic_module_t *module,
|
void ompi_btl_usnic_chunk_send_complete(ompi_btl_usnic_module_t *module,
|
||||||
ompi_btl_usnic_send_segment_t *sseg);
|
ompi_btl_usnic_send_segment_t *sseg);
|
||||||
|
|
||||||
int ompi_btl_usnic_send_slower( ompi_btl_usnic_module_t *module,
|
int
|
||||||
|
ompi_btl_usnic_finish_put_or_send(
|
||||||
|
ompi_btl_usnic_module_t *module,
|
||||||
ompi_btl_usnic_endpoint_t *endpoint,
|
ompi_btl_usnic_endpoint_t *endpoint,
|
||||||
ompi_btl_usnic_send_frag_t *frag,
|
ompi_btl_usnic_send_frag_t *frag,
|
||||||
mca_btl_base_tag_t tag);
|
mca_btl_base_tag_t tag)
|
||||||
|
__opal_attribute_noinline__;
|
||||||
|
|
||||||
#endif /* BTL_USNIC_SEND_H */
|
#endif /* BTL_USNIC_SEND_H */
|
||||||
|
@ -222,3 +222,32 @@ void ompi_btl_usnic_util_abort(const char *msg, const char *file, int line,
|
|||||||
ompi_rte_abort(ret, NULL);
|
ompi_rte_abort(ret, NULL);
|
||||||
/* Never returns */
|
/* Never returns */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Return the largest data size that can be packed into max_len using the
|
||||||
|
* given convertor. For example, a 1000 byte max_len buffer may only be able
|
||||||
|
* to hold 998 bytes if an indivisible convertor element straddles the 1000
|
||||||
|
* byte boundary.
|
||||||
|
*
|
||||||
|
* This routine internally clones the convertor and does not mutate it!
|
||||||
|
*/
|
||||||
|
size_t ompi_btl_usnic_convertor_pack_peek(
|
||||||
|
const opal_convertor_t *conv,
|
||||||
|
size_t max_len)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
size_t packable_len, position;
|
||||||
|
opal_convertor_t temp;
|
||||||
|
|
||||||
|
OBJ_CONSTRUCT(&temp, opal_convertor_t);
|
||||||
|
position = conv->bConverted + max_len;
|
||||||
|
rc = opal_convertor_clone_with_position(conv, &temp, 1, &position);
|
||||||
|
if (OPAL_UNLIKELY(rc < 0)) {
|
||||||
|
BTL_ERROR(("unexpected convertor error"));
|
||||||
|
abort(); /* XXX */
|
||||||
|
}
|
||||||
|
assert(position >= conv->bConverted);
|
||||||
|
packable_len = position - conv->bConverted;
|
||||||
|
OBJ_DESTRUCT(&temp);
|
||||||
|
return packable_len;
|
||||||
|
}
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
#ifndef BTL_USNIC_UTIL_H
|
#ifndef BTL_USNIC_UTIL_H
|
||||||
#define BTL_USNIC_UTIL_H
|
#define BTL_USNIC_UTIL_H
|
||||||
|
|
||||||
|
#include "opal/datatype/opal_convertor.h"
|
||||||
|
|
||||||
#include "btl_usnic.h"
|
#include "btl_usnic.h"
|
||||||
#include "btl_usnic_module.h"
|
#include "btl_usnic_module.h"
|
||||||
|
|
||||||
@ -42,6 +44,32 @@ static __always_inline int fls(int x)
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* a helper function that just declutters convertor packing */
|
||||||
|
static inline
|
||||||
|
int
|
||||||
|
usnic_convertor_pack_simple(
|
||||||
|
opal_convertor_t *convertor,
|
||||||
|
void *dest,
|
||||||
|
size_t max_bytes_to_pack,
|
||||||
|
size_t *bytes_packed)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
struct iovec iov;
|
||||||
|
uint32_t iov_count;
|
||||||
|
|
||||||
|
iov.iov_base = (IOVBASE_TYPE*)dest;
|
||||||
|
iov.iov_len = max_bytes_to_pack;
|
||||||
|
iov_count = 1;
|
||||||
|
*bytes_packed = max_bytes_to_pack;
|
||||||
|
rc = opal_convertor_pack(convertor, &iov, &iov_count, bytes_packed);
|
||||||
|
if (OPAL_UNLIKELY(rc < 0)) {
|
||||||
|
BTL_ERROR(("opal_convertor_pack error"));
|
||||||
|
abort(); /* XXX */
|
||||||
|
}
|
||||||
|
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Safely (but abnormally) exit this process without abort()'ing (and
|
* Safely (but abnormally) exit this process without abort()'ing (and
|
||||||
* leaving a corefile).
|
* leaving a corefile).
|
||||||
@ -65,4 +93,7 @@ uint32_t ompi_btl_usnic_get_ipv4_subnet(uint32_t addrn, uint32_t cidr_len);
|
|||||||
void ompi_btl_usnic_util_abort(const char *msg, const char *file, int line,
|
void ompi_btl_usnic_util_abort(const char *msg, const char *file, int line,
|
||||||
int ret);
|
int ret);
|
||||||
|
|
||||||
|
size_t ompi_btl_usnic_convertor_pack_peek(const opal_convertor_t *conv,
|
||||||
|
size_t max_len);
|
||||||
|
|
||||||
#endif /* BTL_USNIC_UTIL_H */
|
#endif /* BTL_USNIC_UTIL_H */
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user