btl/ugni: improve the handling of eager get fragments when the btl runs out
of preregistered buffers. Before this change, eager gets were retried on each progress loop. This commit modifies the protocol to only retry eager gets when another eager get has completed. This commit also cleans up some callback code that is no longer needed.
Этот коммит содержится в:
родитель
ebc368d26b
Коммит
9e0c07e4ce
@ -71,8 +71,13 @@ typedef struct mca_btl_ugni_module_t {
|
||||
|
||||
/* lock for this list */
|
||||
opal_mutex_t failed_frags_lock;
|
||||
/** rdma frags waiting to be reposted */
|
||||
opal_list_t failed_frags;
|
||||
|
||||
/** lock for the eager_get_pending list */
|
||||
opal_mutex_t eager_get_pending_lock;
|
||||
opal_list_t eager_get_pending;
|
||||
|
||||
mca_mpool_base_module_t *smsg_mpool;
|
||||
ompi_free_list_t smsg_mboxes;
|
||||
|
||||
@ -107,8 +112,8 @@ typedef struct mca_btl_ugni_module_t {
|
||||
/* fragment id bounce buffer (smsg msg ids are only 32 bits) */
|
||||
opal_pointer_array_t pending_smsg_frags_bb;
|
||||
|
||||
uint32_t reg_max;
|
||||
volatile int reg_count;
|
||||
int32_t reg_max;
|
||||
volatile int32_t reg_count;
|
||||
|
||||
/* used to calculate the fraction of registered memory resources
|
||||
* this rank should be limited to */
|
||||
|
@ -380,10 +380,10 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
|
||||
if (0 == mca_btl_ugni_component.mbox_increment) {
|
||||
/* limit mailbox allocations to either 12.5% of available registrations
|
||||
or 2MiB per allocation */
|
||||
mbox_increment = (int) (2097152.0 / (float)mca_btl_ugni_component.smsg_mbox_size);
|
||||
mbox_increment = (unsigned int) (2097152.0 / (float)mca_btl_ugni_component.smsg_mbox_size);
|
||||
|
||||
/* we may end up using more */
|
||||
if (nprocs/mbox_increment > ugni_module->reg_max / 8) {
|
||||
if (nprocs/mbox_increment > (unsigned int) ugni_module->reg_max / 8) {
|
||||
mbox_increment = nprocs / (ugni_module->reg_max >> 3);
|
||||
}
|
||||
} else {
|
||||
|
@ -471,24 +471,20 @@ mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
|
||||
!recoverable)) {
|
||||
/* give up */
|
||||
BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer));
|
||||
if (frag->cbfunc) {
|
||||
frag->cbfunc (frag, OPAL_ERROR);
|
||||
}
|
||||
mca_btl_ugni_frag_complete (frag, OPAL_ERROR);
|
||||
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
/* repost transaction */
|
||||
mca_btl_ugni_repost (frag, OPAL_SUCCESS);
|
||||
mca_btl_ugni_repost (frag);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));
|
||||
|
||||
if (frag->cbfunc) {
|
||||
frag->cbfunc (frag, opal_common_rc_ugni_to_opal (rc));
|
||||
}
|
||||
mca_btl_ugni_frag_complete (frag, opal_common_rc_ugni_to_opal (rc));
|
||||
|
||||
return 1;
|
||||
}
|
||||
@ -504,11 +500,11 @@ mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
|
||||
mca_btl_ugni_base_frag_t *frag =
|
||||
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
|
||||
assert (NULL != frag);
|
||||
|
||||
if (frag->cbfunc) {
|
||||
frag->cbfunc (frag, OPAL_SUCCESS);
|
||||
if (NULL == frag) {
|
||||
break;
|
||||
}
|
||||
|
||||
mca_btl_ugni_repost (frag);
|
||||
}
|
||||
|
||||
return count;
|
||||
|
@ -81,7 +81,6 @@ typedef struct mca_btl_ugni_base_frag_t {
|
||||
mca_btl_base_endpoint_t *endpoint;
|
||||
mca_btl_ugni_reg_t *registration;
|
||||
ompi_free_list_t *my_list;
|
||||
frag_cb_t cbfunc;
|
||||
} mca_btl_ugni_base_frag_t;
|
||||
|
||||
typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_smsg_frag_t;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -53,28 +53,37 @@ int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
|
||||
return mca_btl_ugni_post (frag, true, dst_seg, src_seg);
|
||||
}
|
||||
|
||||
static void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc)
|
||||
/* eager get */
|
||||
static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *desc, int rc)
|
||||
{
|
||||
BTL_VERBOSE(("rdma operation for rem_ctx %p complete", frag->hdr.rdma.ctx));
|
||||
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
|
||||
mca_btl_ugni_base_frag_t *pending_frag, *frag = (mca_btl_ugni_base_frag_t *) desc;
|
||||
|
||||
/* tell peer the get is complete */
|
||||
rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
|
||||
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
|
||||
if (OPAL_UNLIKELY(0 > rc)) {
|
||||
/* call this callback again later */
|
||||
frag->cbfunc = mca_btl_ugni_callback_rdma_complete;
|
||||
opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_LOCK(&ugni_module->eager_get_pending_lock);
|
||||
pending_frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->eager_get_pending);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->eager_get_pending_lock);
|
||||
|
||||
if (NULL != pending_frag) {
|
||||
/* copy the relevant data out of the pending fragment */
|
||||
frag->endpoint = pending_frag->endpoint;
|
||||
|
||||
/* start the next eager get using this fragment */
|
||||
(void) mca_btl_ugni_start_eager_get (frag->endpoint, pending_frag->hdr.eager_ex, frag);
|
||||
|
||||
/* return the temporary fragment */
|
||||
mca_btl_ugni_frag_return (pending_frag);
|
||||
} else {
|
||||
/* not needed anymore */
|
||||
mca_btl_ugni_frag_return (frag);
|
||||
}
|
||||
}
|
||||
|
||||
/* eager get */
|
||||
static void mca_btl_ugni_callback_eager_get_retry (mca_btl_ugni_base_frag_t *frag, int rc)
|
||||
{
|
||||
(void) mca_btl_ugni_start_eager_get(frag->endpoint, frag->hdr.eager_ex, frag);
|
||||
}
|
||||
|
||||
static void mca_btl_ugni_callback_eager_get (mca_btl_ugni_base_frag_t *frag, int rc)
|
||||
static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
|
||||
struct mca_btl_base_descriptor_t *desc, int rc)
|
||||
{
|
||||
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
|
||||
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) desc;
|
||||
uint32_t len = frag->hdr.eager.send.lag & 0x00ffffff;
|
||||
uint8_t tag = frag->hdr.eager.send.lag >> 24;
|
||||
size_t payload_len = frag->hdr.eager.src_seg.base.seg_len;
|
||||
@ -105,21 +114,34 @@ static void mca_btl_ugni_callback_eager_get (mca_btl_ugni_base_frag_t *frag, int
|
||||
|
||||
frag->hdr.rdma.ctx = frag->hdr.eager.ctx;
|
||||
|
||||
/* once complete use this fragment for a pending eager get if any exist */
|
||||
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get_progress_pending;
|
||||
|
||||
/* tell the remote peer the operation is complete */
|
||||
mca_btl_ugni_callback_rdma_complete (frag, rc);
|
||||
rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
|
||||
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
|
||||
if (OPAL_UNLIKELY(0 > rc)) {
|
||||
/* queue fragment */
|
||||
if (false == endpoint->wait_listed) {
|
||||
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
|
||||
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
|
||||
endpoint->wait_listed = true;
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&endpoint->lock);
|
||||
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->lock);
|
||||
}
|
||||
}
|
||||
|
||||
int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
|
||||
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
|
||||
mca_btl_ugni_base_frag_t *frag)
|
||||
{
|
||||
mca_btl_ugni_module_t *ugni_module = ep->btl;
|
||||
int rc;
|
||||
|
||||
if (OPAL_UNLIKELY(frag && frag->my_list == &ep->btl->rdma_int_frags)) {
|
||||
mca_btl_ugni_frag_return (frag);
|
||||
frag = NULL;
|
||||
}
|
||||
|
||||
BTL_VERBOSE(("starting eager get for remote ctx: %p", hdr.eager.ctx));
|
||||
|
||||
do {
|
||||
@ -136,7 +158,7 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
|
||||
frag->hdr.eager_ex = hdr;
|
||||
frag->flags = 0;
|
||||
|
||||
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
|
||||
frag->base.des_flags = 0;
|
||||
|
||||
frag->segments[1] = hdr.eager.src_seg;
|
||||
|
||||
@ -146,15 +168,19 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
|
||||
|
||||
frag->base.des_local = &frag->segments[1].base;
|
||||
|
||||
rc = mca_btl_ugni_post_wcb (frag, GNI_POST_RDMA_GET, frag->segments, frag->segments + 1,
|
||||
mca_btl_ugni_callback_eager_get);
|
||||
/* set up callback for get completion */
|
||||
frag->base.des_flags = MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get;
|
||||
|
||||
rc = mca_btl_ugni_post (frag, GNI_POST_RDMA_GET, frag->segments, frag->segments + 1);
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS == rc)) {
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
} while (0);
|
||||
|
||||
frag->cbfunc = mca_btl_ugni_callback_eager_get_retry;
|
||||
opal_list_append (&ep->btl->failed_frags, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_LOCK(&ugni_module->eager_get_pending_lock);
|
||||
opal_list_append (&ugni_module->eager_get_pending, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&ugni_module->eager_get_pending_lock);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
@ -74,7 +74,11 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
|
||||
ugni_module->active_send_count = 0;
|
||||
|
||||
OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->failed_frags_lock,opal_mutex_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->failed_frags_lock, opal_mutex_t);
|
||||
|
||||
OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->eager_get_pending_lock,opal_mutex_t);
|
||||
|
||||
OBJ_CONSTRUCT(&ugni_module->eager_frags_send, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->eager_frags_recv, ompi_free_list_t);
|
||||
OBJ_CONSTRUCT(&ugni_module->smsg_frags, ompi_free_list_t);
|
||||
@ -184,6 +188,9 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
|
||||
OBJ_DESTRUCT(&ugni_module->endpoints);
|
||||
OBJ_DESTRUCT(&ugni_module->failed_frags);
|
||||
|
||||
OBJ_DESTRUCT(&ugni_module->eager_get_pending);
|
||||
OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock);
|
||||
|
||||
if (ugni_module->initialized) {
|
||||
/* need to tear down the mpools *after* the free lists */
|
||||
if (NULL != ugni_module->smsg_mpool) {
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -85,27 +85,21 @@ static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_pos
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_post_wcb (mca_btl_ugni_base_frag_t *frag, bool get, mca_btl_ugni_segment_t *lcl_seg,
|
||||
mca_btl_ugni_segment_t *rem_seg, frag_cb_t cb) {
|
||||
frag->cbfunc = cb;
|
||||
|
||||
if (frag->base.des_local->seg_len <= mca_btl_ugni_component.ugni_fma_limit) {
|
||||
return mca_btl_ugni_post_fma (frag, get ? GNI_POST_FMA_GET : GNI_POST_FMA_PUT, lcl_seg, rem_seg);
|
||||
}
|
||||
|
||||
return mca_btl_ugni_post_bte (frag, get ? GNI_POST_RDMA_GET : GNI_POST_RDMA_PUT, lcl_seg, rem_seg);
|
||||
}
|
||||
|
||||
static inline int mca_btl_ugni_post (mca_btl_ugni_base_frag_t *frag, bool get, mca_btl_ugni_segment_t *lcl_seg,
|
||||
mca_btl_ugni_segment_t *rem_seg) {
|
||||
return mca_btl_ugni_post_wcb (frag, get, lcl_seg, rem_seg, mca_btl_ugni_frag_complete);
|
||||
const gni_post_type_t fma_ops[2] = {GNI_POST_FMA_PUT, GNI_POST_FMA_GET};
|
||||
const gni_post_type_t rdma_ops[2] = {GNI_POST_RDMA_PUT, GNI_POST_RDMA_GET};
|
||||
|
||||
if (frag->base.des_local->seg_len <= mca_btl_ugni_component.ugni_fma_limit) {
|
||||
return mca_btl_ugni_post_fma (frag, fma_ops[get], lcl_seg, rem_seg);
|
||||
}
|
||||
|
||||
return mca_btl_ugni_post_bte (frag, rdma_ops[get], lcl_seg, rem_seg);
|
||||
}
|
||||
|
||||
static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag, int rc) {
|
||||
static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag) {
|
||||
gni_return_t grc;
|
||||
|
||||
frag->cbfunc = mca_btl_ugni_frag_complete;
|
||||
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
if (GNI_POST_RDMA_PUT == frag->post_desc.base.type ||
|
||||
GNI_POST_RDMA_GET == frag->post_desc.base.type) {
|
||||
@ -116,7 +110,9 @@ static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag, int rc)
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
|
||||
|
||||
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
|
||||
frag->cbfunc = mca_btl_ugni_repost;
|
||||
/* NTH: Should we even retry these? When this code was written there was no indication
|
||||
* whether an error in post is recoverable. Clobber this code and the associated data
|
||||
* structures if post errors are not recoverable. */
|
||||
OPAL_THREAD_LOCK(&frag->endpoint->btl->failed_frags_lock);
|
||||
opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
|
||||
OPAL_THREAD_UNLOCK(&frag->endpoint->btl->failed_frags_lock);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user