1
1

btl/ugni: improve the handling of eager get fragments when the btl runs out

of preregistered buffers

Before this change eager gets were retried on each progress loop. This commit
modifies the protocol to only retry eager gets when another eager get has
completed. This commit also cleans up some callback code that is no longer
needed.
Этот коммит содержится в:
Nathan Hjelm 2014-10-08 14:58:09 -06:00
родитель ebc368d26b
Коммит 9e0c07e4ce
7 изменённых файлов: 91 добавлений и 62 удалений

Просмотреть файл

@ -71,8 +71,13 @@ typedef struct mca_btl_ugni_module_t {
/* lock for this list */
opal_mutex_t failed_frags_lock;
/** rdma frags waiting to be reposted */
opal_list_t failed_frags;
/** lock for the eager_get_pending list */
opal_mutex_t eager_get_pending_lock;
opal_list_t eager_get_pending;
mca_mpool_base_module_t *smsg_mpool;
ompi_free_list_t smsg_mboxes;
@ -107,8 +112,8 @@ typedef struct mca_btl_ugni_module_t {
/* fragment id bounce buffer (smsg msg ids are only 32 bits) */
opal_pointer_array_t pending_smsg_frags_bb;
uint32_t reg_max;
volatile int reg_count;
int32_t reg_max;
volatile int32_t reg_count;
/* used to calculate the fraction of registered memory resources
* this rank should be limited to */

Просмотреть файл

@ -380,10 +380,10 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
if (0 == mca_btl_ugni_component.mbox_increment) {
/* limit mailbox allocations to either 12.5% of available registrations
or 2MiB per allocation */
mbox_increment = (int) (2097152.0 / (float)mca_btl_ugni_component.smsg_mbox_size);
mbox_increment = (unsigned int) (2097152.0 / (float)mca_btl_ugni_component.smsg_mbox_size);
/* we may end up using more */
if (nprocs/mbox_increment > ugni_module->reg_max / 8) {
if (nprocs/mbox_increment > (unsigned int) ugni_module->reg_max / 8) {
mbox_increment = nprocs / (ugni_module->reg_max >> 3);
}
} else {

Просмотреть файл

@ -471,24 +471,20 @@ mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
!recoverable)) {
/* give up */
BTL_ERROR(("giving up on frag %p type %d CQE error %s", (void *) frag, frag->post_desc.base.type, buffer));
if (frag->cbfunc) {
frag->cbfunc (frag, OPAL_ERROR);
}
mca_btl_ugni_frag_complete (frag, OPAL_ERROR);
return OPAL_ERROR;
}
/* repost transaction */
mca_btl_ugni_repost (frag, OPAL_SUCCESS);
mca_btl_ugni_repost (frag);
return 0;
}
BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));
if (frag->cbfunc) {
frag->cbfunc (frag, opal_common_rc_ugni_to_opal (rc));
}
mca_btl_ugni_frag_complete (frag, opal_common_rc_ugni_to_opal (rc));
return 1;
}
@ -504,11 +500,11 @@ mca_btl_ugni_retry_failed (mca_btl_ugni_module_t *ugni_module)
mca_btl_ugni_base_frag_t *frag =
(mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->failed_frags);
OPAL_THREAD_UNLOCK(&ugni_module->failed_frags_lock);
assert (NULL != frag);
if (frag->cbfunc) {
frag->cbfunc (frag, OPAL_SUCCESS);
if (NULL == frag) {
break;
}
mca_btl_ugni_repost (frag);
}
return count;

Просмотреть файл

@ -81,7 +81,6 @@ typedef struct mca_btl_ugni_base_frag_t {
mca_btl_base_endpoint_t *endpoint;
mca_btl_ugni_reg_t *registration;
ompi_free_list_t *my_list;
frag_cb_t cbfunc;
} mca_btl_ugni_base_frag_t;
typedef struct mca_btl_ugni_base_frag_t mca_btl_ugni_smsg_frag_t;

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
@ -53,28 +53,37 @@ int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
return mca_btl_ugni_post (frag, true, dst_seg, src_seg);
}
static void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc)
/* eager get */
static void mca_btl_ugni_callback_eager_get_progress_pending (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *desc, int rc)
{
BTL_VERBOSE(("rdma operation for rem_ctx %p complete", frag->hdr.rdma.ctx));
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
mca_btl_ugni_base_frag_t *pending_frag, *frag = (mca_btl_ugni_base_frag_t *) desc;
/* tell peer the get is complete */
rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
if (OPAL_UNLIKELY(0 > rc)) {
/* call this callback again later */
frag->cbfunc = mca_btl_ugni_callback_rdma_complete;
opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
OPAL_THREAD_LOCK(&ugni_module->eager_get_pending_lock);
pending_frag = (mca_btl_ugni_base_frag_t *) opal_list_remove_first (&ugni_module->eager_get_pending);
OPAL_THREAD_UNLOCK(&ugni_module->eager_get_pending_lock);
if (NULL != pending_frag) {
/* copy the relevant data out of the pending fragment */
frag->endpoint = pending_frag->endpoint;
/* start the next eager get using this fragment */
(void) mca_btl_ugni_start_eager_get (frag->endpoint, pending_frag->hdr.eager_ex, frag);
/* return the temporary fragment */
mca_btl_ugni_frag_return (pending_frag);
} else {
/* not needed anymore */
mca_btl_ugni_frag_return (frag);
}
}
/* eager get */
static void mca_btl_ugni_callback_eager_get_retry (mca_btl_ugni_base_frag_t *frag, int rc)
{
(void) mca_btl_ugni_start_eager_get(frag->endpoint, frag->hdr.eager_ex, frag);
}
static void mca_btl_ugni_callback_eager_get (mca_btl_ugni_base_frag_t *frag, int rc)
static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
struct mca_btl_base_descriptor_t *desc, int rc)
{
mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) desc;
uint32_t len = frag->hdr.eager.send.lag & 0x00ffffff;
uint8_t tag = frag->hdr.eager.send.lag >> 24;
size_t payload_len = frag->hdr.eager.src_seg.base.seg_len;
@ -105,21 +114,34 @@ static void mca_btl_ugni_callback_eager_get (mca_btl_ugni_base_frag_t *frag, int
frag->hdr.rdma.ctx = frag->hdr.eager.ctx;
/* once complete use this fragment for a pending eager get if any exist */
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get_progress_pending;
/* tell the remote peer the operation is complete */
mca_btl_ugni_callback_rdma_complete (frag, rc);
rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
if (OPAL_UNLIKELY(0 > rc)) {
/* queue fragment */
if (false == endpoint->wait_listed) {
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
endpoint->wait_listed = true;
}
OPAL_THREAD_LOCK(&endpoint->lock);
opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
OPAL_THREAD_UNLOCK(&endpoint->lock);
}
}
int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
mca_btl_ugni_eager_ex_frag_hdr_t hdr,
mca_btl_ugni_base_frag_t *frag)
{
mca_btl_ugni_module_t *ugni_module = ep->btl;
int rc;
if (OPAL_UNLIKELY(frag && frag->my_list == &ep->btl->rdma_int_frags)) {
mca_btl_ugni_frag_return (frag);
frag = NULL;
}
BTL_VERBOSE(("starting eager get for remote ctx: %p", hdr.eager.ctx));
do {
@ -136,7 +158,7 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
frag->hdr.eager_ex = hdr;
frag->flags = 0;
frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
frag->base.des_flags = 0;
frag->segments[1] = hdr.eager.src_seg;
@ -146,15 +168,19 @@ int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
frag->base.des_local = &frag->segments[1].base;
rc = mca_btl_ugni_post_wcb (frag, GNI_POST_RDMA_GET, frag->segments, frag->segments + 1,
mca_btl_ugni_callback_eager_get);
/* set up callback for get completion */
frag->base.des_flags = MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get;
rc = mca_btl_ugni_post (frag, GNI_POST_RDMA_GET, frag->segments, frag->segments + 1);
if (OPAL_UNLIKELY(OPAL_SUCCESS == rc)) {
return OPAL_SUCCESS;
}
} while (0);
frag->cbfunc = mca_btl_ugni_callback_eager_get_retry;
opal_list_append (&ep->btl->failed_frags, (opal_list_item_t *) frag);
OPAL_THREAD_LOCK(&ugni_module->eager_get_pending_lock);
opal_list_append (&ugni_module->eager_get_pending, (opal_list_item_t *) frag);
OPAL_THREAD_UNLOCK(&ugni_module->eager_get_pending_lock);
return rc;
}

Просмотреть файл

@ -74,7 +74,11 @@ mca_btl_ugni_module_init (mca_btl_ugni_module_t *ugni_module,
ugni_module->active_send_count = 0;
OBJ_CONSTRUCT(&ugni_module->failed_frags, opal_list_t);
OBJ_CONSTRUCT(&ugni_module->failed_frags_lock,opal_mutex_t);
OBJ_CONSTRUCT(&ugni_module->failed_frags_lock, opal_mutex_t);
OBJ_CONSTRUCT(&ugni_module->eager_get_pending, opal_list_t);
OBJ_CONSTRUCT(&ugni_module->eager_get_pending_lock,opal_mutex_t);
OBJ_CONSTRUCT(&ugni_module->eager_frags_send, ompi_free_list_t);
OBJ_CONSTRUCT(&ugni_module->eager_frags_recv, ompi_free_list_t);
OBJ_CONSTRUCT(&ugni_module->smsg_frags, ompi_free_list_t);
@ -184,6 +188,9 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl)
OBJ_DESTRUCT(&ugni_module->endpoints);
OBJ_DESTRUCT(&ugni_module->failed_frags);
OBJ_DESTRUCT(&ugni_module->eager_get_pending);
OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock);
if (ugni_module->initialized) {
/* need to tear down the mpools *after* the free lists */
if (NULL != ugni_module->smsg_mpool) {

Просмотреть файл

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
* $COPYRIGHT$
@ -85,27 +85,21 @@ static inline int mca_btl_ugni_post_bte (mca_btl_ugni_base_frag_t *frag, gni_pos
return OPAL_SUCCESS;
}
static inline int mca_btl_ugni_post_wcb (mca_btl_ugni_base_frag_t *frag, bool get, mca_btl_ugni_segment_t *lcl_seg,
mca_btl_ugni_segment_t *rem_seg, frag_cb_t cb) {
frag->cbfunc = cb;
if (frag->base.des_local->seg_len <= mca_btl_ugni_component.ugni_fma_limit) {
return mca_btl_ugni_post_fma (frag, get ? GNI_POST_FMA_GET : GNI_POST_FMA_PUT, lcl_seg, rem_seg);
}
return mca_btl_ugni_post_bte (frag, get ? GNI_POST_RDMA_GET : GNI_POST_RDMA_PUT, lcl_seg, rem_seg);
}
static inline int mca_btl_ugni_post (mca_btl_ugni_base_frag_t *frag, bool get, mca_btl_ugni_segment_t *lcl_seg,
mca_btl_ugni_segment_t *rem_seg) {
return mca_btl_ugni_post_wcb (frag, get, lcl_seg, rem_seg, mca_btl_ugni_frag_complete);
const gni_post_type_t fma_ops[2] = {GNI_POST_FMA_PUT, GNI_POST_FMA_GET};
const gni_post_type_t rdma_ops[2] = {GNI_POST_RDMA_PUT, GNI_POST_RDMA_GET};
if (frag->base.des_local->seg_len <= mca_btl_ugni_component.ugni_fma_limit) {
return mca_btl_ugni_post_fma (frag, fma_ops[get], lcl_seg, rem_seg);
}
return mca_btl_ugni_post_bte (frag, rdma_ops[get], lcl_seg, rem_seg);
}
static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag, int rc) {
static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag) {
gni_return_t grc;
frag->cbfunc = mca_btl_ugni_frag_complete;
OPAL_THREAD_LOCK(&frag->endpoint->common->dev->dev_lock);
if (GNI_POST_RDMA_PUT == frag->post_desc.base.type ||
GNI_POST_RDMA_GET == frag->post_desc.base.type) {
@ -116,7 +110,9 @@ static inline void mca_btl_ugni_repost (mca_btl_ugni_base_frag_t *frag, int rc)
OPAL_THREAD_UNLOCK(&frag->endpoint->common->dev->dev_lock);
if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
frag->cbfunc = mca_btl_ugni_repost;
/* NTH: Should we even retry these? When this code was written there was no indication
* whether an error in post is recoverable. Clobber this code and the associated data
* structures if post errors are not recoverable. */
OPAL_THREAD_LOCK(&frag->endpoint->btl->failed_frags_lock);
opal_list_append (&frag->endpoint->btl->failed_frags, (opal_list_item_t *) frag);
OPAL_THREAD_UNLOCK(&frag->endpoint->btl->failed_frags_lock);