Bugfix for hangs in certain communication patterns, particularly alltoall.
This commit was SVN r16600.
This commit is contained in:
parent
04578ffdd6
commit
8273b61471
@@ -652,8 +652,8 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
|
||||
length,
|
||||
OBJ_CLASS(mca_btl_ud_send_frag_t),
|
||||
mca_btl_ofud_component.sd_num >> 1,
|
||||
-1,
|
||||
mca_btl_ofud_component.sd_num << 2,
|
||||
mca_btl_ofud_component.sd_num >> 3,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
/* Initialize pool of user fragments */
|
||||
@@ -664,8 +664,8 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
|
||||
length,
|
||||
OBJ_CLASS(mca_btl_ud_user_frag_t),
|
||||
mca_btl_ofud_component.sd_num >> 1,
|
||||
-1,
|
||||
mca_btl_ofud_component.sd_num << 2,
|
||||
mca_btl_ofud_component.sd_num >> 3,
|
||||
ud_btl->super.btl_mpool);
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
|
@@ -467,19 +467,6 @@ int mca_btl_ud_component_progress(void)
|
||||
frag->endpoint, &frag->base, OMPI_SUCCESS);
|
||||
|
||||
/* Increment send counter, post if any sends are queued */
|
||||
OPAL_THREAD_ADD32(&endpoint->sd_wqe, 1);
|
||||
if(OPAL_UNLIKELY(
|
||||
!opal_list_is_empty(&endpoint->pending_frags))) {
|
||||
OPAL_THREAD_LOCK(&endpoint->pending_frags_lock);
|
||||
frag = (mca_btl_ud_frag_t*)
|
||||
opal_list_remove_first(&endpoint->pending_frags);
|
||||
OPAL_THREAD_UNLOCK(&endpoint->pending_frags_lock);
|
||||
|
||||
if(OPAL_LIKELY(NULL != frag)) {
|
||||
mca_btl_ud_endpoint_post_send(ud_btl, frag);
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_THREAD_ADD32(&ud_btl->sd_wqe, 1);
|
||||
if(OPAL_UNLIKELY(
|
||||
!opal_list_is_empty(&ud_btl->pending_frags))) {
|
||||
|
@@ -74,8 +74,6 @@ int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
|
||||
frag->sg_entry.length = frag->segment.seg_len + sizeof(mca_btl_ud_header_t);
|
||||
wr->send_flags = IBV_SEND_SIGNALED;
|
||||
|
||||
CHECK_FRAG_QUEUES(endpoint->sd_wqe,
|
||||
endpoint->pending_frags_lock, endpoint->pending_frags, frag);
|
||||
CHECK_FRAG_QUEUES(ud_btl->sd_wqe,
|
||||
ud_btl->ud_lock, ud_btl->pending_frags, frag);
|
||||
|
||||
@@ -98,9 +96,11 @@ int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
|
||||
|
||||
MCA_BTL_UD_START_TIME(ibv_post_send);
|
||||
if(OPAL_UNLIKELY((ret = ibv_post_send(ib_qp, wr, &bad_wr)))) {
|
||||
#if 0
|
||||
opal_output(0, "ep->sd_wqe %d btl->sd_wqe %d len %d ib_qp_next %d",
|
||||
endpoint->sd_wqe, ud_btl->sd_wqe,
|
||||
frag->sg_entry.length, ud_btl->ib_qp_next);
|
||||
#endif
|
||||
BTL_ERROR(("error posting send request: %d %s\n", ret, strerror(ret)));
|
||||
|
||||
}
|
||||
@@ -123,17 +123,9 @@ static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
|
||||
#if OMPI_ENABLE_DEBUG
|
||||
memset(&endpoint->rem_addr, 0, sizeof(struct mca_btl_ud_addr_t));
|
||||
#endif
|
||||
|
||||
OBJ_CONSTRUCT(&endpoint->pending_frags, opal_list_t);
|
||||
OBJ_CONSTRUCT(&endpoint->pending_frags_lock, opal_mutex_t);
|
||||
|
||||
endpoint->sd_wqe = mca_btl_ofud_component.sd_num_peer;
|
||||
}
|
||||
|
||||
static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
||||
{
|
||||
/* TODO - what about any pending frags? */
|
||||
OBJ_DESTRUCT(&endpoint->pending_frags);
|
||||
OBJ_DESTRUCT(&endpoint->pending_frags_lock);
|
||||
}
|
||||
|
||||
|
@@ -59,14 +59,6 @@ struct mca_btl_base_endpoint_t {
|
||||
struct ibv_ah* rmt_ah;
|
||||
/**< Remote address handle */
|
||||
/* No lock needed, verbs are thread-safe */
|
||||
|
||||
opal_list_t pending_frags;
|
||||
opal_mutex_t pending_frags_lock;
|
||||
/**< list of pending frags and lock */
|
||||
|
||||
int32_t sd_wqe;
|
||||
/**< number of available send wqe entries */
|
||||
/* No lock needed, OPAL_THREAD_ADD32 is used */
|
||||
};
|
||||
|
||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||
|
Loading…
Reference in new issue
Block a user