Bugfix for hangs in certain communication patterns, particularly alltoall.
This commit was SVN r16600.
This commit is contained in:
parent
04578ffdd6
commit
8273b61471
@@ -652,8 +652,8 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
|
|||||||
length,
|
length,
|
||||||
OBJ_CLASS(mca_btl_ud_send_frag_t),
|
OBJ_CLASS(mca_btl_ud_send_frag_t),
|
||||||
mca_btl_ofud_component.sd_num >> 1,
|
mca_btl_ofud_component.sd_num >> 1,
|
||||||
|
-1,
|
||||||
mca_btl_ofud_component.sd_num << 2,
|
mca_btl_ofud_component.sd_num << 2,
|
||||||
mca_btl_ofud_component.sd_num >> 3,
|
|
||||||
ud_btl->super.btl_mpool);
|
ud_btl->super.btl_mpool);
|
||||||
|
|
||||||
/* Initialize pool of user fragments */
|
/* Initialize pool of user fragments */
|
||||||
@@ -664,8 +664,8 @@ int mca_btl_ud_module_init(mca_btl_ud_module_t *ud_btl)
|
|||||||
length,
|
length,
|
||||||
OBJ_CLASS(mca_btl_ud_user_frag_t),
|
OBJ_CLASS(mca_btl_ud_user_frag_t),
|
||||||
mca_btl_ofud_component.sd_num >> 1,
|
mca_btl_ofud_component.sd_num >> 1,
|
||||||
|
-1,
|
||||||
mca_btl_ofud_component.sd_num << 2,
|
mca_btl_ofud_component.sd_num << 2,
|
||||||
mca_btl_ofud_component.sd_num >> 3,
|
|
||||||
ud_btl->super.btl_mpool);
|
ud_btl->super.btl_mpool);
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
|
@@ -467,19 +467,6 @@ int mca_btl_ud_component_progress(void)
|
|||||||
frag->endpoint, &frag->base, OMPI_SUCCESS);
|
frag->endpoint, &frag->base, OMPI_SUCCESS);
|
||||||
|
|
||||||
/* Increment send counter, post if any sends are queued */
|
/* Increment send counter, post if any sends are queued */
|
||||||
OPAL_THREAD_ADD32(&endpoint->sd_wqe, 1);
|
|
||||||
if(OPAL_UNLIKELY(
|
|
||||||
!opal_list_is_empty(&endpoint->pending_frags))) {
|
|
||||||
OPAL_THREAD_LOCK(&endpoint->pending_frags_lock);
|
|
||||||
frag = (mca_btl_ud_frag_t*)
|
|
||||||
opal_list_remove_first(&endpoint->pending_frags);
|
|
||||||
OPAL_THREAD_UNLOCK(&endpoint->pending_frags_lock);
|
|
||||||
|
|
||||||
if(OPAL_LIKELY(NULL != frag)) {
|
|
||||||
mca_btl_ud_endpoint_post_send(ud_btl, frag);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
OPAL_THREAD_ADD32(&ud_btl->sd_wqe, 1);
|
OPAL_THREAD_ADD32(&ud_btl->sd_wqe, 1);
|
||||||
if(OPAL_UNLIKELY(
|
if(OPAL_UNLIKELY(
|
||||||
!opal_list_is_empty(&ud_btl->pending_frags))) {
|
!opal_list_is_empty(&ud_btl->pending_frags))) {
|
||||||
|
@@ -74,8 +74,6 @@ int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
|
|||||||
frag->sg_entry.length = frag->segment.seg_len + sizeof(mca_btl_ud_header_t);
|
frag->sg_entry.length = frag->segment.seg_len + sizeof(mca_btl_ud_header_t);
|
||||||
wr->send_flags = IBV_SEND_SIGNALED;
|
wr->send_flags = IBV_SEND_SIGNALED;
|
||||||
|
|
||||||
CHECK_FRAG_QUEUES(endpoint->sd_wqe,
|
|
||||||
endpoint->pending_frags_lock, endpoint->pending_frags, frag);
|
|
||||||
CHECK_FRAG_QUEUES(ud_btl->sd_wqe,
|
CHECK_FRAG_QUEUES(ud_btl->sd_wqe,
|
||||||
ud_btl->ud_lock, ud_btl->pending_frags, frag);
|
ud_btl->ud_lock, ud_btl->pending_frags, frag);
|
||||||
|
|
||||||
@@ -98,9 +96,11 @@ int mca_btl_ud_endpoint_post_send(mca_btl_ud_module_t* ud_btl,
|
|||||||
|
|
||||||
MCA_BTL_UD_START_TIME(ibv_post_send);
|
MCA_BTL_UD_START_TIME(ibv_post_send);
|
||||||
if(OPAL_UNLIKELY((ret = ibv_post_send(ib_qp, wr, &bad_wr)))) {
|
if(OPAL_UNLIKELY((ret = ibv_post_send(ib_qp, wr, &bad_wr)))) {
|
||||||
|
#if 0
|
||||||
opal_output(0, "ep->sd_wqe %d btl->sd_wqe %d len %d ib_qp_next %d",
|
opal_output(0, "ep->sd_wqe %d btl->sd_wqe %d len %d ib_qp_next %d",
|
||||||
endpoint->sd_wqe, ud_btl->sd_wqe,
|
endpoint->sd_wqe, ud_btl->sd_wqe,
|
||||||
frag->sg_entry.length, ud_btl->ib_qp_next);
|
frag->sg_entry.length, ud_btl->ib_qp_next);
|
||||||
|
#endif
|
||||||
BTL_ERROR(("error posting send request: %d %s\n", ret, strerror(ret)));
|
BTL_ERROR(("error posting send request: %d %s\n", ret, strerror(ret)));
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -123,17 +123,9 @@ static void mca_btl_ud_endpoint_construct(mca_btl_base_endpoint_t* endpoint)
|
|||||||
#if OMPI_ENABLE_DEBUG
|
#if OMPI_ENABLE_DEBUG
|
||||||
memset(&endpoint->rem_addr, 0, sizeof(struct mca_btl_ud_addr_t));
|
memset(&endpoint->rem_addr, 0, sizeof(struct mca_btl_ud_addr_t));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&endpoint->pending_frags, opal_list_t);
|
|
||||||
OBJ_CONSTRUCT(&endpoint->pending_frags_lock, opal_mutex_t);
|
|
||||||
|
|
||||||
endpoint->sd_wqe = mca_btl_ofud_component.sd_num_peer;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
static void mca_btl_ud_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
||||||
{
|
{
|
||||||
/* TODO - what about any pending frags? */
|
|
||||||
OBJ_DESTRUCT(&endpoint->pending_frags);
|
|
||||||
OBJ_DESTRUCT(&endpoint->pending_frags_lock);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -59,14 +59,6 @@ struct mca_btl_base_endpoint_t {
|
|||||||
struct ibv_ah* rmt_ah;
|
struct ibv_ah* rmt_ah;
|
||||||
/**< Remote address handle */
|
/**< Remote address handle */
|
||||||
/* No lock needed, verbs are thread-safe */
|
/* No lock needed, verbs are thread-safe */
|
||||||
|
|
||||||
opal_list_t pending_frags;
|
|
||||||
opal_mutex_t pending_frags_lock;
|
|
||||||
/**< list of pending frags and lock */
|
|
||||||
|
|
||||||
int32_t sd_wqe;
|
|
||||||
/**< number of available send wqe entries */
|
|
||||||
/* No lock needed, OPAL_THREAD_ADD32 is used */
|
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
|
||||||
|
Loading…
x
Reference in new issue
Block a user