1
1

Bring back the SM pending queue, to avoid deadlocks.

This commit fixes trac:1378.

This commit was SVN r19309.

The following Trac tickets were found above:
  Ticket 1378 --> https://svn.open-mpi.org/trac/ompi/ticket/1378
Этот коммит содержится в:
George Bosilca 2008-08-17 19:00:50 +00:00
родитель cb927614c7
Коммит 10612bef8a
3 изменённых файлов: 49 добавлений и 27 удалений

Просмотреть файл

@ -622,7 +622,6 @@ extern mca_btl_base_descriptor_t* mca_btl_sm_alloc(
frag->segment.seg_len = size; frag->segment.seg_len = size;
frag->base.des_flags = flags; frag->base.des_flags = flags;
} }
return (mca_btl_base_descriptor_t*)frag; return (mca_btl_base_descriptor_t*)frag;
} }
@ -812,7 +811,11 @@ int mca_btl_sm_send(
*/ */
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank, MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
endpoint->peer_smp_rank, frag->hdr, false, rc); endpoint->peer_smp_rank, frag->hdr, false, rc);
return (rc < 0 ? rc : 1); if( OPAL_LIKELY(0 == rc) ) {
return 1; /* the data is completely gone */
}
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
return rc;
} }
int mca_btl_sm_ft_event(int state) { int mca_btl_sm_ft_event(int state) {

Просмотреть файл

@ -343,6 +343,22 @@ btl_sm_add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool resend)
opal_list_append(&ep->pending_sends, (opal_list_item_t*)si); opal_list_append(&ep->pending_sends, (opal_list_item_t*)si);
} }
/*
 * Pop one queued send off the endpoint's pending list and retry
 * pushing it onto the peer's FIFO.
 *
 * @param ep  shared-memory endpoint whose pending_sends list is drained
 * @return OMPI_ERROR if the pending list is empty; otherwise the rc set
 *         by MCA_BTL_SM_FIFO_WRITE (OMPI_ERR_RESOURCE_BUSY if the FIFO
 *         is still full, in which case the macro re-queues the data via
 *         btl_sm_add_pending).
 */
static int process_pending_send(struct mca_btl_base_endpoint_t *ep)
{
    btl_sm_pending_send_item_t *si;
    void *data;
    int rc;

    si = (btl_sm_pending_send_item_t*)opal_list_remove_first(&ep->pending_sends);
    if(NULL == si) return OMPI_ERROR;

    /* Capture the payload pointer BEFORE recycling the list item:
     * reading si->data after OPAL_FREE_LIST_RETURN is a use-after-free,
     * since the free list may hand the item to another thread/caller. */
    data = si->data;
    OPAL_FREE_LIST_RETURN(&mca_btl_sm_component.pending_send_fl,
                          (opal_list_item_t*)si);

    MCA_BTL_SM_FIFO_WRITE(ep, ep->my_smp_rank, ep->peer_smp_rank, data,
                          true, rc);
    return rc;
}
int mca_btl_sm_component_progress(void) int mca_btl_sm_component_progress(void)
{ {
/* local variables */ /* local variables */
@ -351,9 +367,7 @@ int mca_btl_sm_component_progress(void)
ompi_fifo_t *fifo = NULL; ompi_fifo_t *fifo = NULL;
mca_btl_sm_hdr_t *hdr; mca_btl_sm_hdr_t *hdr;
int my_smp_rank = mca_btl_sm_component.my_smp_rank; int my_smp_rank = mca_btl_sm_component.my_smp_rank;
int peer_smp_rank; int peer_smp_rank, rc = 0;
int rc = 0;
bool useless;
/* poll each fifo */ /* poll each fifo */
for(peer_smp_rank = 0; peer_smp_rank < mca_btl_sm_component.num_smp_procs; for(peer_smp_rank = 0; peer_smp_rank < mca_btl_sm_component.num_smp_procs;
@ -373,9 +387,7 @@ int mca_btl_sm_component_progress(void)
opal_atomic_lock(fifo->tail_lock); opal_atomic_lock(fifo->tail_lock);
} }
hdr = (mca_btl_sm_hdr_t*)ompi_cb_fifo_read_from_tail(&fifo->tail->cb_fifo, hdr = (mca_btl_sm_hdr_t *)ompi_fifo_read_from_tail(fifo);
fifo->tail->cb_overflow,
&useless );
/* release thread lock */ /* release thread lock */
if(opal_using_threads()) { if(opal_using_threads()) {
@ -408,17 +420,20 @@ int mca_btl_sm_component_progress(void)
MCA_BTL_SM_FIFO_WRITE( MCA_BTL_SM_FIFO_WRITE(
mca_btl_sm_component.sm_peers[peer_smp_rank], mca_btl_sm_component.sm_peers[peer_smp_rank],
my_smp_rank, peer_smp_rank, hdr->frag, false, rc); my_smp_rank, peer_smp_rank, hdr->frag, false, rc);
goto recheck_peer;
break; break;
} }
case MCA_BTL_SM_FRAG_ACK: case MCA_BTL_SM_FRAG_ACK:
{ {
int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK; int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK;
int btl_ownership; int btl_ownership;
struct mca_btl_base_endpoint_t* endpoint;
frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr & frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr &
(~(MCA_BTL_SM_FRAG_TYPE_MASK | (~(MCA_BTL_SM_FRAG_TYPE_MASK |
MCA_BTL_SM_FRAG_STATUS_MASK)))); MCA_BTL_SM_FRAG_STATUS_MASK))));
endpoint = frag->endpoint;
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) { if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
/* completion callback */ /* completion callback */
@ -428,6 +443,10 @@ int mca_btl_sm_component_progress(void)
if( btl_ownership ) { if( btl_ownership ) {
MCA_BTL_SM_FRAG_RETURN(frag); MCA_BTL_SM_FRAG_RETURN(frag);
} }
if(opal_list_get_size(&endpoint->pending_sends)) {
if( OMPI_ERR_RESOURCE_BUSY == process_pending_send(endpoint) )
break;
}
goto recheck_peer; goto recheck_peer;
} }
default: default:

Просмотреть файл

@ -14,8 +14,8 @@ do { \
if(opal_using_threads()) \ if(opal_using_threads()) \
opal_atomic_lock(fifo->head_lock); \ opal_atomic_lock(fifo->head_lock); \
/* post fragment */ \ /* post fragment */ \
if(OMPI_CB_ERROR == \ if(ompi_fifo_write_to_head(hdr, fifo) != OMPI_SUCCESS) { \
ompi_cb_fifo_write_to_head(hdr, &fifo->head->cb_fifo)) { \ btl_sm_add_pending(endpoint_peer, hdr, resend); \
rc = OMPI_ERR_RESOURCE_BUSY; \ rc = OMPI_ERR_RESOURCE_BUSY; \
} else { \ } else { \
MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \ MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \