1
1

Bring back the SM pending queue, to avoid deadlocks.

This commit fixes trac:1378.

This commit was SVN r19309.

The following Trac tickets were found above:
  Ticket 1378 --> https://svn.open-mpi.org/trac/ompi/ticket/1378
Этот коммит содержится в:
George Bosilca 2008-08-17 19:00:50 +00:00
родитель cb927614c7
Коммит 10612bef8a
3 изменённых файлов: 49 добавлений и 27 удалений

Просмотреть файл

@ -622,7 +622,6 @@ extern mca_btl_base_descriptor_t* mca_btl_sm_alloc(
frag->segment.seg_len = size;
frag->base.des_flags = flags;
}
return (mca_btl_base_descriptor_t*)frag;
}
@ -812,7 +811,11 @@ int mca_btl_sm_send(
*/
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
endpoint->peer_smp_rank, frag->hdr, false, rc);
return (rc < 0 ? rc : 1);
if( OPAL_LIKELY(0 == rc) ) {
return 1; /* the data is completely gone */
}
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
return rc;
}
int mca_btl_sm_ft_event(int state) {

Просмотреть файл

@ -343,6 +343,22 @@ btl_sm_add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool resend)
opal_list_append(&ep->pending_sends, (opal_list_item_t*)si);
}
/**
 * Retry the oldest queued send of an endpoint.
 *
 * Removes the first item of ep->pending_sends, gives the item back to the
 * component's pending_send_fl free list and tries once more to write its
 * payload to the peer FIFO, with the resend flag set to true.
 *
 * @param ep  endpoint whose pending_sends list is consumed by one item
 *
 * @return OMPI_ERROR when the pending list is empty; otherwise the value
 *         MCA_BTL_SM_FIFO_WRITE stores in rc (OMPI_SUCCESS, or
 *         OMPI_ERR_RESOURCE_BUSY when the FIFO write fails and the payload
 *         is handed to btl_sm_add_pending again).
 */
static int process_pending_send(struct mca_btl_base_endpoint_t *ep)
{
    btl_sm_pending_send_item_t *si;
    void *data;
    int rc;

    si = (btl_sm_pending_send_item_t*)opal_list_remove_first(&ep->pending_sends);
    if(NULL == si) {
        return OMPI_ERROR;
    }

    /* Save the payload before the item is released: once it is back on the
     * free list it no longer belongs to us and must not be dereferenced. */
    data = si->data;

    /* The item is returned before the write, as in the original ordering.
     * NOTE(review): presumably so that a failed write can reuse it when
     * btl_sm_add_pending re-queues the payload - confirm. */
    OPAL_FREE_LIST_RETURN(&mca_btl_sm_component.pending_send_fl, (opal_list_item_t*)si);

    MCA_BTL_SM_FIFO_WRITE(ep, ep->my_smp_rank, ep->peer_smp_rank, data,
                          true, rc);
    return rc;
}
int mca_btl_sm_component_progress(void)
{
/* local variables */
@ -351,9 +367,7 @@ int mca_btl_sm_component_progress(void)
ompi_fifo_t *fifo = NULL;
mca_btl_sm_hdr_t *hdr;
int my_smp_rank = mca_btl_sm_component.my_smp_rank;
int peer_smp_rank;
int rc = 0;
bool useless;
int peer_smp_rank, rc = 0;
/* poll each fifo */
for(peer_smp_rank = 0; peer_smp_rank < mca_btl_sm_component.num_smp_procs;
@ -373,9 +387,7 @@ int mca_btl_sm_component_progress(void)
opal_atomic_lock(fifo->tail_lock);
}
hdr = (mca_btl_sm_hdr_t*)ompi_cb_fifo_read_from_tail(&fifo->tail->cb_fifo,
fifo->tail->cb_overflow,
&useless );
hdr = (mca_btl_sm_hdr_t *)ompi_fifo_read_from_tail(fifo);
/* release thread lock */
if(opal_using_threads()) {
@ -408,17 +420,20 @@ int mca_btl_sm_component_progress(void)
MCA_BTL_SM_FIFO_WRITE(
mca_btl_sm_component.sm_peers[peer_smp_rank],
my_smp_rank, peer_smp_rank, hdr->frag, false, rc);
goto recheck_peer;
break;
}
case MCA_BTL_SM_FRAG_ACK:
{
int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK;
int btl_ownership;
struct mca_btl_base_endpoint_t* endpoint;
frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr &
(~(MCA_BTL_SM_FRAG_TYPE_MASK |
MCA_BTL_SM_FRAG_STATUS_MASK))));
endpoint = frag->endpoint;
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
/* completion callback */
@ -428,6 +443,10 @@ int mca_btl_sm_component_progress(void)
if( btl_ownership ) {
MCA_BTL_SM_FRAG_RETURN(frag);
}
if(opal_list_get_size(&endpoint->pending_sends)) {
if( OMPI_ERR_RESOURCE_BUSY == process_pending_send(endpoint) )
break;
}
goto recheck_peer;
}
default:

Просмотреть файл

@ -4,25 +4,25 @@
#include "btl_sm.h"
#include "btl_sm_endpoint.h"
#define MCA_BTL_SM_FIFO_WRITE(endpoint_peer, my_smp_rank, \
peer_smp_rank, hdr, resend, rc) \
do { \
ompi_fifo_t* fifo; \
fifo=&(mca_btl_sm_component.fifo[peer_smp_rank][my_smp_rank]); \
\
/* thread lock */ \
if(opal_using_threads()) \
opal_atomic_lock(fifo->head_lock); \
/* post fragment */ \
if(OMPI_CB_ERROR == \
ompi_cb_fifo_write_to_head(hdr, &fifo->head->cb_fifo)) { \
rc = OMPI_ERR_RESOURCE_BUSY; \
} else { \
MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \
rc = OMPI_SUCCESS; \
} \
if(opal_using_threads()) \
opal_atomic_unlock(fifo->head_lock); \
#define MCA_BTL_SM_FIFO_WRITE(endpoint_peer, my_smp_rank, \
peer_smp_rank, hdr, resend, rc) \
do { \
ompi_fifo_t* fifo; \
fifo=&(mca_btl_sm_component.fifo[peer_smp_rank][my_smp_rank]); \
/* Post hdr on the FIFO selected above; rc is assigned OMPI_SUCCESS or OMPI_ERR_RESOURCE_BUSY. */ \
/* take the FIFO head lock, only when threads are in use */ \
if(opal_using_threads()) \
opal_atomic_lock(fifo->head_lock); \
/* try to post the fragment; a failed write is queued on the endpoint instead of being dropped */ \
if(ompi_fifo_write_to_head(hdr, fifo) != OMPI_SUCCESS) { \
btl_sm_add_pending(endpoint_peer, hdr, resend); /* called while the head lock is still held */ \
rc = OMPI_ERR_RESOURCE_BUSY; \
} else { \
MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); /* signal the peer only after a successful write */ \
rc = OMPI_SUCCESS; \
} \
if(opal_using_threads()) \
opal_atomic_unlock(fifo->head_lock); \
} while(0)
#endif