Bring back the SM pending queue, to avoid deadlocks.
This commit fixes Trac ticket 1378 (https://svn.open-mpi.org/trac/ompi/ticket/1378). It was originally committed as SVN r19309.
Этот коммит содержится в:
родитель
cb927614c7
Коммит
10612bef8a
@ -622,7 +622,6 @@ extern mca_btl_base_descriptor_t* mca_btl_sm_alloc(
|
|||||||
frag->segment.seg_len = size;
|
frag->segment.seg_len = size;
|
||||||
frag->base.des_flags = flags;
|
frag->base.des_flags = flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (mca_btl_base_descriptor_t*)frag;
|
return (mca_btl_base_descriptor_t*)frag;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -812,7 +811,11 @@ int mca_btl_sm_send(
|
|||||||
*/
|
*/
|
||||||
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
|
MCA_BTL_SM_FIFO_WRITE(endpoint, endpoint->my_smp_rank,
|
||||||
endpoint->peer_smp_rank, frag->hdr, false, rc);
|
endpoint->peer_smp_rank, frag->hdr, false, rc);
|
||||||
return (rc < 0 ? rc : 1);
|
if( OPAL_LIKELY(0 == rc) ) {
|
||||||
|
return 1; /* the data is completely gone */
|
||||||
|
}
|
||||||
|
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
|
||||||
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
int mca_btl_sm_ft_event(int state) {
|
int mca_btl_sm_ft_event(int state) {
|
||||||
|
@ -343,6 +343,22 @@ btl_sm_add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool resend)
|
|||||||
opal_list_append(&ep->pending_sends, (opal_list_item_t*)si);
|
opal_list_append(&ep->pending_sends, (opal_list_item_t*)si);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int process_pending_send(struct mca_btl_base_endpoint_t *ep)
|
||||||
|
{
|
||||||
|
btl_sm_pending_send_item_t *si;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
si = (btl_sm_pending_send_item_t*)opal_list_remove_first(&ep->pending_sends);
|
||||||
|
if(NULL == si) return OMPI_ERROR;
|
||||||
|
|
||||||
|
OPAL_FREE_LIST_RETURN(&mca_btl_sm_component.pending_send_fl, (opal_list_item_t*)si);
|
||||||
|
|
||||||
|
MCA_BTL_SM_FIFO_WRITE(ep, ep->my_smp_rank, ep->peer_smp_rank, si->data,
|
||||||
|
true, rc);
|
||||||
|
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
int mca_btl_sm_component_progress(void)
|
int mca_btl_sm_component_progress(void)
|
||||||
{
|
{
|
||||||
/* local variables */
|
/* local variables */
|
||||||
@ -351,9 +367,7 @@ int mca_btl_sm_component_progress(void)
|
|||||||
ompi_fifo_t *fifo = NULL;
|
ompi_fifo_t *fifo = NULL;
|
||||||
mca_btl_sm_hdr_t *hdr;
|
mca_btl_sm_hdr_t *hdr;
|
||||||
int my_smp_rank = mca_btl_sm_component.my_smp_rank;
|
int my_smp_rank = mca_btl_sm_component.my_smp_rank;
|
||||||
int peer_smp_rank;
|
int peer_smp_rank, rc = 0;
|
||||||
int rc = 0;
|
|
||||||
bool useless;
|
|
||||||
|
|
||||||
/* poll each fifo */
|
/* poll each fifo */
|
||||||
for(peer_smp_rank = 0; peer_smp_rank < mca_btl_sm_component.num_smp_procs;
|
for(peer_smp_rank = 0; peer_smp_rank < mca_btl_sm_component.num_smp_procs;
|
||||||
@ -373,9 +387,7 @@ int mca_btl_sm_component_progress(void)
|
|||||||
opal_atomic_lock(fifo->tail_lock);
|
opal_atomic_lock(fifo->tail_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
hdr = (mca_btl_sm_hdr_t*)ompi_cb_fifo_read_from_tail(&fifo->tail->cb_fifo,
|
hdr = (mca_btl_sm_hdr_t *)ompi_fifo_read_from_tail(fifo);
|
||||||
fifo->tail->cb_overflow,
|
|
||||||
&useless );
|
|
||||||
|
|
||||||
/* release thread lock */
|
/* release thread lock */
|
||||||
if(opal_using_threads()) {
|
if(opal_using_threads()) {
|
||||||
@ -408,17 +420,20 @@ int mca_btl_sm_component_progress(void)
|
|||||||
MCA_BTL_SM_FIFO_WRITE(
|
MCA_BTL_SM_FIFO_WRITE(
|
||||||
mca_btl_sm_component.sm_peers[peer_smp_rank],
|
mca_btl_sm_component.sm_peers[peer_smp_rank],
|
||||||
my_smp_rank, peer_smp_rank, hdr->frag, false, rc);
|
my_smp_rank, peer_smp_rank, hdr->frag, false, rc);
|
||||||
|
goto recheck_peer;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case MCA_BTL_SM_FRAG_ACK:
|
case MCA_BTL_SM_FRAG_ACK:
|
||||||
{
|
{
|
||||||
int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK;
|
int status = (uintptr_t)hdr & MCA_BTL_SM_FRAG_STATUS_MASK;
|
||||||
int btl_ownership;
|
int btl_ownership;
|
||||||
|
struct mca_btl_base_endpoint_t* endpoint;
|
||||||
|
|
||||||
frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr &
|
frag = (mca_btl_sm_frag_t *)((char*)((uintptr_t)hdr &
|
||||||
(~(MCA_BTL_SM_FRAG_TYPE_MASK |
|
(~(MCA_BTL_SM_FRAG_TYPE_MASK |
|
||||||
MCA_BTL_SM_FRAG_STATUS_MASK))));
|
MCA_BTL_SM_FRAG_STATUS_MASK))));
|
||||||
|
|
||||||
|
endpoint = frag->endpoint;
|
||||||
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
|
||||||
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
|
if( MCA_BTL_DES_SEND_ALWAYS_CALLBACK & frag->base.des_flags ) {
|
||||||
/* completion callback */
|
/* completion callback */
|
||||||
@ -428,6 +443,10 @@ int mca_btl_sm_component_progress(void)
|
|||||||
if( btl_ownership ) {
|
if( btl_ownership ) {
|
||||||
MCA_BTL_SM_FRAG_RETURN(frag);
|
MCA_BTL_SM_FRAG_RETURN(frag);
|
||||||
}
|
}
|
||||||
|
if(opal_list_get_size(&endpoint->pending_sends)) {
|
||||||
|
if( OMPI_ERR_RESOURCE_BUSY == process_pending_send(endpoint) )
|
||||||
|
break;
|
||||||
|
}
|
||||||
goto recheck_peer;
|
goto recheck_peer;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
@ -4,25 +4,25 @@
|
|||||||
#include "btl_sm.h"
|
#include "btl_sm.h"
|
||||||
#include "btl_sm_endpoint.h"
|
#include "btl_sm_endpoint.h"
|
||||||
|
|
||||||
#define MCA_BTL_SM_FIFO_WRITE(endpoint_peer, my_smp_rank, \
|
#define MCA_BTL_SM_FIFO_WRITE(endpoint_peer, my_smp_rank, \
|
||||||
peer_smp_rank, hdr, resend, rc) \
|
peer_smp_rank, hdr, resend, rc) \
|
||||||
do { \
|
do { \
|
||||||
ompi_fifo_t* fifo; \
|
ompi_fifo_t* fifo; \
|
||||||
fifo=&(mca_btl_sm_component.fifo[peer_smp_rank][my_smp_rank]); \
|
fifo=&(mca_btl_sm_component.fifo[peer_smp_rank][my_smp_rank]); \
|
||||||
\
|
\
|
||||||
/* thread lock */ \
|
/* thread lock */ \
|
||||||
if(opal_using_threads()) \
|
if(opal_using_threads()) \
|
||||||
opal_atomic_lock(fifo->head_lock); \
|
opal_atomic_lock(fifo->head_lock); \
|
||||||
/* post fragment */ \
|
/* post fragment */ \
|
||||||
if(OMPI_CB_ERROR == \
|
if(ompi_fifo_write_to_head(hdr, fifo) != OMPI_SUCCESS) { \
|
||||||
ompi_cb_fifo_write_to_head(hdr, &fifo->head->cb_fifo)) { \
|
btl_sm_add_pending(endpoint_peer, hdr, resend); \
|
||||||
rc = OMPI_ERR_RESOURCE_BUSY; \
|
rc = OMPI_ERR_RESOURCE_BUSY; \
|
||||||
} else { \
|
} else { \
|
||||||
MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \
|
MCA_BTL_SM_SIGNAL_PEER(endpoint_peer); \
|
||||||
rc = OMPI_SUCCESS; \
|
rc = OMPI_SUCCESS; \
|
||||||
} \
|
} \
|
||||||
if(opal_using_threads()) \
|
if(opal_using_threads()) \
|
||||||
opal_atomic_unlock(fifo->head_lock); \
|
opal_atomic_unlock(fifo->head_lock); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user