1
1

Fix the deadlock when we run out of resources on the BTLs. Move the progress
function from the BML into the PML. The BTL progress functions are now directly
registered with the event library.

This commit was SVN r19561.
Этот коммит содержится в:
George Bosilca 2008-09-15 22:56:23 +00:00
родитель 68f6fdf111
Коммит 17e65369be
12 изменённых файлов: 113 добавлений и 124 удалений

Просмотреть файл

@ -38,7 +38,6 @@ mca_bml_base_module_t mca_bml = {
NULL, /* bml_register */
NULL, /* bml_register_error */
NULL, /* bml_finalize*/
NULL, /* bml_progress */
NULL /* FT event */
};
mca_bml_base_component_t mca_bml_component;

Просмотреть файл

@ -74,7 +74,6 @@ struct mca_bml_base_btl_t {
mca_btl_base_module_prepare_fn_t btl_prepare_dst;
mca_btl_base_module_put_fn_t btl_put;
mca_btl_base_module_get_fn_t btl_get;
mca_btl_base_component_progress_fn_t btl_progress;
mca_mpool_base_module_t* btl_mpool;
};
@ -414,16 +413,6 @@ typedef struct mca_bml_base_module_t* (*mca_bml_base_component_init_fn_t)(
bool enable_mpi_threads
);
/**
* MCA->BML Called to progress outstanding requests for
* non-threaded polling environments.
*
* The function takes no arguments.
*
* @return An int status/count. NOTE(review): this was documented as
* "OMPI_SUCCESS or error code on failure", but the r2 implementation
* (mca_bml_r2_progress) returns the sum of the positive values
* returned by the BTL progress functions -- confirm the intended
* contract with the callers.
*/
typedef int (*mca_bml_base_module_progress_fn_t)(void);
/**
* BML component descriptor. Contains component version information
* and component open/close/init functions.
@ -600,8 +589,6 @@ struct mca_bml_base_module_t {
mca_bml_base_module_finalize_fn_t bml_finalize;
mca_bml_base_module_progress_fn_t bml_progress;
mca_bml_base_module_ft_event_fn_t bml_ft_event;
};
typedef struct mca_bml_base_module_t mca_bml_base_module_t;

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -54,7 +54,6 @@ mca_bml_r2_module_t mca_bml_r2 = {
mca_bml_r2_register,
mca_bml_r2_register_error,
mca_bml_r2_finalize,
mca_bml_r2_progress,
mca_bml_r2_ft_event
}
@ -86,23 +85,6 @@ static int btl_exclusivity_compare(const void* arg1, const void* arg2)
}
}
/**
* Invoke every BTL progress function recorded in mca_bml_r2.
*
* Calls the first num_btl_progress entries of mca_bml_r2.btl_progress
* in array order.
*
* @return Sum of the strictly positive values returned by the progress
* functions; zero and negative results are ignored rather than
* propagated.
*/
int mca_bml_r2_progress( void )
{
int i, count = 0;
/*
* Progress each of the BTL modules
*/
/* the cast keeps the comparison signed, since the index is an int */
for( i = 0; i < (int)mca_bml_r2.num_btl_progress; i++) {
int rc = mca_bml_r2.btl_progress[i]();
/* only positive results contribute to the total */
if(rc > 0) {
count += rc;
}
}
return count;
}
static int mca_bml_r2_add_btls( void )
{
int i;
@ -350,7 +332,7 @@ int mca_bml_r2_add_procs( size_t nprocs,
if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
size_t p;
bool found = false;
for(p=0; p<mca_bml_r2.num_btl_progress; p++) {
for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) {
if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
found = true;
break;
@ -360,6 +342,7 @@ int mca_bml_r2_add_procs( size_t nprocs,
mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] =
btl->btl_component->btl_progress;
mca_bml_r2.num_btl_progress++;
opal_progress_register( btl->btl_component->btl_progress );
}
}
}
@ -485,7 +468,7 @@ int mca_bml_r2_add_procs( size_t nprocs,
}
/*
* iterate through each proc and notify any PTLs associated
* iterate through each proc and notify any BTLs associated
* with the proc that it is/has gone away
*/
@ -515,7 +498,7 @@ int mca_bml_r2_del_procs(size_t nprocs,
size_t f_index, f_size;
size_t n_index, n_size;
/* notify each ptl that the proc is going away */
/* notify each btl that the proc is going away */
f_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_eager);
for(f_index = 0; f_index < f_size; f_index++) {
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_eager, f_index);
@ -560,7 +543,28 @@ int mca_bml_r2_del_procs(size_t nprocs,
return OMPI_SUCCESS;
}
int mca_bml_r2_finalize( void ) {
/**
* Unregister the progress function of a BTL's component and remove it
* from the mca_bml_r2.btl_progress array.
*
* @param btl BTL module whose component progress function is removed.
*
* @return OMPI_SUCCESS when the component has no progress function, or
* when the function was found, unregistered and removed;
* OMPI_ERR_NOT_FOUND when the function is not in the array.
*/
static inline int bml_r2_remove_btl_progress(mca_btl_base_module_t* btl)
{
unsigned int p;
/* nothing to do for components without a progress function */
if(NULL == btl->btl_component->btl_progress) {
return OMPI_SUCCESS;
}
for(p = 0; p < mca_bml_r2.num_btl_progress; p++) {
if(btl->btl_component->btl_progress != mca_bml_r2.btl_progress[p])
continue;
opal_progress_unregister( btl->btl_component->btl_progress );
/* fill the hole with the last entry; the array order is not preserved */
if( p < (mca_bml_r2.num_btl_progress-1) ) {
mca_bml_r2.btl_progress[p] = mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress-1];
}
mca_bml_r2.num_btl_progress--;
/* stop at the first match */
return OMPI_SUCCESS;
}
return OMPI_ERR_NOT_FOUND;
}
int mca_bml_r2_finalize( void )
{
ompi_proc_t** procs;
size_t p, num_procs;
opal_list_item_t* w_item;
@ -579,6 +583,10 @@ int mca_bml_r2_finalize( void ) {
w_item != opal_list_get_end(&mca_btl_base_modules_initialized);
w_item = opal_list_get_next(w_item)) {
mca_btl_base_selected_module_t *sm = (mca_btl_base_selected_module_t *) w_item;
mca_btl_base_module_t* btl = sm->btl_module;
/* unregister the BTL progress function if any */
bml_r2_remove_btl_progress(btl);
/* dont use this btl for any peers */
for(p=0; p<num_procs; p++) {
@ -612,23 +620,25 @@ int mca_bml_r2_finalize( void ) {
int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
{
ompi_proc_t** procs;
size_t i, j, m, p, num_procs;
size_t i, m, p, num_procs;
opal_list_item_t* item;
mca_btl_base_module_t** modules;
mca_btl_base_component_progress_fn_t * btl_progress_new;
bool found = false;
procs = ompi_proc_all(&num_procs);
if(NULL == procs)
return OMPI_SUCCESS;
if(opal_list_get_size(&mca_btl_base_modules_initialized) == 2){
if(opal_list_get_size(&mca_btl_base_modules_initialized) == 2) {
opal_output(0, "only one BTL left, can't failover");
goto CLEANUP;
}
/* Get rid of the associated progress function */
bml_r2_remove_btl_progress(btl);
/* dont use this btl for any peers */
for(p=0; p<num_procs; p++) {
for( p = 0; p < num_procs; p++ ) {
ompi_proc_t* proc = procs[p];
mca_bml_r2_del_proc_btl(proc, btl);
}
@ -660,26 +670,6 @@ int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
mca_bml_r2.btl_modules = modules;
mca_bml_r2.num_btl_modules = m;
if(btl->btl_component->btl_progress) {
/* figure out which progress functions to keep */
/* don't need to keep any if this is the last one.. */
if(mca_bml_r2.num_btl_progress > 1) {
btl_progress_new = (mca_btl_base_component_progress_fn_t*)
malloc(sizeof(mca_btl_base_component_progress_fn_t) *
(mca_bml_r2.num_btl_progress - 1));
j = 0;
for(i = 0; i < mca_bml_r2.num_btl_progress; i++) {
if(btl->btl_component->btl_progress != mca_bml_r2.btl_progress[i]) {
btl_progress_new[j] = mca_bml_r2.btl_progress[i];
j++;
}
}
free(mca_bml_r2.btl_progress);
mca_bml_r2.btl_progress = btl_progress_new;
}
mca_bml_r2.num_btl_progress--;
}
/* cleanup */
btl->btl_finalize(btl);
CLEANUP:

Просмотреть файл

@ -30,7 +30,6 @@ dr_sources = \
pml_dr_iprobe.c \
pml_dr_irecv.c \
pml_dr_isend.c \
pml_dr_progress.c \
pml_dr_recvfrag.c \
pml_dr_recvfrag.h \
pml_dr_recvreq.c \

Просмотреть файл

@ -44,7 +44,7 @@ mca_pml_dr_t mca_pml_dr = {
mca_pml_dr_add_procs,
mca_pml_dr_del_procs,
mca_pml_dr_enable,
mca_pml_dr_progress,
NULL, /*mca_pml_dr_progress,*/
mca_pml_dr_add_comm,
mca_pml_dr_del_comm,
mca_pml_dr_irecv_init,

Просмотреть файл

@ -119,8 +119,6 @@ extern int mca_pml_dr_enable(
bool enable
);
extern int mca_pml_dr_progress(void);
extern int mca_pml_dr_iprobe(
int dst,
int tag,

Просмотреть файл

@ -142,8 +142,6 @@ mca_pml_base_module_t* mca_pml_dr_component_init(int* priority,
return NULL;
}
mca_pml_dr.super.pml_progress = mca_bml.bml_progress;
return &mca_pml_dr.super;
}

Просмотреть файл

@ -1,29 +0,0 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2006 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include "pml_dr.h"
#include "pml_dr_sendreq.h"
#include "ompi/mca/bml/base/base.h"
int mca_pml_dr_progress(void)
{
return mca_bml.bml_progress();
}

Просмотреть файл

@ -181,7 +181,7 @@ mca_pml_ob1_component_init( int* priority,
* to avoid useless functions calls. The event library will instead call
* directly the BML function.
*/
mca_pml_ob1.super.pml_progress = mca_bml.bml_progress;
/*mca_pml_ob1.super.pml_progress = mca_bml.bml_progress;*/
return &mca_pml_ob1.super;
}

Просмотреть файл

@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@ -24,6 +24,54 @@
/**
* Retry the send requests queued on mca_pml_ob1.send_pending.
*
* The list size is sampled once on entry and at most that many requests
* are taken from the list. Requests of type SCHEDULE are handed to
* mca_pml_ob1_send_request_schedule_exclusive(); requests of type START
* are tried on the BTLs of the endpoint's btl_eager array until one
* accepts them, and are put back on the pending list if none does.
*
* @return Number of requests successfully scheduled or started, or 0
* when the list is empty. 0 is also returned as soon as scheduling
* reports OMPI_ERR_OUT_OF_RESOURCE, even if earlier requests
* completed during this call.
*/
int mca_pml_ob1_progress(void)
{
/* NOTE(review): read literally, this return makes everything below
* unreachable. It looks like the previous one-line body of the function
* shown next to its replacement -- confirm which version is intended. */
return mca_bml.bml_progress();
int i, queue_length = opal_list_get_size(&mca_pml_ob1.send_pending);
int j, completed_requests = 0;
bool send_succedded;
/* common case: nothing is pending */
if( OPAL_LIKELY(0 == queue_length) )
return 0;
for( i = 0; i < queue_length; i++ ) {
mca_pml_ob1_send_pending_t pending_type = MCA_PML_OB1_SEND_PENDING_NONE;
mca_pml_ob1_send_request_t* sendreq;
mca_bml_base_endpoint_t* endpoint;
sendreq = get_request_from_send_pending(&pending_type);
/* the list was drained before queue_length requests were taken */
if(OPAL_UNLIKELY(NULL == sendreq))
break;
switch(pending_type) {
case MCA_PML_OB1_SEND_PENDING_NONE:
/* a request was returned without a pending type */
assert(0);
return 0;
case MCA_PML_OB1_SEND_PENDING_SCHEDULE:
/* NOTE(review): the request is not re-queued here on failure;
* presumably the callee does it -- verify. */
if( mca_pml_ob1_send_request_schedule_exclusive(sendreq) ==
OMPI_ERR_OUT_OF_RESOURCE ) {
return 0;
}
completed_requests++;
break;
case MCA_PML_OB1_SEND_PENDING_START:
endpoint = sendreq->req_endpoint;
send_succedded = false;
/* make as many attempts as there are eager BTLs for this endpoint */
for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) {
mca_bml_base_btl_t* bml_btl;
int rc;
/* select a btl */
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl);
if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
send_succedded = true;
completed_requests++;
break;
}
}
/* no BTL accepted the request: put it back on the pending list */
if( false == send_succedded ) {
add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true);
}
}
}
return completed_requests;
}

Просмотреть файл

@ -79,8 +79,8 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
mca_btl_base_tag_t tag,
mca_btl_base_descriptor_t* des,
void* cbdata ) {
mca_btl_base_segment_t* segment = des->des_dst;
mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segment->seg_addr.pval;
mca_btl_base_segment_t* segments = des->des_dst;
mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
ompi_communicator_t *comm_ptr;
mca_pml_ob1_recv_request_t *match = NULL;
mca_pml_ob1_comm_t *comm;
@ -89,7 +89,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
size_t num_segments = des->des_dst_cnt;
size_t bytes_received = 0;
if( OPAL_UNLIKELY(segment->seg_len < OMPI_PML_OB1_MATCH_HDR_LEN) ) {
if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_OB1_MATCH_HDR_LEN) ) {
return;
}
ob1_hdr_ntoh(((mca_pml_ob1_hdr_t*) hdr), MCA_PML_OB1_HDR_TYPE_MATCH);
@ -151,7 +151,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
match = match_one(btl, hdr, segment, num_segments, comm_ptr, proc, frag);
match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);
/* The match is over. We generate the SEARCH_POSTED_Q_END here,
* before going into the mca_pml_ob1_check_cantmatch_for_match so
@ -165,7 +165,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
OPAL_THREAD_UNLOCK(&comm->matching_lock);
if(OPAL_LIKELY(match)) {
bytes_received = segment->seg_len - OMPI_PML_OB1_MATCH_HDR_LEN;
bytes_received = segments->seg_len - OMPI_PML_OB1_MATCH_HDR_LEN;
match->req_recv.req_bytes_packed = bytes_received;
MCA_PML_OB1_RECV_REQUEST_MATCHED(match, hdr);
@ -184,12 +184,12 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
);
iov[0].iov_len = bytes_received;
iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segment->seg_addr.pval +
iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
OMPI_PML_OB1_MATCH_HDR_LEN);
while (iov_count < num_segments) {
bytes_received += segment[iov_count].seg_len;
iov[iov_count].iov_len = segment[iov_count].seg_len;
iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segment[iov_count].seg_addr.pval);
bytes_received += segments[iov_count].seg_len;
iov[iov_count].iov_len = segments[iov_count].seg_len;
iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
iov_count++;
}
ompi_convertor_unpack( &match->req_recv.req_base.req_convertor,
@ -216,7 +216,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
slow_path:
OPAL_THREAD_UNLOCK(&comm->matching_lock);
mca_pml_ob1_recv_frag_match(btl, hdr, segment,
mca_pml_ob1_recv_frag_match(btl, hdr, segments,
num_segments, MCA_PML_OB1_HDR_TYPE_MATCH);
}

Просмотреть файл

@ -60,9 +60,9 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
case MCA_PML_OB1_SEND_PENDING_START:
send_dst = mca_bml_base_btl_array_find(
&sendreq->req_endpoint->btl_eager, bml_btl->btl);
if(NULL == send_dst ||
mca_pml_ob1_send_request_start_btl(sendreq, send_dst) ==
OMPI_ERR_OUT_OF_RESOURCE) {
if( (NULL == send_dst) ||
(mca_pml_ob1_send_request_start_btl(sendreq, send_dst) ==
OMPI_ERR_OUT_OF_RESOURCE) ) {
/* prepend to the pending list to minimize reordering in case
* send_dst != 0 */
add_request_to_send_pending(sendreq,
@ -561,7 +561,7 @@ int mca_pml_ob1_send_request_start_copy( mca_pml_ob1_send_request_t* sendreq,
/* send */
rc = mca_bml_base_send_status(bml_btl, des, MCA_PML_OB1_HDR_TYPE_MATCH);
if( OPAL_LIKELY( rc >= 0 ) ) {
if( OPAL_LIKELY( rc >= OMPI_SUCCESS ) ) {
if( OPAL_LIKELY( 1 == rc ) ) {
mca_pml_ob1_match_completion_free_request( bml_btl, sendreq );
}
@ -1203,8 +1203,7 @@ int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag )
frag->rdma_length = save_size;
if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending,
(opal_list_item_t*)frag);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
return OMPI_ERR_OUT_OF_RESOURCE;
} else {