Try to fail over when we get an asynchronous error from the lower layer (BTL).
This commit was SVN r12420.
Этот коммит содержится в:
родитель
27420fbda3
Коммит
f7c554df65
@ -233,5 +233,10 @@ int mca_pml_dr_dump(
|
||||
void mca_pml_dr_error_handler(
|
||||
struct mca_btl_base_module_t* btl,
|
||||
int32_t flags) {
|
||||
orte_errmgr.abort();
|
||||
/* try failover ! */
|
||||
mca_pml_dr_sendreq_cleanup_active(btl);
|
||||
mca_bml.bml_del_btl(btl);
|
||||
/* orte_errmgr.abort(); */
|
||||
}
|
||||
|
||||
|
||||
|
@ -1145,3 +1145,18 @@ void mca_pml_dr_send_request_frag_ack(
|
||||
}
|
||||
|
||||
|
||||
/**
 * Detach active send-request descriptors from a given BTL.
 *
 * Walks mca_pml_dr.send_active and, for each request whose descriptor's
 * des_context is a non-NULL BML BTL wrapping @p btl, resets that
 * des_context to NULL. Requests bound to any other BTL are left alone.
 *
 * @param btl  BTL module whose descriptors should be detached.
 */
void mca_pml_dr_sendreq_cleanup_active(mca_btl_base_module_t* btl) {
    opal_list_item_t* cur = opal_list_get_first(&mca_pml_dr.send_active);

    while (cur != opal_list_get_end(&mca_pml_dr.send_active)) {
        /* Each list item is a send request; look at its descriptor. */
        mca_btl_base_descriptor_t* desc =
            ((mca_pml_dr_send_request_t*) cur)->req_descriptor;
        mca_bml_base_btl_t* owner = desc->des_context;

        if (NULL != owner && btl == owner->btl) {
            desc->des_context = NULL;
        }

        cur = opal_list_get_next(cur);
    }
}
|
||||
|
@ -484,6 +484,8 @@ void mca_pml_dr_send_request_frag_ack(
|
||||
mca_btl_base_module_t* btl,
|
||||
mca_pml_dr_ack_hdr_t*);
|
||||
|
||||
/**
 * Reset to NULL the des_context of every descriptor on the active send
 * list whose BML BTL refers to the given BTL module.
 *
 * @param btl  BTL module to detach active send descriptors from.
 */
void mca_pml_dr_sendreq_cleanup_active(mca_btl_base_module_t* btl);
|
||||
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
|
@ -24,7 +24,6 @@
|
||||
|
||||
/* Forward declarations of file-local (static) functions defined further
 * down in this file: the watchdog-timeout and ack-timeout handlers, and
 * the helper that clears des_context on active send descriptors. */
static void mca_pml_dr_vfrag_wdog_timeout(int fd, short event, void* vfrag);
|
||||
static void mca_pml_dr_vfrag_ack_timeout(int fd, short event, void* vfrag);
|
||||
static void mca_pml_dr_vfrag_cleanup_active_desc(mca_bml_base_btl_t* bml_btl);
|
||||
|
||||
static void mca_pml_dr_vfrag_construct(mca_pml_dr_vfrag_t* vfrag)
|
||||
{
|
||||
@ -82,7 +81,7 @@ static void mca_pml_dr_vfrag_wdog_timeout(int fd, short event, void* data)
|
||||
/* declare btl dead */
|
||||
opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__,
|
||||
vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
mca_pml_dr_vfrag_cleanup_active_desc(vfrag->bml_btl);
|
||||
mca_pml_dr_sendreq_cleanup_active(vfrag->bml_btl->btl);
|
||||
mca_bml.bml_del_btl(vfrag->bml_btl->btl);
|
||||
mca_pml_dr_vfrag_reset(vfrag);
|
||||
}
|
||||
@ -119,7 +118,7 @@ static void mca_pml_dr_vfrag_ack_timeout(int fd, short event, void* data)
|
||||
/* declare btl dead */
|
||||
opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__,
|
||||
vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||
mca_pml_dr_vfrag_cleanup_active_desc(vfrag->bml_btl);
|
||||
mca_pml_dr_sendreq_cleanup_active(vfrag->bml_btl->btl);
|
||||
mca_bml.bml_del_btl(vfrag->bml_btl->btl);
|
||||
mca_pml_dr_vfrag_reset(vfrag);
|
||||
}
|
||||
@ -198,18 +197,3 @@ void mca_pml_dr_vfrag_reschedule(mca_pml_dr_vfrag_t* vfrag)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Detach active send-request descriptors from a given BML BTL.
 *
 * Walks mca_pml_dr.send_active and resets des_context to NULL on every
 * request descriptor whose des_context is exactly @p bml_btl.
 *
 * @param bml_btl  BML BTL whose descriptors should be detached.
 */
static void mca_pml_dr_vfrag_cleanup_active_desc(mca_bml_base_btl_t* bml_btl) {
    opal_list_item_t* cur = opal_list_get_first(&mca_pml_dr.send_active);

    while (cur != opal_list_get_end(&mca_pml_dr.send_active)) {
        /* Each list item is a send request; look at its descriptor. */
        mca_btl_base_descriptor_t* desc =
            ((mca_pml_dr_send_request_t*) cur)->req_descriptor;

        if (bml_btl == desc->des_context) {
            desc->des_context = NULL;
        }

        cur = opal_list_get_next(cur);
    }
}
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user