More fixes for failover, with still more to come.
This commit was SVN r12450.
Этот коммит содержится в:
родитель
ea77beca29
Коммит
813e7faea8
@ -561,12 +561,17 @@ int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
|
|||||||
opal_list_item_t* item;
|
opal_list_item_t* item;
|
||||||
mca_btl_base_module_t** modules;
|
mca_btl_base_module_t** modules;
|
||||||
mca_btl_base_component_progress_fn_t * btl_progress_new;
|
mca_btl_base_component_progress_fn_t * btl_progress_new;
|
||||||
|
bool found = false;
|
||||||
|
|
||||||
procs = ompi_proc_all(&num_procs);
|
procs = ompi_proc_all(&num_procs);
|
||||||
if(NULL == procs)
|
if(NULL == procs)
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
|
|
||||||
|
if(opal_list_get_size(&mca_btl_base_modules_initialized) == 2){
|
||||||
|
opal_output(0, "only one BTL left, can't failover");
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/* dont use this btl for any peers */
|
/* dont use this btl for any peers */
|
||||||
for(p=0; p<num_procs; p++) {
|
for(p=0; p<num_procs; p++) {
|
||||||
ompi_proc_t* proc = procs[p];
|
ompi_proc_t* proc = procs[p];
|
||||||
@ -581,10 +586,14 @@ int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
|
|||||||
if(sm->btl_module == btl) {
|
if(sm->btl_module == btl) {
|
||||||
opal_list_remove_item(&mca_btl_base_modules_initialized, item);
|
opal_list_remove_item(&mca_btl_base_modules_initialized, item);
|
||||||
free(sm);
|
free(sm);
|
||||||
|
found = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(!found) {
|
||||||
|
/* doesn't even exist */
|
||||||
|
return OMPI_SUCCESS;
|
||||||
|
}
|
||||||
/* remove from bml list */
|
/* remove from bml list */
|
||||||
modules = (mca_btl_base_module_t**)malloc(sizeof(mca_btl_base_module_t*) * mca_bml_r2.num_btl_modules-1);
|
modules = (mca_btl_base_module_t**)malloc(sizeof(mca_btl_base_module_t*) * mca_bml_r2.num_btl_modules-1);
|
||||||
for(i=0,m=0; i<mca_bml_r2.num_btl_modules; i++) {
|
for(i=0,m=0; i<mca_bml_r2.num_btl_modules; i++) {
|
||||||
@ -597,15 +606,10 @@ int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
|
|||||||
mca_bml_r2.num_btl_modules = m;
|
mca_bml_r2.num_btl_modules = m;
|
||||||
|
|
||||||
|
|
||||||
/* remove progress function so btl_progress isn't
|
if(btl->btl_component->btl_progress) {
|
||||||
called on the failed BTL */
|
|
||||||
if(mca_bml_r2.num_btl_progress <= 1) {
|
|
||||||
/* nothing left to send on! */
|
|
||||||
opal_output(0, "%s:%d:%s: only one BTL, can't fail-over!",
|
|
||||||
__FILE__, __LINE__, __func__);
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
/* figure out which progress functions to keep */
|
/* figure out which progress functions to keep */
|
||||||
|
/* don't need to keep any if this is the last one.. */
|
||||||
|
if(mca_bml_r2.num_btl_progress > 1) {
|
||||||
btl_progress_new = (mca_btl_base_component_progress_fn_t*)
|
btl_progress_new = (mca_btl_base_component_progress_fn_t*)
|
||||||
malloc(sizeof(mca_btl_base_component_progress_fn_t) *
|
malloc(sizeof(mca_btl_base_component_progress_fn_t) *
|
||||||
(mca_bml_r2.num_btl_progress - 1));
|
(mca_bml_r2.num_btl_progress - 1));
|
||||||
@ -618,8 +622,9 @@ int mca_bml_r2_del_btl(mca_btl_base_module_t* btl)
|
|||||||
}
|
}
|
||||||
free(mca_bml_r2.btl_progress);
|
free(mca_bml_r2.btl_progress);
|
||||||
mca_bml_r2.btl_progress = btl_progress_new;
|
mca_bml_r2.btl_progress = btl_progress_new;
|
||||||
|
}
|
||||||
mca_bml_r2.num_btl_progress--;
|
mca_bml_r2.num_btl_progress--;
|
||||||
|
}
|
||||||
/* cleanup */
|
/* cleanup */
|
||||||
btl->btl_finalize(btl);
|
btl->btl_finalize(btl);
|
||||||
free(procs);
|
free(procs);
|
||||||
|
@ -234,6 +234,8 @@ void mca_pml_dr_error_handler(
|
|||||||
struct mca_btl_base_module_t* btl,
|
struct mca_btl_base_module_t* btl,
|
||||||
int32_t flags) {
|
int32_t flags) {
|
||||||
/* try failover ! */
|
/* try failover ! */
|
||||||
|
opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__,
|
||||||
|
btl->btl_component->btl_version.mca_component_name);
|
||||||
mca_pml_dr_sendreq_cleanup_active(btl);
|
mca_pml_dr_sendreq_cleanup_active(btl);
|
||||||
mca_bml.bml_del_btl(btl);
|
mca_bml.bml_del_btl(btl);
|
||||||
/* orte_errmgr.abort(); */
|
/* orte_errmgr.abort(); */
|
||||||
|
@ -144,7 +144,8 @@ void mca_pml_dr_recv_frag_callback(
|
|||||||
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs, hdr->hdr_common.hdr_vid)) {
|
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs, hdr->hdr_common.hdr_vid)) {
|
||||||
MCA_PML_DR_DEBUG(0,(0, "%s:%d: got a duplicate vfrag vfrag id %d\n",
|
MCA_PML_DR_DEBUG(0,(0, "%s:%d: got a duplicate vfrag vfrag id %d\n",
|
||||||
__FILE__, __LINE__, hdr->hdr_common.hdr_vid));
|
__FILE__, __LINE__, hdr->hdr_common.hdr_vid));
|
||||||
mca_pml_dr_recv_frag_ack(ep->bml_endpoint,
|
mca_pml_dr_recv_frag_ack(btl,
|
||||||
|
ep->bml_endpoint,
|
||||||
&hdr->hdr_common,
|
&hdr->hdr_common,
|
||||||
hdr->hdr_match.hdr_src_ptr.pval,
|
hdr->hdr_match.hdr_src_ptr.pval,
|
||||||
1, 0);
|
1, 0);
|
||||||
@ -198,7 +199,8 @@ void mca_pml_dr_recv_frag_callback(
|
|||||||
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs_matched, hdr->hdr_common.hdr_vid)) {
|
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs_matched, hdr->hdr_common.hdr_vid)) {
|
||||||
MCA_PML_DR_DEBUG(0, (0, "%s:%d: acking duplicate matched rendezvous from sequence tracker\n",
|
MCA_PML_DR_DEBUG(0, (0, "%s:%d: acking duplicate matched rendezvous from sequence tracker\n",
|
||||||
__FILE__, __LINE__));
|
__FILE__, __LINE__));
|
||||||
mca_pml_dr_recv_frag_ack(ep->bml_endpoint,
|
mca_pml_dr_recv_frag_ack(btl,
|
||||||
|
ep->bml_endpoint,
|
||||||
&hdr->hdr_common,
|
&hdr->hdr_common,
|
||||||
hdr->hdr_match.hdr_src_ptr.pval,
|
hdr->hdr_match.hdr_src_ptr.pval,
|
||||||
~(uint64_t) 0, hdr->hdr_rndv.hdr_msg_length);
|
~(uint64_t) 0, hdr->hdr_rndv.hdr_msg_length);
|
||||||
@ -220,12 +222,13 @@ void mca_pml_dr_recv_frag_callback(
|
|||||||
if(NULL != recvreq) {
|
if(NULL != recvreq) {
|
||||||
MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate matched rendezvous from pending matched vfrag id %d\n",
|
MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate matched rendezvous from pending matched vfrag id %d\n",
|
||||||
__FILE__, __LINE__, hdr->hdr_common.hdr_vid));
|
__FILE__, __LINE__, hdr->hdr_common.hdr_vid));
|
||||||
mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_common,
|
mca_pml_dr_recv_request_ack(btl, recvreq, &hdr->hdr_common,
|
||||||
hdr->hdr_match.hdr_src_ptr, recvreq->req_bytes_received, 1);
|
hdr->hdr_match.hdr_src_ptr, recvreq->req_bytes_received, 1);
|
||||||
} else {
|
} else {
|
||||||
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs_matched, hdr->hdr_common.hdr_vid)) {
|
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs_matched, hdr->hdr_common.hdr_vid)) {
|
||||||
MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate matched rendezvous from sequence tracker\n", __FILE__, __LINE__));
|
MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate matched rendezvous from sequence tracker\n", __FILE__, __LINE__));
|
||||||
mca_pml_dr_recv_frag_ack(ep->bml_endpoint,
|
mca_pml_dr_recv_frag_ack(btl,
|
||||||
|
ep->bml_endpoint,
|
||||||
&hdr->hdr_common,
|
&hdr->hdr_common,
|
||||||
hdr->hdr_match.hdr_src_ptr.pval,
|
hdr->hdr_match.hdr_src_ptr.pval,
|
||||||
~(uint64_t) 0, hdr->hdr_rndv.hdr_msg_length);
|
~(uint64_t) 0, hdr->hdr_rndv.hdr_msg_length);
|
||||||
@ -277,7 +280,8 @@ void mca_pml_dr_recv_frag_callback(
|
|||||||
/* seq_recvs protected by matching lock */
|
/* seq_recvs protected by matching lock */
|
||||||
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs, hdr->hdr_common.hdr_vid)) {
|
if(ompi_seq_tracker_check_duplicate(&ep->seq_recvs, hdr->hdr_common.hdr_vid)) {
|
||||||
MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate fragment\n", __FILE__, __LINE__));
|
MCA_PML_DR_DEBUG(0,(0, "%s:%d: acking duplicate fragment\n", __FILE__, __LINE__));
|
||||||
mca_pml_dr_recv_frag_ack(ep->bml_endpoint,
|
mca_pml_dr_recv_frag_ack(btl,
|
||||||
|
ep->bml_endpoint,
|
||||||
&hdr->hdr_common,
|
&hdr->hdr_common,
|
||||||
hdr->hdr_frag.hdr_src_ptr.pval,
|
hdr->hdr_frag.hdr_src_ptr.pval,
|
||||||
~(uint64_t) 0, 0);
|
~(uint64_t) 0, 0);
|
||||||
@ -694,7 +698,8 @@ rematch:
|
|||||||
}
|
}
|
||||||
MCA_PML_DR_RECV_FRAG_INIT(frag,ompi_proc,hdr,segments,num_segments,btl,csum);
|
MCA_PML_DR_RECV_FRAG_INIT(frag,ompi_proc,hdr,segments,num_segments,btl,csum);
|
||||||
if(do_csum && csum != hdr->hdr_csum) {
|
if(do_csum && csum != hdr->hdr_csum) {
|
||||||
mca_pml_dr_recv_frag_ack((mca_bml_base_endpoint_t*)ompi_proc->proc_bml,
|
mca_pml_dr_recv_frag_ack(btl,
|
||||||
|
(mca_bml_base_endpoint_t*)ompi_proc->proc_bml,
|
||||||
&hdr->hdr_common, hdr->hdr_src_ptr.pval, 0, 0);
|
&hdr->hdr_common, hdr->hdr_src_ptr.pval, 0, 0);
|
||||||
MCA_PML_DR_DEBUG(0,(0, "%s:%d: received corrupted data 0x%08x != 0x%08x (segments %d length %d)\n",
|
MCA_PML_DR_DEBUG(0,(0, "%s:%d: received corrupted data 0x%08x != 0x%08x (segments %d length %d)\n",
|
||||||
__FILE__, __LINE__, csum, hdr->hdr_csum, num_segments,
|
__FILE__, __LINE__, csum, hdr->hdr_csum, num_segments,
|
||||||
@ -730,7 +735,8 @@ rematch:
|
|||||||
}
|
}
|
||||||
MCA_PML_DR_RECV_FRAG_INIT(frag,ompi_proc,hdr,segments,num_segments,btl,csum);
|
MCA_PML_DR_RECV_FRAG_INIT(frag,ompi_proc,hdr,segments,num_segments,btl,csum);
|
||||||
if(do_csum && csum != hdr->hdr_csum) {
|
if(do_csum && csum != hdr->hdr_csum) {
|
||||||
mca_pml_dr_recv_frag_ack((mca_bml_base_endpoint_t*)ompi_proc->proc_bml,
|
mca_pml_dr_recv_frag_ack(btl,
|
||||||
|
(mca_bml_base_endpoint_t*)ompi_proc->proc_bml,
|
||||||
&hdr->hdr_common, hdr->hdr_src_ptr.pval, 0, 0);
|
&hdr->hdr_common, hdr->hdr_src_ptr.pval, 0, 0);
|
||||||
MCA_PML_DR_DEBUG(0,(0, "%s:%d: received corrupted data 0x%08x != 0x%08x\n",
|
MCA_PML_DR_DEBUG(0,(0, "%s:%d: received corrupted data 0x%08x != 0x%08x\n",
|
||||||
__FILE__, __LINE__, csum, hdr->hdr_csum));
|
__FILE__, __LINE__, csum, hdr->hdr_csum));
|
||||||
@ -755,7 +761,8 @@ rematch:
|
|||||||
MCA_PML_DR_DEBUG(1,(0, "%s:%d: received short message, acking now vfrag id: %d\n",
|
MCA_PML_DR_DEBUG(1,(0, "%s:%d: received short message, acking now vfrag id: %d\n",
|
||||||
__FILE__, __LINE__, hdr->hdr_common.hdr_vid));
|
__FILE__, __LINE__, hdr->hdr_common.hdr_vid));
|
||||||
|
|
||||||
mca_pml_dr_recv_frag_ack((mca_bml_base_endpoint_t*)ompi_proc->proc_bml,
|
mca_pml_dr_recv_frag_ack(btl,
|
||||||
|
(mca_bml_base_endpoint_t*)ompi_proc->proc_bml,
|
||||||
&hdr->hdr_common, hdr->hdr_src_ptr.pval, 1, 0);
|
&hdr->hdr_common, hdr->hdr_src_ptr.pval, 1, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -773,6 +780,7 @@ rematch:
|
|||||||
|
|
||||||
|
|
||||||
void mca_pml_dr_recv_frag_ack(
|
void mca_pml_dr_recv_frag_ack(
|
||||||
|
mca_btl_base_module_t* btl,
|
||||||
mca_bml_base_endpoint_t* endpoint,
|
mca_bml_base_endpoint_t* endpoint,
|
||||||
mca_pml_dr_common_hdr_t* hdr,
|
mca_pml_dr_common_hdr_t* hdr,
|
||||||
void *src_ptr,
|
void *src_ptr,
|
||||||
@ -785,7 +793,11 @@ void mca_pml_dr_recv_frag_ack(
|
|||||||
mca_pml_dr_ack_hdr_t* ack;
|
mca_pml_dr_ack_hdr_t* ack;
|
||||||
int rc;
|
int rc;
|
||||||
bool do_csum;
|
bool do_csum;
|
||||||
bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
|
|
||||||
|
/* use the same BTL for ACK's makes failover SANE */
|
||||||
|
bml_btl = mca_bml_base_btl_array_find(&endpoint->btl_eager,
|
||||||
|
btl);
|
||||||
|
|
||||||
do_csum = mca_pml_dr.enable_csum &&
|
do_csum = mca_pml_dr.enable_csum &&
|
||||||
(bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
|
(bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
|
||||||
|
|
||||||
|
@ -133,6 +133,7 @@ do { \
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
void mca_pml_dr_recv_frag_ack(
|
void mca_pml_dr_recv_frag_ack(
|
||||||
|
mca_btl_base_module_t* btl,
|
||||||
mca_bml_base_endpoint_t* endpoint,
|
mca_bml_base_endpoint_t* endpoint,
|
||||||
mca_pml_dr_common_hdr_t* hdr,
|
mca_pml_dr_common_hdr_t* hdr,
|
||||||
void* src_ptr,
|
void* src_ptr,
|
||||||
|
@ -36,7 +36,7 @@
|
|||||||
* as we need to put the match back on the list if the checksum
|
* as we need to put the match back on the list if the checksum
|
||||||
* fails for later matching
|
* fails for later matching
|
||||||
*/
|
*/
|
||||||
#define MCA_PML_DR_RECV_REQUEST_MATCH_ACK(do_csum,recvreq,hdr,csum,bytes_received) \
|
#define MCA_PML_DR_RECV_REQUEST_MATCH_ACK(btl,do_csum,recvreq,hdr,csum,bytes_received) \
|
||||||
if(do_csum && csum != hdr->hdr_match.hdr_csum) { \
|
if(do_csum && csum != hdr->hdr_match.hdr_csum) { \
|
||||||
/* failed the csum, put the request back on the list for \
|
/* failed the csum, put the request back on the list for \
|
||||||
* matching later on retransmission \
|
* matching later on retransmission \
|
||||||
@ -46,7 +46,8 @@ if(do_csum && csum != hdr->hdr_match.hdr_csum) { \
|
|||||||
} else { \
|
} else { \
|
||||||
mca_pml_dr_recv_request_match_specific(recvreq); \
|
mca_pml_dr_recv_request_match_specific(recvreq); \
|
||||||
} \
|
} \
|
||||||
mca_pml_dr_recv_frag_ack(recvreq->req_endpoint->bml_endpoint, \
|
mca_pml_dr_recv_frag_ack(btl, \
|
||||||
|
recvreq->req_endpoint->bml_endpoint, \
|
||||||
&hdr->hdr_common, \
|
&hdr->hdr_common, \
|
||||||
hdr->hdr_match.hdr_src_ptr.pval, \
|
hdr->hdr_match.hdr_src_ptr.pval, \
|
||||||
0, 0); \
|
0, 0); \
|
||||||
@ -59,7 +60,7 @@ if(do_csum && csum != hdr->hdr_match.hdr_csum) { \
|
|||||||
} else if (recvreq->req_acked == false) { \
|
} else if (recvreq->req_acked == false) { \
|
||||||
MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d", \
|
MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d", \
|
||||||
__FILE__, __LINE__, recvreq->req_vfrag0.vf_id)); \
|
__FILE__, __LINE__, recvreq->req_vfrag0.vf_id)); \
|
||||||
mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_common, \
|
mca_pml_dr_recv_request_ack(btl, recvreq, &hdr->hdr_common, \
|
||||||
hdr->hdr_match.hdr_src_ptr, bytes_received, 1); \
|
hdr->hdr_match.hdr_src_ptr, bytes_received, 1); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -162,6 +163,7 @@ static void mca_pml_dr_ctl_completion(
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
void mca_pml_dr_recv_request_ack(
|
void mca_pml_dr_recv_request_ack(
|
||||||
|
mca_btl_base_module_t* btl,
|
||||||
mca_pml_dr_recv_request_t* recvreq,
|
mca_pml_dr_recv_request_t* recvreq,
|
||||||
mca_pml_dr_common_hdr_t* hdr,
|
mca_pml_dr_common_hdr_t* hdr,
|
||||||
ompi_ptr_t src_ptr,
|
ompi_ptr_t src_ptr,
|
||||||
@ -174,8 +176,11 @@ void mca_pml_dr_recv_request_ack(
|
|||||||
int rc;
|
int rc;
|
||||||
bool do_csum;
|
bool do_csum;
|
||||||
|
|
||||||
|
|
||||||
|
/* use the same BTL for ACK's makes failover SANE */
|
||||||
|
bml_btl = mca_bml_base_btl_array_find(&recvreq->req_endpoint->bml_endpoint->btl_eager,
|
||||||
|
btl);
|
||||||
/* allocate descriptor */
|
/* allocate descriptor */
|
||||||
bml_btl = mca_bml_base_btl_array_get_next(&recvreq->req_endpoint->bml_endpoint->btl_eager);
|
|
||||||
do_csum = mca_pml_dr.enable_csum &&
|
do_csum = mca_pml_dr.enable_csum &&
|
||||||
(bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
|
(bml_btl->btl_flags & MCA_BTL_FLAGS_NEED_CSUM);
|
||||||
MCA_PML_DR_DES_ALLOC(bml_btl, des, sizeof(mca_pml_dr_ack_hdr_t));
|
MCA_PML_DR_DES_ALLOC(bml_btl, des, sizeof(mca_pml_dr_ack_hdr_t));
|
||||||
@ -254,7 +259,7 @@ void mca_pml_dr_recv_request_progress(
|
|||||||
bytes_received,
|
bytes_received,
|
||||||
bytes_delivered,
|
bytes_delivered,
|
||||||
csum);
|
csum);
|
||||||
MCA_PML_DR_RECV_REQUEST_MATCH_ACK(do_csum, recvreq,hdr,csum,bytes_received);
|
MCA_PML_DR_RECV_REQUEST_MATCH_ACK(btl,do_csum, recvreq,hdr,csum,bytes_received);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -272,7 +277,7 @@ void mca_pml_dr_recv_request_progress(
|
|||||||
bytes_received,
|
bytes_received,
|
||||||
bytes_delivered,
|
bytes_delivered,
|
||||||
csum);
|
csum);
|
||||||
MCA_PML_DR_RECV_REQUEST_MATCH_ACK(do_csum, recvreq,hdr,csum,bytes_received);
|
MCA_PML_DR_RECV_REQUEST_MATCH_ACK(btl,do_csum, recvreq,hdr,csum,bytes_received);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -283,7 +288,8 @@ void mca_pml_dr_recv_request_progress(
|
|||||||
if(vfrag->vf_ack == vfrag->vf_mask) {
|
if(vfrag->vf_ack == vfrag->vf_mask) {
|
||||||
MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d",
|
MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d",
|
||||||
__FILE__, __LINE__, vfrag->vf_id));
|
__FILE__, __LINE__, vfrag->vf_id));
|
||||||
mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_common,
|
mca_pml_dr_recv_request_ack(btl,
|
||||||
|
recvreq, &hdr->hdr_common,
|
||||||
hdr->hdr_frag.hdr_src_ptr,
|
hdr->hdr_frag.hdr_src_ptr,
|
||||||
vfrag->vf_size,
|
vfrag->vf_size,
|
||||||
vfrag->vf_mask);
|
vfrag->vf_mask);
|
||||||
@ -318,7 +324,7 @@ void mca_pml_dr_recv_request_progress(
|
|||||||
ompi_seq_tracker_insert(&recvreq->req_endpoint->seq_recvs, vfrag->vf_id);
|
ompi_seq_tracker_insert(&recvreq->req_endpoint->seq_recvs, vfrag->vf_id);
|
||||||
MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d",
|
MCA_PML_DR_DEBUG(1,(0, "%s:%d: sending ack, vfrag ID %d",
|
||||||
__FILE__, __LINE__, vfrag->vf_id));
|
__FILE__, __LINE__, vfrag->vf_id));
|
||||||
mca_pml_dr_recv_request_ack(recvreq, &hdr->hdr_common,
|
mca_pml_dr_recv_request_ack(btl, recvreq, &hdr->hdr_common,
|
||||||
hdr->hdr_frag.hdr_src_ptr,
|
hdr->hdr_frag.hdr_src_ptr,
|
||||||
vfrag->vf_size, vfrag->vf_mask);
|
vfrag->vf_size, vfrag->vf_mask);
|
||||||
}
|
}
|
||||||
|
@ -177,6 +177,7 @@ void mca_pml_dr_recv_request_match_specific(mca_pml_dr_recv_request_t* request);
|
|||||||
* Ack a matched request.
|
* Ack a matched request.
|
||||||
*/
|
*/
|
||||||
void mca_pml_dr_recv_request_ack(
|
void mca_pml_dr_recv_request_ack(
|
||||||
|
mca_btl_base_module_t* blt,
|
||||||
mca_pml_dr_recv_request_t* recvreq,
|
mca_pml_dr_recv_request_t* recvreq,
|
||||||
mca_pml_dr_common_hdr_t* hdr,
|
mca_pml_dr_common_hdr_t* hdr,
|
||||||
ompi_ptr_t src_ptr,
|
ompi_ptr_t src_ptr,
|
||||||
|
@ -138,11 +138,6 @@ do {
|
|||||||
/* } */ \
|
/* } */ \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Start a send request.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define MCA_PML_DR_SEND_REQUEST_START(sendreq, rc) \
|
#define MCA_PML_DR_SEND_REQUEST_START(sendreq, rc) \
|
||||||
do { \
|
do { \
|
||||||
mca_pml_dr_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm; \
|
mca_pml_dr_comm_t* comm = sendreq->req_send.req_base.req_comm->c_pml_comm; \
|
||||||
|
@ -79,10 +79,14 @@ static void mca_pml_dr_vfrag_wdog_timeout(int fd, short event, void* data)
|
|||||||
/* check for hung btl */
|
/* check for hung btl */
|
||||||
if(++vfrag->vf_wdog_cnt == mca_pml_dr.wdog_retry_max) {
|
if(++vfrag->vf_wdog_cnt == mca_pml_dr.wdog_retry_max) {
|
||||||
/* declare btl dead */
|
/* declare btl dead */
|
||||||
|
if(vfrag->bml_btl->btl) {
|
||||||
opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__,
|
opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__,
|
||||||
vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name);
|
vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name);
|
||||||
mca_pml_dr_sendreq_cleanup_active(vfrag->bml_btl->btl);
|
mca_pml_dr_sendreq_cleanup_active(vfrag->bml_btl->btl);
|
||||||
mca_bml.bml_del_btl(vfrag->bml_btl->btl);
|
mca_bml.bml_del_btl(vfrag->bml_btl->btl);
|
||||||
|
} else {
|
||||||
|
opal_output(0, "%s:%d:%s: failing already failed BTL", __FILE__, __LINE__, __func__);
|
||||||
|
}
|
||||||
mca_pml_dr_vfrag_reset(vfrag);
|
mca_pml_dr_vfrag_reset(vfrag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user