2005-11-22 17:24:47 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2006-08-24 16:38:08 +00:00
|
|
|
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
2005-11-22 17:24:47 +00:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2005-12-20 21:42:58 +00:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
2005-11-22 17:24:47 +00:00
|
|
|
* University of Stuttgart. All rights reserved.
|
2006-03-27 22:44:26 +00:00
|
|
|
* Copyright (c) 2004-2006 The Regents of the University of California.
|
2005-11-22 17:24:47 +00:00
|
|
|
* All rights reserved.
|
|
|
|
* $COPYRIGHT$
|
2005-12-20 21:42:58 +00:00
|
|
|
*
|
2005-11-22 17:24:47 +00:00
|
|
|
* Additional copyrights may follow
|
2005-12-20 21:42:58 +00:00
|
|
|
*
|
2005-11-22 17:24:47 +00:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
2005-12-20 21:42:58 +00:00
|
|
|
|
2005-11-22 17:24:47 +00:00
|
|
|
#include "ompi_config.h"
|
2005-12-20 21:42:58 +00:00
|
|
|
#include "pml_dr_vfrag.h"
|
2006-02-24 17:08:14 +00:00
|
|
|
#include "pml_dr_sendreq.h"
|
2006-05-04 16:16:26 +00:00
|
|
|
#include "ompi/mca/bml/base/base.h"
|
2006-03-16 22:33:08 +00:00
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
|
|
|
2006-05-04 16:16:26 +00:00
|
|
|
/* Timer callbacks defined further down in this file.  The last argument is
 * the pointer handed to opal_evtimer_set(), i.e. the owning vfrag. */
static void mca_pml_dr_vfrag_ack_timeout(int fd, short event, void* data);
static void mca_pml_dr_vfrag_wdog_timeout(int fd, short event, void* data);
|
2005-12-20 21:42:58 +00:00
|
|
|
|
|
|
|
/**
 * Constructor for mca_pml_dr_vfrag_t.
 *
 * Clears the request back-pointers and the bookkeeping fields, seeds both
 * timer intervals from the values configured in mca_pml_dr, and binds the
 * watchdog and ack events to their callbacks.
 *
 * @param vfrag  Object being constructed.
 */
static void mca_pml_dr_vfrag_construct(mca_pml_dr_vfrag_t* vfrag)
{
    /* no owning send / receive request yet */
    vfrag->vf_recv.pval = NULL;
    vfrag->vf_send.pval = NULL;

    /* identification and extent */
    vfrag->vf_id = 0;
    vfrag->vf_idx = 0;
    vfrag->vf_offset = 0;
    vfrag->vf_len = 0;
    vfrag->vf_size = 0;
    vfrag->vf_max_send_size = 0;

    /* acknowledgement bookkeeping; note the mask starts at 1, not 0 */
    vfrag->vf_state = 0;
    vfrag->vf_ack = 0;
    vfrag->vf_mask = 1;

    /* retry counters start at zero, intervals at the configured base values */
    vfrag->vf_ack_cnt = 0;
    vfrag->vf_wdog_cnt = 0;
    vfrag->vf_ack_tv = mca_pml_dr.ack_timer;
    vfrag->vf_wdog_tv = mca_pml_dr.wdog_timer;

    /* each timer event passes this vfrag to its callback */
    opal_evtimer_set(&vfrag->vf_ack_ev, mca_pml_dr_vfrag_ack_timeout, (void*) vfrag);
    opal_evtimer_set(&vfrag->vf_wdog_ev, mca_pml_dr_vfrag_wdog_timeout, (void*) vfrag);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Destructor for mca_pml_dr_vfrag_t.
 *
 * Intentionally empty: the constructor acquires nothing that has to be
 * released here.
 *
 * @param vfrag  Object being destroyed (unused).
 */
static void mca_pml_dr_vfrag_destruct(mca_pml_dr_vfrag_t* vfrag)
{
    (void) vfrag;
}
|
|
|
|
|
2005-11-22 17:24:47 +00:00
|
|
|
|
2005-12-20 21:42:58 +00:00
|
|
|
/* Class instance for mca_pml_dr_vfrag_t, declared with ompi_free_list_item_t
 * and the constructor / destructor defined in this file. */
OBJ_CLASS_INSTANCE(mca_pml_dr_vfrag_t,
                   ompi_free_list_item_t,
                   mca_pml_dr_vfrag_construct,
                   mca_pml_dr_vfrag_destruct);
|
2005-11-22 17:24:47 +00:00
|
|
|
|
|
|
|
|
2006-02-24 17:08:14 +00:00
|
|
|
/**
|
|
|
|
* The wdog timer expired, better do something about it, like resend the current part of the vfrag
|
|
|
|
*/
|
2006-05-04 16:16:26 +00:00
|
|
|
static void mca_pml_dr_vfrag_wdog_timeout(int fd, short event, void* data)
|
2006-03-04 00:36:16 +00:00
|
|
|
{
|
2006-02-24 17:08:14 +00:00
|
|
|
mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*) data;
|
2007-01-04 22:07:37 +00:00
|
|
|
mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;
|
2006-03-24 06:49:45 +00:00
|
|
|
|
2007-01-30 20:56:31 +00:00
|
|
|
MCA_PML_DR_DEBUG(0,(0, "%s:%d:%s: wdog timeout: %p vid: %d",
|
2007-02-01 19:27:11 +00:00
|
|
|
__FILE__, __LINE__, __func__, (void*)vfrag, vfrag->vf_id));
|
2006-03-24 06:49:45 +00:00
|
|
|
|
2006-05-04 16:16:26 +00:00
|
|
|
/* update pending counts */
|
2006-06-01 18:58:38 +00:00
|
|
|
OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth,-vfrag->vf_pending);
|
2006-05-04 16:16:26 +00:00
|
|
|
OPAL_THREAD_ADD64(&vfrag->vf_pending,-vfrag->vf_pending);
|
|
|
|
|
|
|
|
/* check for hung btl */
|
|
|
|
if(++vfrag->vf_wdog_cnt == mca_pml_dr.wdog_retry_max) {
|
|
|
|
/* declare btl dead */
|
2006-11-06 21:27:17 +00:00
|
|
|
if(vfrag->bml_btl->btl) {
|
2008-06-09 14:53:58 +00:00
|
|
|
opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__,
|
2006-11-06 21:27:17 +00:00
|
|
|
vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name);
|
|
|
|
mca_pml_dr_sendreq_cleanup_active(vfrag->bml_btl->btl);
|
|
|
|
mca_bml.bml_del_btl(vfrag->bml_btl->btl);
|
|
|
|
} else {
|
2008-06-09 14:53:58 +00:00
|
|
|
opal_output(0, "%s:%d:%s: failing already failed BTL", __FILE__, __LINE__, __func__);
|
2006-11-06 21:27:17 +00:00
|
|
|
}
|
2006-05-04 16:16:26 +00:00
|
|
|
mca_pml_dr_vfrag_reset(vfrag);
|
2006-11-06 23:25:24 +00:00
|
|
|
} else if(NULL == vfrag->bml_btl->btl) {
|
|
|
|
mca_pml_dr_vfrag_reset(vfrag);
|
|
|
|
}
|
2006-05-04 16:16:26 +00:00
|
|
|
|
|
|
|
/* back off watchdog timer */
|
|
|
|
vfrag->vf_wdog_tv.tv_sec =
|
|
|
|
mca_pml_dr.wdog_timer.tv_sec +
|
|
|
|
mca_pml_dr.wdog_timer.tv_sec * mca_pml_dr.wdog_timer_multiplier *
|
|
|
|
vfrag->vf_wdog_cnt;
|
|
|
|
vfrag->vf_wdog_tv.tv_usec =
|
|
|
|
mca_pml_dr.wdog_timer.tv_usec +
|
|
|
|
mca_pml_dr.wdog_timer.tv_usec * mca_pml_dr.wdog_timer_multiplier *
|
|
|
|
vfrag->vf_wdog_cnt;
|
|
|
|
|
|
|
|
/* reschedule vfrag */
|
|
|
|
mca_pml_dr_vfrag_reschedule(vfrag);
|
2006-02-24 17:08:14 +00:00
|
|
|
}
|
|
|
|
|
2006-05-04 16:16:26 +00:00
|
|
|
|
2006-02-24 17:08:14 +00:00
|
|
|
/**
|
|
|
|
* The ack timer expired, better do something about it, like resend the entire vfrag?
|
|
|
|
*/
|
2006-05-04 16:16:26 +00:00
|
|
|
static void mca_pml_dr_vfrag_ack_timeout(int fd, short event, void* data)
|
|
|
|
{
|
2006-03-16 22:33:08 +00:00
|
|
|
mca_pml_dr_vfrag_t* vfrag = (mca_pml_dr_vfrag_t*) data;
|
2007-01-30 20:56:31 +00:00
|
|
|
MCA_PML_DR_DEBUG(0,(0, "%s:%d:%s: ack timeout: %p",
|
2007-02-01 19:27:11 +00:00
|
|
|
__FILE__, __LINE__, __func__, (void*)vfrag));
|
2006-05-04 16:16:26 +00:00
|
|
|
|
|
|
|
/* stop ack timer */
|
|
|
|
MCA_PML_DR_VFRAG_ACK_STOP(vfrag);
|
|
|
|
|
|
|
|
/* check for hung btl */
|
|
|
|
if(++vfrag->vf_ack_cnt == mca_pml_dr.ack_retry_max) {
|
|
|
|
/* declare btl dead */
|
2006-11-06 22:09:39 +00:00
|
|
|
if(vfrag->bml_btl->btl) {
|
2008-06-09 14:53:58 +00:00
|
|
|
opal_output(0, "%s:%d:%s: failing BTL: %s", __FILE__, __LINE__, __func__,
|
2006-11-06 22:09:39 +00:00
|
|
|
vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name);
|
|
|
|
mca_pml_dr_sendreq_cleanup_active(vfrag->bml_btl->btl);
|
|
|
|
mca_bml.bml_del_btl(vfrag->bml_btl->btl);
|
|
|
|
} else {
|
2008-06-09 14:53:58 +00:00
|
|
|
opal_output(0, "%s:%d:%s: failing already failed BTL", __FILE__, __LINE__, __func__);
|
2006-11-06 22:09:39 +00:00
|
|
|
}
|
2006-05-04 16:16:26 +00:00
|
|
|
mca_pml_dr_vfrag_reset(vfrag);
|
2006-11-06 23:25:24 +00:00
|
|
|
} else if(NULL == vfrag->bml_btl->btl) {
|
|
|
|
mca_pml_dr_vfrag_reset(vfrag);
|
2006-05-04 16:16:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* back off ack timer */
|
|
|
|
vfrag->vf_ack_tv.tv_sec =
|
|
|
|
mca_pml_dr.ack_timer.tv_sec +
|
|
|
|
mca_pml_dr.ack_timer.tv_sec * mca_pml_dr.ack_timer_multiplier *
|
|
|
|
vfrag->vf_ack_cnt;
|
|
|
|
vfrag->vf_ack_tv.tv_usec =
|
|
|
|
mca_pml_dr.ack_timer.tv_usec +
|
|
|
|
mca_pml_dr.ack_timer.tv_usec * mca_pml_dr.ack_timer_multiplier *
|
|
|
|
vfrag->vf_ack_cnt;
|
|
|
|
|
|
|
|
/* reschedule vfrag */
|
|
|
|
mca_pml_dr_vfrag_reschedule(vfrag);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Vfrag failure - declare btl dead and try to resend on an alternate btl
|
|
|
|
*/
|
|
|
|
|
|
|
|
void mca_pml_dr_vfrag_reset(mca_pml_dr_vfrag_t* vfrag)
|
|
|
|
{
|
2007-01-04 22:07:37 +00:00
|
|
|
mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;
|
2006-03-24 06:49:45 +00:00
|
|
|
|
2006-05-04 16:16:26 +00:00
|
|
|
/* update counters - give new BTL a fair chance :-) */
|
|
|
|
vfrag->vf_ack_cnt = 0;
|
|
|
|
vfrag->vf_wdog_cnt = 0;
|
|
|
|
|
|
|
|
/* lookup new bml_btl data structure */
|
|
|
|
sendreq->req_endpoint = (mca_pml_dr_endpoint_t*)sendreq->req_send.req_base.req_proc->proc_pml;
|
|
|
|
|
|
|
|
/* make sure a path is available */
|
2006-07-04 01:20:20 +00:00
|
|
|
if(mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->bml_endpoint->btl_eager) == 0 ||
|
|
|
|
mca_bml_base_btl_array_get_size(&sendreq->req_endpoint->bml_endpoint->btl_eager) == 0) {
|
2008-06-09 14:53:58 +00:00
|
|
|
opal_output(0, "%s:%d:%s: no path to peer", __FILE__, __LINE__, __func__);
|
2008-02-28 01:57:57 +00:00
|
|
|
orte_errmgr.abort(-1, NULL);
|
2006-05-04 16:16:26 +00:00
|
|
|
}
|
|
|
|
if(vfrag->vf_offset == 0) {
|
2006-07-04 01:20:20 +00:00
|
|
|
vfrag->bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->bml_endpoint->btl_eager);
|
2006-05-04 16:16:26 +00:00
|
|
|
} else {
|
2006-07-04 01:20:20 +00:00
|
|
|
vfrag->bml_btl = mca_bml_base_btl_array_get_next(&sendreq->req_endpoint->bml_endpoint->btl_send);
|
2006-05-04 16:16:26 +00:00
|
|
|
}
|
2008-06-09 14:53:58 +00:00
|
|
|
opal_output(0, "%s:%d:%s: selected new BTL: %s", __FILE__, __LINE__, __func__,
|
2006-05-04 16:16:26 +00:00
|
|
|
vfrag->bml_btl->btl->btl_component->btl_version.mca_component_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Reschedule vfrag that has timed out
|
|
|
|
*/
|
|
|
|
|
|
|
|
void mca_pml_dr_vfrag_reschedule(mca_pml_dr_vfrag_t* vfrag)
|
|
|
|
{
|
2007-01-04 22:07:37 +00:00
|
|
|
mca_pml_dr_send_request_t* sendreq = (mca_pml_dr_send_request_t*)vfrag->vf_send.pval;
|
2006-05-04 16:16:26 +00:00
|
|
|
|
|
|
|
/* start wdog timer */
|
|
|
|
MCA_PML_DR_VFRAG_WDOG_START(vfrag);
|
2006-03-21 14:30:54 +00:00
|
|
|
|
2006-03-24 06:49:45 +00:00
|
|
|
/* first frag within send request */
|
2006-05-04 16:16:26 +00:00
|
|
|
OPAL_THREAD_LOCK(&ompi_request_lock);
|
2006-03-24 06:49:45 +00:00
|
|
|
if(vfrag == &sendreq->req_vfrag0) {
|
|
|
|
if(vfrag->vf_state & MCA_PML_DR_VFRAG_RNDV) {
|
|
|
|
MCA_PML_DR_SEND_REQUEST_RNDV_PROBE(sendreq, vfrag);
|
|
|
|
} else {
|
|
|
|
MCA_PML_DR_SEND_REQUEST_EAGER_RETRY(sendreq, vfrag);
|
|
|
|
}
|
|
|
|
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
|
|
|
|
|
|
|
/* reschedule unacked portion of vfrag */
|
2006-03-16 22:33:08 +00:00
|
|
|
} else {
|
2006-03-24 06:49:45 +00:00
|
|
|
MCA_PML_DR_SEND_REQUEST_VFRAG_RETRANS(sendreq, vfrag);
|
2006-03-16 22:33:08 +00:00
|
|
|
OPAL_THREAD_UNLOCK(&ompi_request_lock);
|
|
|
|
mca_pml_dr_send_request_schedule(sendreq);
|
|
|
|
}
|
2006-02-24 17:08:14 +00:00
|
|
|
}
|
|
|
|
|