From 0ae2277796c3ec3f6434b5080c28a006460e46bc Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 1 May 2012 15:53:00 +0000 Subject: [PATCH] Add a backoff mechanism for re-establishing communication This commit was SVN r26366. --- ompi/mca/mtl/portals4/mtl_portals4_flowctl.c | 16 ++++++++++++++-- ompi/mca/mtl/portals4/mtl_portals4_flowctl.h | 5 ++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/ompi/mca/mtl/portals4/mtl_portals4_flowctl.c b/ompi/mca/mtl/portals4/mtl_portals4_flowctl.c index 47a8fa002d..545cc98765 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_flowctl.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_flowctl.c @@ -44,8 +44,6 @@ ompi_mtl_portals4_flowctl_init(void) OBJ_CLASS(ompi_mtl_portals4_pending_request_t), 1, -1, 1); - OBJ_CONSTRUCT(&ompi_mtl_portals4.flowctl.mutex, opal_mutex_t); - ompi_mtl_portals4.flowctl.slots = (ompi_mtl_portals4.queue_size - 3) / 3; ompi_mtl_portals4.flowctl.alert_req.type = portals4_req_flowctl; @@ -200,6 +198,9 @@ ompi_mtl_portals4_flowctl_init(void) ompi_mtl_portals4.flowctl.num_children = 0; + gettimeofday(&ompi_mtl_portals4.flowctl.tv, NULL); + ompi_mtl_portals4.flowctl.backoff_count = 0; + ret = OMPI_SUCCESS; error: @@ -544,6 +545,7 @@ flowctl_fanout_callback(ptl_event_t *ev, ompi_mtl_portals4_base_request_t *ptl_base_request) { int ret; + struct timeval tv; ompi_mtl_portals4.flowctl.flowctl_active = false; ret = PtlPTEnable(ompi_mtl_portals4.ni_h, ompi_mtl_portals4.recv_idx); @@ -554,6 +556,16 @@ flowctl_fanout_callback(ptl_event_t *ev, return ret; } + gettimeofday(&tv, NULL); + if (((tv.tv_sec * 1000000 + tv.tv_usec) - + (ompi_mtl_portals4.flowctl.tv.tv_sec * 1000000 + ompi_mtl_portals4.flowctl.tv.tv_usec)) + < 1000000 * ompi_mtl_portals4.flowctl.backoff_count) { + usleep(++ompi_mtl_portals4.flowctl.backoff_count); + } else { + ompi_mtl_portals4.flowctl.backoff_count = 0; + } + ompi_mtl_portals4.flowctl.tv = tv; + ompi_mtl_portals4_pending_list_progress(); OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_output, diff --git a/ompi/mca/mtl/portals4/mtl_portals4_flowctl.h b/ompi/mca/mtl/portals4/mtl_portals4_flowctl.h index 2b38647b1a..87e2d23e52 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_flowctl.h +++ b/ompi/mca/mtl/portals4/mtl_portals4_flowctl.h @@ -38,7 +38,6 @@ struct ompi_mtl_portals4_flowctl_t { opal_list_t active_sends; opal_list_t pending_sends; opal_free_list_t pending_fl; - opal_mutex_t mutex; int32_t slots; ompi_mtl_portals4_base_request_t alert_req; @@ -70,6 +69,10 @@ struct ompi_mtl_portals4_flowctl_t { /** Flow control restart fan-out ME. */ ptl_handle_me_t fanout_me_h; + /** last restart time */ + struct timeval tv; + int backoff_count; + size_t num_procs; size_t num_children; ptl_process_t children[2];