From 58155bc760276daca76b6565f6186f4f53d8a977 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 4 Oct 2019 12:05:13 -0700 Subject: [PATCH] btl/usnic: cap the number of resends per progress iteration New MCA param: btl_usnic_max_resends_per_iteration. This is the max number of resends we'll do in a single pass through usNIC component progress. This prevents progress from getting stuck in an endless loop of retransmissions (i.e., if more retransmissions are triggered during the sending of retransmissions). Specifically: we need to leave the resend loop to allow receives to happen (which may ACK messages we have sent previously, and therefore cause pending resends to be moot). Signed-off-by: Jeff Squyres (cherry picked from commit 27e3040dfeba00a9a2615a217c164899f0009e59) --- opal/mca/btl/usnic/btl_usnic.h | 4 ++++ opal/mca/btl/usnic/btl_usnic_mca.c | 4 ++++ opal/mca/btl/usnic/btl_usnic_module.c | 7 +++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/opal/mca/btl/usnic/btl_usnic.h b/opal/mca/btl/usnic/btl_usnic.h index 3251b67369..b4831d82d9 100644 --- a/opal/mca/btl/usnic/btl_usnic.h +++ b/opal/mca/btl/usnic/btl_usnic.h @@ -206,6 +206,10 @@ typedef struct opal_btl_usnic_component_t { /** retrans characteristics */ int retrans_timeout; + /** max number of messages re-sent during a single progress + iteration */ + int max_resends_per_iteration; + /** minimum number of times through component progress before checking to see if standalone ACKs need to be sent */ int ack_iteration_delay; diff --git a/opal/mca/btl/usnic/btl_usnic_mca.c b/opal/mca/btl/usnic/btl_usnic_mca.c index 54050dde30..f0399cbeb8 100644 --- a/opal/mca/btl/usnic/btl_usnic_mca.c +++ b/opal/mca/btl/usnic/btl_usnic_mca.c @@ -260,6 +260,10 @@ int opal_btl_usnic_component_register(void) 100000, &mca_btl_usnic_component.retrans_timeout, REGINT_GE_ONE, OPAL_INFO_LVL_5)); + CHECK(reg_int("max_resends_per_iteration", "Maximum number of frames to resend in a single iteration through usNIC component progress", + 16, &mca_btl_usnic_component.max_resends_per_iteration, + REGINT_GE_ONE, OPAL_INFO_LVL_5)); + CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent", 0, &mca_btl_usnic_component.ack_iteration_delay, REGINT_GE_ZERO, OPAL_INFO_LVL_5)); diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 2c3d8e0463..086f99f512 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -963,11 +963,12 @@ usnic_do_resends( opal_btl_usnic_send_segment_t *sseg; opal_btl_usnic_endpoint_t *endpoint; struct opal_btl_usnic_channel_t *data_channel; - int ret; + int ret, count; data_channel = &module->mod_channels[USNIC_DATA_CHANNEL]; - while ((get_send_credits(data_channel) > 1) && + count = mca_btl_usnic_component.max_resends_per_iteration; + while (count > 0 && (get_send_credits(data_channel) > 1) && !opal_list_is_empty(&module->pending_resend_segs)) { /* @@ -1009,6 +1010,8 @@ usnic_do_resends( BTL_ERROR(("hotel checkin failed\n")); abort(); /* should not be possible */ } + + --count; } }