1
1

btl/usnic: cap the number of resends per progress iteration

New MCA param: btl_usnic_max_resends_per_iteration (default: 16).
This is the maximum number of resends we'll do in a single pass
through usNIC component progress.  The cap prevents progress from
getting stuck in an endless loop of retransmissions (i.e., when
sending retransmissions triggers yet more retransmissions).
Specifically: we need to leave the resend loop so that receives can
be processed; those receives may ACK messages we sent previously,
making the pending resends moot.

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
(cherry picked from commit 27e3040dfeba00a9a2615a217c164899f0009e59)
Этот коммит содержится в:
Jeff Squyres 2019-10-04 12:05:13 -07:00
родитель 8f929c68f1
Коммит 58155bc760
3 изменённых файла: 13 добавлений и 2 удаления

Просмотреть файл

@@ -206,6 +206,10 @@ typedef struct opal_btl_usnic_component_t {
/** retrans characteristics */
int retrans_timeout;
/** max number of messages re-sent during a single progress
iteration */
int max_resends_per_iteration;
/** minimum number of times through component progress before
checking to see if standalone ACKs need to be sent */
int ack_iteration_delay;

Просмотреть файл

@@ -260,6 +260,10 @@ int opal_btl_usnic_component_register(void)
100000, &mca_btl_usnic_component.retrans_timeout,
REGINT_GE_ONE, OPAL_INFO_LVL_5));
CHECK(reg_int("max_resends_per_iteration", "Maximum number of frames to resend in a single iteration through usNIC component progress",
16, &mca_btl_usnic_component.max_resends_per_iteration,
REGINT_GE_ONE, OPAL_INFO_LVL_5));
CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent",
0, &mca_btl_usnic_component.ack_iteration_delay,
REGINT_GE_ZERO, OPAL_INFO_LVL_5));

Просмотреть файл

@@ -963,11 +963,12 @@ usnic_do_resends(
opal_btl_usnic_send_segment_t *sseg;
opal_btl_usnic_endpoint_t *endpoint;
struct opal_btl_usnic_channel_t *data_channel;
int ret;
int ret, count;
data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
while ((get_send_credits(data_channel) > 1) &&
count = mca_btl_usnic_component.max_resends_per_iteration;
while (count > 0 && (get_send_credits(data_channel) > 1) &&
!opal_list_is_empty(&module->pending_resend_segs)) {
/*
@@ -1009,6 +1010,8 @@ usnic_do_resends(
BTL_ERROR(("hotel checkin failed\n"));
abort(); /* should not be possible */
}
--count;
}
}