btl/usnic: cap the number of resends per progress iteration
New MCA param: btl_usnic_max_resends_per_iteration.

This is the max number of resends we'll do in a single pass through usNIC component progress. This prevents progress from getting stuck in an endless loop of retransmissions (i.e., if more retransmissions are triggered during the sending of retransmissions).

Specifically: we need to leave the resend loop to allow receives to happen (which may ACK messages we have sent previously, and therefore cause pending resends to be moot).

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>

(cherry picked from commit 27e3040dfeba00a9a2615a217c164899f0009e59)
Этот коммит содержится в:
родитель
8f929c68f1
Коммит
58155bc760
@ -206,6 +206,10 @@ typedef struct opal_btl_usnic_component_t {
|
||||
/** retrans characteristics */
|
||||
int retrans_timeout;
|
||||
|
||||
/** max number of messages re-sent during a single progress
|
||||
iteration */
|
||||
int max_resends_per_iteration;
|
||||
|
||||
/** minimum number of times through component progress before
|
||||
checking to see if standalone ACKs need to be sent */
|
||||
int ack_iteration_delay;
|
||||
|
@ -260,6 +260,10 @@ int opal_btl_usnic_component_register(void)
|
||||
100000, &mca_btl_usnic_component.retrans_timeout,
|
||||
REGINT_GE_ONE, OPAL_INFO_LVL_5));
|
||||
|
||||
CHECK(reg_int("max_resends_per_iteration", "Maximum number of frames to resend in a single iteration through usNIC component progress",
|
||||
16, &mca_btl_usnic_component.max_resends_per_iteration,
|
||||
REGINT_GE_ONE, OPAL_INFO_LVL_5));
|
||||
|
||||
CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent",
|
||||
0, &mca_btl_usnic_component.ack_iteration_delay,
|
||||
REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
||||
|
@ -963,11 +963,12 @@ usnic_do_resends(
|
||||
opal_btl_usnic_send_segment_t *sseg;
|
||||
opal_btl_usnic_endpoint_t *endpoint;
|
||||
struct opal_btl_usnic_channel_t *data_channel;
|
||||
int ret;
|
||||
int ret, count;
|
||||
|
||||
data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
|
||||
|
||||
while ((get_send_credits(data_channel) > 1) &&
|
||||
count = mca_btl_usnic_component.max_resends_per_iteration;
|
||||
while (count > 0 && (get_send_credits(data_channel) > 1) &&
|
||||
!opal_list_is_empty(&module->pending_resend_segs)) {
|
||||
|
||||
/*
|
||||
@ -1009,6 +1010,8 @@ usnic_do_resends(
|
||||
BTL_ERROR(("hotel checkin failed\n"));
|
||||
abort(); /* should not be possible */
|
||||
}
|
||||
|
||||
--count;
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user