btl/usnic: cap the number of resends per progress iteration
New MCA param: btl_usnic_max_resends_per_iteration.

This is the maximum number of resends we'll do in a single pass through usNIC component progress. This prevents progress from getting stuck in an endless loop of retransmissions (i.e., if more retransmissions are triggered while retransmissions are being sent).

Specifically: we need to leave the resend loop to allow receives to happen (which may ACK messages we have sent previously, and therefore render the pending resends moot).

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>
Этот коммит содержится в:
родитель
3cc95d86b2
Коммит
27e3040dfe
@ -190,6 +190,10 @@ typedef struct opal_btl_usnic_component_t {
|
||||
/** retrans characteristics */
|
||||
int retrans_timeout;
|
||||
|
||||
/** max number of messages re-sent during a single progress
|
||||
iteration */
|
||||
int max_resends_per_iteration;
|
||||
|
||||
/** minimum number of times through component progress before
|
||||
checking to see if standalone ACKs need to be sent */
|
||||
int ack_iteration_delay;
|
||||
|
@ -249,6 +249,10 @@ int opal_btl_usnic_component_register(void)
|
||||
100000, &mca_btl_usnic_component.retrans_timeout,
|
||||
REGINT_GE_ONE, OPAL_INFO_LVL_5));
|
||||
|
||||
CHECK(reg_int("max_resends_per_iteration", "Maximum number of frames to resend in a single iteration through usNIC component progress",
|
||||
16, &mca_btl_usnic_component.max_resends_per_iteration,
|
||||
REGINT_GE_ONE, OPAL_INFO_LVL_5));
|
||||
|
||||
CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent",
|
||||
0, &mca_btl_usnic_component.ack_iteration_delay,
|
||||
REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
||||
|
@ -954,11 +954,12 @@ usnic_do_resends(
|
||||
opal_btl_usnic_send_segment_t *sseg;
|
||||
opal_btl_usnic_endpoint_t *endpoint;
|
||||
struct opal_btl_usnic_channel_t *data_channel;
|
||||
int ret;
|
||||
int ret, count;
|
||||
|
||||
data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
|
||||
|
||||
while ((get_send_credits(data_channel) > 1) &&
|
||||
count = mca_btl_usnic_component.max_resends_per_iteration;
|
||||
while (count > 0 && (get_send_credits(data_channel) > 1) &&
|
||||
!opal_list_is_empty(&module->pending_resend_segs)) {
|
||||
|
||||
/*
|
||||
@ -999,6 +1000,8 @@ usnic_do_resends(
|
||||
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
|
||||
opal_btl_usnic_util_abort("hotel checkin failed\n", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
--count;
|
||||
}
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user