btl/usnic: cap the number of resends per progress iteration
New MCA param: btl_usnic_max_resends_per_iteration.

This is the max number of resends we'll do in a single pass through usNIC component progress. This prevents progress from getting stuck in an endless loop of retransmissions (i.e., if more retransmissions are triggered during the sending of retransmissions).

Specifically: we need to leave the resend loop to allow receives to happen (which may ACK messages we have sent previously, and therefore cause pending resends to be moot).

Signed-off-by: Jeff Squyres <jsquyres@cisco.com>

(cherry picked from commit 27e3040dfeba00a9a2615a217c164899f0009e59)
Этот коммит содержится в:
родитель
8f929c68f1
Коммит
58155bc760
@@ -206,6 +206,10 @@ typedef struct opal_btl_usnic_component_t {
|
|||||||
/** retrans characteristics */
|
/** retrans characteristics */
|
||||||
int retrans_timeout;
|
int retrans_timeout;
|
||||||
|
|
||||||
|
/** max number of messages re-sent during a single progress
|
||||||
|
iteration */
|
||||||
|
int max_resends_per_iteration;
|
||||||
|
|
||||||
/** minimum number of times through component progress before
|
/** minimum number of times through component progress before
|
||||||
checking to see if standalone ACKs need to be sent */
|
checking to see if standalone ACKs need to be sent */
|
||||||
int ack_iteration_delay;
|
int ack_iteration_delay;
|
||||||
|
@@ -260,6 +260,10 @@ int opal_btl_usnic_component_register(void)
|
|||||||
100000, &mca_btl_usnic_component.retrans_timeout,
|
100000, &mca_btl_usnic_component.retrans_timeout,
|
||||||
REGINT_GE_ONE, OPAL_INFO_LVL_5));
|
REGINT_GE_ONE, OPAL_INFO_LVL_5));
|
||||||
|
|
||||||
|
CHECK(reg_int("max_resends_per_iteration", "Maximum number of frames to resend in a single iteration through usNIC component progress",
|
||||||
|
16, &mca_btl_usnic_component.max_resends_per_iteration,
|
||||||
|
REGINT_GE_ONE, OPAL_INFO_LVL_5));
|
||||||
|
|
||||||
CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent",
|
CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent",
|
||||||
0, &mca_btl_usnic_component.ack_iteration_delay,
|
0, &mca_btl_usnic_component.ack_iteration_delay,
|
||||||
REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
||||||
|
@@ -963,11 +963,12 @@ usnic_do_resends(
|
|||||||
opal_btl_usnic_send_segment_t *sseg;
|
opal_btl_usnic_send_segment_t *sseg;
|
||||||
opal_btl_usnic_endpoint_t *endpoint;
|
opal_btl_usnic_endpoint_t *endpoint;
|
||||||
struct opal_btl_usnic_channel_t *data_channel;
|
struct opal_btl_usnic_channel_t *data_channel;
|
||||||
int ret;
|
int ret, count;
|
||||||
|
|
||||||
data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
|
data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
|
||||||
|
|
||||||
while ((get_send_credits(data_channel) > 1) &&
|
count = mca_btl_usnic_component.max_resends_per_iteration;
|
||||||
|
while (count > 0 && (get_send_credits(data_channel) > 1) &&
|
||||||
!opal_list_is_empty(&module->pending_resend_segs)) {
|
!opal_list_is_empty(&module->pending_resend_segs)) {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -1009,6 +1010,8 @@ usnic_do_resends(
|
|||||||
BTL_ERROR(("hotel checkin failed\n"));
|
BTL_ERROR(("hotel checkin failed\n"));
|
||||||
abort(); /* should not be possible */
|
abort(); /* should not be possible */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
--count;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user