usnic: convert some BTL_ERRORs to more descriptive show_help messages
1. After we receive N abnormally-short messages (i.e., corrupted messages), print a show_help message about it. N defaults to 25. N can be set to 0 to disable the message via btl_usnic_max_short_packets. 2. If we receive a completion error for something other than a receive, display a show_help message. Reviewed by Dave Goodell. CMR'ing to v1.8.3, but it will require a custom patch because of the OMPI->OPAL BTL move. cmr=v1.8.3 This commit was SVN r32522.
Этот коммит содержится в:
родитель
5b90af601c
Коммит
6b592d3016
@ -203,6 +203,10 @@ typedef struct opal_btl_usnic_component_t {
|
||||
/* ibv_create_ah() (i.e., ARP) timeout */
|
||||
int arp_timeout;
|
||||
|
||||
/** how many short packets have to be received before outputting
|
||||
the "received short packets" warning? */
|
||||
uint32_t max_short_packets;
|
||||
|
||||
/* Prefix for the connectivity map filename (map will be output if
|
||||
the prefix is non-NULL) */
|
||||
char *connectivity_map_prefix;
|
||||
|
@ -991,10 +991,22 @@ static int usnic_handle_completion(
|
||||
if (cwc->byte_len <
|
||||
(OPAL_BTL_USNIC_PROTO_HDR_SZ +
|
||||
sizeof(opal_btl_usnic_btl_header_t))) {
|
||||
BTL_ERROR(("%s: RX error polling CQ[%d] with status %d for wr_id %" PRIx64 " vend_err %d, byte_len %d",
|
||||
ibv_get_device_name(module->device),
|
||||
channel->chan_index, cwc->status, cwc->wr_id,
|
||||
cwc->vendor_err, cwc->byte_len));
|
||||
uint32_t m = mca_btl_usnic_component.max_short_packets;
|
||||
++module->num_short_packets;
|
||||
if (OPAL_UNLIKELY(0 != m &&
|
||||
module->num_short_packets >= m)) {
|
||||
opal_show_help("help-mpi-btl-usnic.txt",
|
||||
"received too many short packets",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->if_name,
|
||||
module->num_short_packets);
|
||||
|
||||
/* Reset so that we only show this warning once
|
||||
per MPI process */
|
||||
mca_btl_usnic_component.max_short_packets = 0;
|
||||
}
|
||||
} else {
|
||||
/* silently count CRC errors */
|
||||
++module->stats.num_crc_errors;
|
||||
@ -1003,10 +1015,17 @@ static int usnic_handle_completion(
|
||||
channel->repost_recv_head = &rseg->rs_recv_desc;
|
||||
return 0;
|
||||
} else {
|
||||
BTL_ERROR(("%s: error polling CQ[%d] with status %d for wr_id %" PRIx64 " opcode %d, vend_err %d",
|
||||
ibv_get_device_name(module->device), channel->chan_index,
|
||||
cwc->status, cwc->wr_id, cwc->opcode,
|
||||
cwc->vendor_err));
|
||||
opal_show_help("help-mpi-btl-usnic.txt",
|
||||
"non-receive completion error",
|
||||
true,
|
||||
opal_process_info.nodename,
|
||||
ibv_get_device_name(module->device),
|
||||
module->if_name,
|
||||
channel->chan_index,
|
||||
cwc->status,
|
||||
(void*) cwc->wr_id,
|
||||
cwc->opcode,
|
||||
cwc->vendor_err);
|
||||
|
||||
/* mark error on this channel */
|
||||
channel->chan_error = true;
|
||||
|
@ -163,6 +163,7 @@ int opal_btl_usnic_component_register(void)
|
||||
static int eager_limit;
|
||||
static int rndv_eager_limit;
|
||||
static int pack_lazy_threshold;
|
||||
static int max_short_packets;
|
||||
|
||||
#define CHECK(expr) do {\
|
||||
tmp = (expr); \
|
||||
@ -257,6 +258,11 @@ int opal_btl_usnic_component_register(void)
|
||||
10, &mca_btl_usnic_component.arp_timeout,
|
||||
REGINT_GE_ONE, OPAL_INFO_LVL_6));
|
||||
|
||||
CHECK(reg_int("max_short_packets", "Number of abnormally-short packets received before outputting a warning (0 = never show the warning)",
|
||||
25, &max_short_packets,
|
||||
REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
||||
mca_btl_usnic_component.max_short_packets = max_short_packets;
|
||||
|
||||
/* Default to bandwidth auto-detection */
|
||||
opal_btl_usnic_module_template.super.btl_bandwidth = 0;
|
||||
opal_btl_usnic_module_template.super.btl_latency = 4;
|
||||
|
@ -185,6 +185,7 @@ typedef struct opal_btl_usnic_module_t {
|
||||
opal_btl_usnic_channel_t mod_channels[USNIC_NUM_CHANNELS];
|
||||
|
||||
uint32_t qp_max_inline;
|
||||
uint32_t num_short_packets;
|
||||
|
||||
/* Performance / debugging statistics */
|
||||
opal_btl_usnic_module_stats_t stats;
|
||||
|
@ -323,3 +323,45 @@ connectivity map file will not be written.
|
||||
Output map file: %s
|
||||
Working directory: %s
|
||||
Error: %s (%d)
|
||||
#
|
||||
[received too many short packets]
|
||||
WARNING: The usnic BTL received a significant number of abnormally
|
||||
short packets on a single network interface. This may be due to
|
||||
corruption or congestion in the network fabric. It may be helpful to
|
||||
enable no-drop functionality in the fabric. It may also be useful to
|
||||
run a physical/layer 0 diagnostic.
|
||||
|
||||
Your job will continue, but if this poor network behavior continues,
|
||||
you may experience lower-than-expected performance due to overheads
|
||||
caused by higher-than-usual retransmission rates (to compensate for
|
||||
the corrupted received packets).
|
||||
|
||||
Local server: %s
|
||||
usNIC interface: %s (which is %s)
|
||||
# of short packets
|
||||
received so far: %d
|
||||
|
||||
You will only receive this warning once per MPI process per job.
|
||||
|
||||
If you know that your network environment is lossy/heavily congested
|
||||
such that short/corrupted packets are expected, you can disable this
|
||||
warning by setting the btl_usnic_max_short_packets MCA parameter to 0.
|
||||
#
|
||||
[non-receive completion error]
|
||||
WARNING: The usnic BTL has detected an error in the completion of a
|
||||
non-receive event. This is highly unusual, and may indicate an error
|
||||
in the usNIC subsystem on this server.
|
||||
|
||||
Your MPI job will continue, but you should monitor the job and ensure
|
||||
that it behaves correctly.
|
||||
|
||||
Local server: %s
|
||||
usNIC interface: %s (which is %s)
|
||||
Channel index: %d
|
||||
Completion status: %d
|
||||
Work request ID: %p
|
||||
Opcode: %d
|
||||
Vendor error: %d
|
||||
|
||||
If this error keeps happening, you should contact Cisco technical
|
||||
support.
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user