1
1

usnic: convert some BTL_ERRORs to more descriptive show_help messages

1. After we receive N abnormally-short messages (meaning: corrupted),
print a show_help message about it.  N defaults to 25.  N can be set
to 0 to disable the message via btl_usnic_max_short_packets.
2. If we receive a completion error for something other than a
receive, display a show_help message.

Reviewed by Dave Goodell.

CMR'ing to v1.8.3, but it will require a custom patch because of the
OMPI->OPAL BTL move.

cmr=v1.8.3

This commit was SVN r32522.
Этот коммит содержится в:
Jeff Squyres 2014-08-13 15:01:20 +00:00
родитель 5b90af601c
Коммит 6b592d3016
5 изменённых файлов: 80 добавлений и 8 удалений

Просмотреть файл

@ -203,6 +203,10 @@ typedef struct opal_btl_usnic_component_t {
/* ibv_create_ah() (i.e., ARP) timeout */
int arp_timeout;
/** how many short packets have to be received before outputting
the "received short packets" warning? */
uint32_t max_short_packets;
/* Prefix for the connectivity map filename (map will be output if
the prefix is non-NULL) */
char *connectivity_map_prefix;

Просмотреть файл

@ -991,10 +991,22 @@ static int usnic_handle_completion(
if (cwc->byte_len <
(OPAL_BTL_USNIC_PROTO_HDR_SZ +
sizeof(opal_btl_usnic_btl_header_t))) {
BTL_ERROR(("%s: RX error polling CQ[%d] with status %d for wr_id %" PRIx64 " vend_err %d, byte_len %d",
ibv_get_device_name(module->device),
channel->chan_index, cwc->status, cwc->wr_id,
cwc->vendor_err, cwc->byte_len));
uint32_t m = mca_btl_usnic_component.max_short_packets;
++module->num_short_packets;
if (OPAL_UNLIKELY(0 != m &&
module->num_short_packets >= m)) {
opal_show_help("help-mpi-btl-usnic.txt",
"received too many short packets",
true,
opal_process_info.nodename,
ibv_get_device_name(module->device),
module->if_name,
module->num_short_packets);
/* Reset so that we only show this warning once
per MPI process */
mca_btl_usnic_component.max_short_packets = 0;
}
} else {
/* silently count CRC errors */
++module->stats.num_crc_errors;
@ -1003,10 +1015,17 @@ static int usnic_handle_completion(
channel->repost_recv_head = &rseg->rs_recv_desc;
return 0;
} else {
BTL_ERROR(("%s: error polling CQ[%d] with status %d for wr_id %" PRIx64 " opcode %d, vend_err %d",
ibv_get_device_name(module->device), channel->chan_index,
cwc->status, cwc->wr_id, cwc->opcode,
cwc->vendor_err));
opal_show_help("help-mpi-btl-usnic.txt",
"non-receive completion error",
true,
opal_process_info.nodename,
ibv_get_device_name(module->device),
module->if_name,
channel->chan_index,
cwc->status,
(void*) cwc->wr_id,
cwc->opcode,
cwc->vendor_err);
/* mark error on this channel */
channel->chan_error = true;

Просмотреть файл

@ -163,6 +163,7 @@ int opal_btl_usnic_component_register(void)
static int eager_limit;
static int rndv_eager_limit;
static int pack_lazy_threshold;
static int max_short_packets;
#define CHECK(expr) do {\
tmp = (expr); \
@ -257,6 +258,11 @@ int opal_btl_usnic_component_register(void)
10, &mca_btl_usnic_component.arp_timeout,
REGINT_GE_ONE, OPAL_INFO_LVL_6));
CHECK(reg_int("max_short_packets", "Number of abnormally-short packets received before outputting a warning (0 = never show the warning)",
25, &max_short_packets,
REGINT_GE_ZERO, OPAL_INFO_LVL_5));
mca_btl_usnic_component.max_short_packets = max_short_packets;
/* Default to bandwidth auto-detection */
opal_btl_usnic_module_template.super.btl_bandwidth = 0;
opal_btl_usnic_module_template.super.btl_latency = 4;

Просмотреть файл

@ -185,6 +185,7 @@ typedef struct opal_btl_usnic_module_t {
opal_btl_usnic_channel_t mod_channels[USNIC_NUM_CHANNELS];
uint32_t qp_max_inline;
uint32_t num_short_packets;
/* Performance / debugging statistics */
opal_btl_usnic_module_stats_t stats;

Просмотреть файл

@ -323,3 +323,45 @@ connectivity map file will not be written.
Output map file: %s
Working directory: %s
Error: %s (%d)
#
[received too many short packets]
WARNING: The usnic BTL received a significant number of abnormally
short packets on a single network interface. This may be due to
corruption or congestion in the network fabric. It may be helpful to
enable no-drop functionality in the fabric. It may also be useful to
run a physical/layer 0 diagnostic.
Your job will continue, but if this poor network behavior continues,
you may experience lower-than-expected performance due to overheads
caused by higher-than-usual retransmission rates (to compensate for
the corrupted received packets).
Local server: %s
usNIC interface: %s (which is %s)
# of short packets
received so far: %d
You will only receive this warning once per MPI process per job.
If you know that your network environment is lossy/heavily congested
such that short/corrupted packets are expected, you can disable this
warning by setting the btl_usnic_max_short_packets MCA parameter to 0.
#
[non-receive completion error]
WARNING: The usnic BTL has detected an error in the completion of a
non-receive event. This is highly unusual, and may indicate an error
in the usNIC subsystem on this server.
Your MPI job will continue, but you should monitor the job and ensure
that it behaves correctly.
Local server: %s
usNIC interface: %s (which is %s)
Channel index: %d
Completion status: %d
Work request ID: %p
Opcode: %d
Vendor error: %d
If this error keeps happening, you should contact Cisco technical
support.