usnic: convert some BTL_ERRORs to more descriptive show_help messages
1. After we receive N abnormally-short messages (meaning: corrupted), print a show_help message about it. N defaults to 25. N can be set to 0 to disable the message via btl_usnic_max_short_packets. 2. If we receive a completion error for something other than a receive, display a show_help message. Reviewed by Dave Goodell. CMR'ing to v1.8.3, but it will require a custom patch because of the OMPI->OPAL BTL move. cmr=v1.8.3 This commit was SVN r32522.
Этот коммит содержится в:
родитель
5b90af601c
Коммит
6b592d3016
@ -203,6 +203,10 @@ typedef struct opal_btl_usnic_component_t {
|
|||||||
/* ibv_create_ah() (i.e., ARP) timeout */
|
/* ibv_create_ah() (i.e., ARP) timeout */
|
||||||
int arp_timeout;
|
int arp_timeout;
|
||||||
|
|
||||||
|
/** how many short packets have to be received before outputting
|
||||||
|
the "received short packets" warning? */
|
||||||
|
uint32_t max_short_packets;
|
||||||
|
|
||||||
/* Prefix for the connectivity map filename (map will be output if
|
/* Prefix for the connectivity map filename (map will be output if
|
||||||
the prefix is non-NULL) */
|
the prefix is non-NULL) */
|
||||||
char *connectivity_map_prefix;
|
char *connectivity_map_prefix;
|
||||||
|
@ -991,10 +991,22 @@ static int usnic_handle_completion(
|
|||||||
if (cwc->byte_len <
|
if (cwc->byte_len <
|
||||||
(OPAL_BTL_USNIC_PROTO_HDR_SZ +
|
(OPAL_BTL_USNIC_PROTO_HDR_SZ +
|
||||||
sizeof(opal_btl_usnic_btl_header_t))) {
|
sizeof(opal_btl_usnic_btl_header_t))) {
|
||||||
BTL_ERROR(("%s: RX error polling CQ[%d] with status %d for wr_id %" PRIx64 " vend_err %d, byte_len %d",
|
uint32_t m = mca_btl_usnic_component.max_short_packets;
|
||||||
|
++module->num_short_packets;
|
||||||
|
if (OPAL_UNLIKELY(0 != m &&
|
||||||
|
module->num_short_packets >= m)) {
|
||||||
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
|
"received too many short packets",
|
||||||
|
true,
|
||||||
|
opal_process_info.nodename,
|
||||||
ibv_get_device_name(module->device),
|
ibv_get_device_name(module->device),
|
||||||
channel->chan_index, cwc->status, cwc->wr_id,
|
module->if_name,
|
||||||
cwc->vendor_err, cwc->byte_len));
|
module->num_short_packets);
|
||||||
|
|
||||||
|
/* Reset so that we only show this warning once
|
||||||
|
per MPI process */
|
||||||
|
mca_btl_usnic_component.max_short_packets = 0;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
/* silently count CRC errors */
|
/* silently count CRC errors */
|
||||||
++module->stats.num_crc_errors;
|
++module->stats.num_crc_errors;
|
||||||
@ -1003,10 +1015,17 @@ static int usnic_handle_completion(
|
|||||||
channel->repost_recv_head = &rseg->rs_recv_desc;
|
channel->repost_recv_head = &rseg->rs_recv_desc;
|
||||||
return 0;
|
return 0;
|
||||||
} else {
|
} else {
|
||||||
BTL_ERROR(("%s: error polling CQ[%d] with status %d for wr_id %" PRIx64 " opcode %d, vend_err %d",
|
opal_show_help("help-mpi-btl-usnic.txt",
|
||||||
ibv_get_device_name(module->device), channel->chan_index,
|
"non-receive completion error",
|
||||||
cwc->status, cwc->wr_id, cwc->opcode,
|
true,
|
||||||
cwc->vendor_err));
|
opal_process_info.nodename,
|
||||||
|
ibv_get_device_name(module->device),
|
||||||
|
module->if_name,
|
||||||
|
channel->chan_index,
|
||||||
|
cwc->status,
|
||||||
|
(void*) cwc->wr_id,
|
||||||
|
cwc->opcode,
|
||||||
|
cwc->vendor_err);
|
||||||
|
|
||||||
/* mark error on this channel */
|
/* mark error on this channel */
|
||||||
channel->chan_error = true;
|
channel->chan_error = true;
|
||||||
|
@ -163,6 +163,7 @@ int opal_btl_usnic_component_register(void)
|
|||||||
static int eager_limit;
|
static int eager_limit;
|
||||||
static int rndv_eager_limit;
|
static int rndv_eager_limit;
|
||||||
static int pack_lazy_threshold;
|
static int pack_lazy_threshold;
|
||||||
|
static int max_short_packets;
|
||||||
|
|
||||||
#define CHECK(expr) do {\
|
#define CHECK(expr) do {\
|
||||||
tmp = (expr); \
|
tmp = (expr); \
|
||||||
@ -257,6 +258,11 @@ int opal_btl_usnic_component_register(void)
|
|||||||
10, &mca_btl_usnic_component.arp_timeout,
|
10, &mca_btl_usnic_component.arp_timeout,
|
||||||
REGINT_GE_ONE, OPAL_INFO_LVL_6));
|
REGINT_GE_ONE, OPAL_INFO_LVL_6));
|
||||||
|
|
||||||
|
CHECK(reg_int("max_short_packets", "Number of abnormally-short packets received before outputting a warning (0 = never show the warning)",
|
||||||
|
25, &max_short_packets,
|
||||||
|
REGINT_GE_ZERO, OPAL_INFO_LVL_5));
|
||||||
|
mca_btl_usnic_component.max_short_packets = max_short_packets;
|
||||||
|
|
||||||
/* Default to bandwidth auto-detection */
|
/* Default to bandwidth auto-detection */
|
||||||
opal_btl_usnic_module_template.super.btl_bandwidth = 0;
|
opal_btl_usnic_module_template.super.btl_bandwidth = 0;
|
||||||
opal_btl_usnic_module_template.super.btl_latency = 4;
|
opal_btl_usnic_module_template.super.btl_latency = 4;
|
||||||
|
@ -185,6 +185,7 @@ typedef struct opal_btl_usnic_module_t {
|
|||||||
opal_btl_usnic_channel_t mod_channels[USNIC_NUM_CHANNELS];
|
opal_btl_usnic_channel_t mod_channels[USNIC_NUM_CHANNELS];
|
||||||
|
|
||||||
uint32_t qp_max_inline;
|
uint32_t qp_max_inline;
|
||||||
|
uint32_t num_short_packets;
|
||||||
|
|
||||||
/* Performance / debugging statistics */
|
/* Performance / debugging statistics */
|
||||||
opal_btl_usnic_module_stats_t stats;
|
opal_btl_usnic_module_stats_t stats;
|
||||||
|
@ -323,3 +323,45 @@ connectivity map file will not be written.
|
|||||||
Output map file: %s
|
Output map file: %s
|
||||||
Working directory: %s
|
Working directory: %s
|
||||||
Error: %s (%d)
|
Error: %s (%d)
|
||||||
|
#
|
||||||
|
[received too many short packets]
|
||||||
|
WARNING: The usnic BTL received a significant number of abnormally
|
||||||
|
short packets on a single network interface. This may be due to
|
||||||
|
corruption or congestion in the network fabric. It may be helpful to
|
||||||
|
enable no-drop functionality in the fabric. It may also be useful to
|
||||||
|
run a physical/layer 0 diagnostic.
|
||||||
|
|
||||||
|
Your job will continue, but if this poor network behavior continues,
|
||||||
|
you may experience lower-than-expected performance due to overheads
|
||||||
|
caused by higher-than-usual retransmission rates (to compensate for
|
||||||
|
the corrupted received packets).
|
||||||
|
|
||||||
|
Local server: %s
|
||||||
|
usNIC interface: %s (which is %s)
|
||||||
|
# of short packets
|
||||||
|
received so far: %d
|
||||||
|
|
||||||
|
You will only receive this warning once per MPI process per job.
|
||||||
|
|
||||||
|
If you know that your network environment is lossy/heavily congested
|
||||||
|
such that short/corrupted packets are expected, you can disable this
|
||||||
|
warning by setting the btl_usnic_max_short_packets MCA parameter to 0.
|
||||||
|
#
|
||||||
|
[non-receive completion error]
|
||||||
|
WARNING: The usnic BTL has detected an error in the completion of a
|
||||||
|
non-receive event. This is highly unusual, and may indicate an error
|
||||||
|
in the usNIC subsystem on this server.
|
||||||
|
|
||||||
|
Your MPI job will continue, but you should monitor the job and ensure
|
||||||
|
that it behaves correctly.
|
||||||
|
|
||||||
|
Local server: %s
|
||||||
|
usNIC interface: %s (which is %s)
|
||||||
|
Channel index: %d
|
||||||
|
Completion status: %d
|
||||||
|
Work request ID: %p
|
||||||
|
Opcode: %d
|
||||||
|
Vendor error: %d
|
||||||
|
|
||||||
|
If this error keeps happening, you should contact Cisco technical
|
||||||
|
support.
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user