Fixes trac:1215: adds specific show_help messages about PP vs. SRQ/XRC RNR
retry exceeded errors. This commit was SVN r18554. The following Trac tickets were found above: Ticket 1215 --> https://svn.open-mpi.org/trac/ompi/ticket/1215
Этот коммит содержится в:
родитель
8c267d50a3
Коммит
69d78c6739
@ -2446,8 +2446,29 @@ error:
|
||||
wc->status, wc->wr_id, wc->opcode, qp));
|
||||
}
|
||||
|
||||
if(IBV_WC_RETRY_EXC_ERR == wc->status)
|
||||
orte_show_help("help-mpi-btl-openib.txt", "btl_openib:retry-exceeded", true);
|
||||
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
|
||||
IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||
char *peer_hostname =
|
||||
(NULL != endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
|
||||
endpoint->endpoint_proc->proc_ompi->proc_hostname :
|
||||
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
|
||||
const char *device_name =
|
||||
ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
|
||||
|
||||
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
BTL_OPENIB_QP_TYPE_PP(qp) ?
|
||||
"pp rnr retry exceeded" :
|
||||
"srq rnr retry exceeded", true,
|
||||
orte_process_info.nodename, device_name,
|
||||
peer_hostname);
|
||||
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"pp retry exceeded", true,
|
||||
orte_process_info.nodename,
|
||||
device_name, peer_hostname);
|
||||
}
|
||||
}
|
||||
|
||||
if(openib_btl)
|
||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
||||
|
@ -324,8 +324,10 @@ int btl_openib_register_mca_params(void)
|
||||
mca_btl_openib_component.ib_retry_count = (uint32_t) ival;
|
||||
|
||||
CHECK(reg_int("ib_rnr_retry", "InfiniBand \"receiver not ready\" "
|
||||
"retry count "
|
||||
"(must be >= 0 and <= 7)",
|
||||
"retry count; applies *only* to SRQ/XRC queues. PP queues "
|
||||
"use RNR retry values of 0 because Open MPI performs "
|
||||
"software flow control to guarantee that RNRs never occur "
|
||||
"(must be >= 0 and <= 7; 7 = \"infinite\")",
|
||||
7, &ival, 0));
|
||||
if (ival > 7) {
|
||||
orte_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
|
||||
|
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2007 Mellanox Technologies. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
@ -139,7 +139,53 @@ support it?). The failure occurred here:
|
||||
You may need to consult with your system administrator to get this
|
||||
problem fixed.
|
||||
#
|
||||
[btl_openib:retry-exceeded]
|
||||
[pp rnr retry exceeded]
|
||||
The OpenFabrics "receiver not ready" retry count on a per-peer
|
||||
connection between two MPI processes has been exceeded. In general,
|
||||
this should not happen because Open MPI uses flow control on per-peer
|
||||
connections to ensure that receivers are always ready when data is
|
||||
sent.
|
||||
|
||||
This error usually means one of two things:
|
||||
|
||||
1. There is something awry within the network fabric itself.
|
||||
2. A bug in Open MPI has caused flow control to malfunction.
|
||||
|
||||
#1 is usually more likely. You should note the hosts on which this
|
||||
error has occurred; it has been observed that rebooting or removing a
|
||||
particular host from the job can sometimes resolve this issue.
|
||||
|
||||
Below is some information about the host that raised the error and the
|
||||
peer to which it was connected:
|
||||
|
||||
Local host: %s
|
||||
Local device: %s
|
||||
Peer host: %s
|
||||
|
||||
You may need to consult with your system administrator to get this
|
||||
problem fixed.
|
||||
#
|
||||
[srq rnr retry exceeded]
|
||||
The OpenFabrics "receiver not ready" retry count on a shared receive
|
||||
queue or XRC receive queue has been exceeded. This error can occur if
|
||||
the btl_openib_ib_rnr_retry MCA parameter is set to a value less than 7 (where 7
|
||||
is the default value and effectively means "infinite retry"). If your
|
||||
rnr_retry value is 7, there might be something awry within the network
|
||||
fabric itself. In this case, you should note the hosts on which this
|
||||
error has occurred; it has been observed that rebooting or removing a
|
||||
particular host from the job can sometimes resolve this issue.
|
||||
|
||||
Below is some information about the host that raised the error and the
|
||||
peer to which it was connected:
|
||||
|
||||
Local host: %s
|
||||
Local device: %s
|
||||
Peer host: %s
|
||||
|
||||
You may need to consult with your system administrator to get this
|
||||
problem fixed.
|
||||
#
|
||||
[pp retry exceeded]
|
||||
The InfiniBand retry count between two MPI processes has been
|
||||
exceeded. "Retry count" is defined in the InfiniBand spec 1.2
|
||||
(section 12.7.38):
|
||||
@ -158,13 +204,22 @@ respect to the retry count:
|
||||
|
||||
* btl_openib_ib_retry_count - The number of times the sender will
|
||||
attempt to retry (defaulted to 7, the maximum value).
|
||||
|
||||
* btl_openib_ib_timeout - The local ACK timeout parameter (defaulted
|
||||
to 10). The actual timeout value used is calculated as:
|
||||
|
||||
4.096 microseconds * (2^btl_openib_ib_timeout)
|
||||
|
||||
See the InfiniBand spec 1.2 (section 12.7.34) for more details.
|
||||
|
||||
Below is some information about the host that raised the error and the
|
||||
peer to which it was connected:
|
||||
|
||||
Local host: %s
|
||||
Local device: %s
|
||||
Peer host: %s
|
||||
|
||||
You may need to consult with your system administrator to get this
|
||||
problem fixed.
|
||||
#
|
||||
[no active ports found]
|
||||
WARNING: There is at least one IB HCA found on host '%s', but there are
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user