Fixes trac:1215: adds specific show_help messages about PP vs. SRQ/XRC RNR
retry exceeded errors. This commit was SVN r18554. The following Trac tickets were found above: Ticket 1215 --> https://svn.open-mpi.org/trac/ompi/ticket/1215
Этот коммит содержится в:
родитель
8c267d50a3
Коммит
69d78c6739
@ -2446,8 +2446,29 @@ error:
|
|||||||
wc->status, wc->wr_id, wc->opcode, qp));
|
wc->status, wc->wr_id, wc->opcode, qp));
|
||||||
}
|
}
|
||||||
|
|
||||||
if(IBV_WC_RETRY_EXC_ERR == wc->status)
|
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
|
||||||
orte_show_help("help-mpi-btl-openib.txt", "btl_openib:retry-exceeded", true);
|
IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||||
|
char *peer_hostname =
|
||||||
|
(NULL != endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
|
||||||
|
endpoint->endpoint_proc->proc_ompi->proc_hostname :
|
||||||
|
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
|
||||||
|
const char *device_name =
|
||||||
|
ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
|
||||||
|
|
||||||
|
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status) {
|
||||||
|
orte_show_help("help-mpi-btl-openib.txt",
|
||||||
|
BTL_OPENIB_QP_TYPE_PP(qp) ?
|
||||||
|
"pp rnr retry exceeded" :
|
||||||
|
"srq rnr retry exceeded", true,
|
||||||
|
orte_process_info.nodename, device_name,
|
||||||
|
peer_hostname);
|
||||||
|
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||||
|
orte_show_help("help-mpi-btl-openib.txt",
|
||||||
|
"pp retry exceeded", true,
|
||||||
|
orte_process_info.nodename,
|
||||||
|
device_name, peer_hostname);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if(openib_btl)
|
if(openib_btl)
|
||||||
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);
|
||||||
|
@ -324,8 +324,10 @@ int btl_openib_register_mca_params(void)
|
|||||||
mca_btl_openib_component.ib_retry_count = (uint32_t) ival;
|
mca_btl_openib_component.ib_retry_count = (uint32_t) ival;
|
||||||
|
|
||||||
CHECK(reg_int("ib_rnr_retry", "InfiniBand \"receiver not ready\" "
|
CHECK(reg_int("ib_rnr_retry", "InfiniBand \"receiver not ready\" "
|
||||||
"retry count "
|
"retry count; applies *only* to SRQ/XRC queues. PP queues "
|
||||||
"(must be >= 0 and <= 7)",
|
"use RNR retry values of 0 because Open MPI performs "
|
||||||
|
"software flow control to guarantee that RNRs never occur "
|
||||||
|
"(must be >= 0 and <= 7; 7 = \"infinite\")",
|
||||||
7, &ival, 0));
|
7, &ival, 0));
|
||||||
if (ival > 7) {
|
if (ival > 7) {
|
||||||
orte_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
|
orte_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
# University of Stuttgart. All rights reserved.
|
# University of Stuttgart. All rights reserved.
|
||||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
# Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
# Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
||||||
# Copyright (c) 2007 Mellanox Technologies. All rights reserved.
|
# Copyright (c) 2007 Mellanox Technologies. All rights reserved.
|
||||||
# $COPYRIGHT$
|
# $COPYRIGHT$
|
||||||
#
|
#
|
||||||
@ -139,7 +139,53 @@ support it?). The failure occurred here:
|
|||||||
You may need to consult with your system administrator to get this
|
You may need to consult with your system administrator to get this
|
||||||
problem fixed.
|
problem fixed.
|
||||||
#
|
#
|
||||||
[btl_openib:retry-exceeded]
|
[pp rnr retry exceeded]
|
||||||
|
The OpenFabrics "receiver not ready" retry count on a per-peer
|
||||||
|
connection between two MPI processes has been exceeded. In general,
|
||||||
|
this should not happen because Open MPI uses flow control on per-peer
|
||||||
|
connections to ensure that receivers are always ready when data is
|
||||||
|
sent.
|
||||||
|
|
||||||
|
This error usually means one of two things:
|
||||||
|
|
||||||
|
1. There is something awry within the network fabric itself.
|
||||||
|
2. A bug in Open MPI has caused flow control to malfunction.
|
||||||
|
|
||||||
|
#1 is usually more likely. You should note the hosts on which this
|
||||||
|
error has occurred; it has been observed that rebooting or removing a
|
||||||
|
particular host from the job can sometimes resolve this issue.
|
||||||
|
|
||||||
|
Below is some information about the host that raised the error and the
|
||||||
|
peer to which it was connected:
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Local device: %s
|
||||||
|
Peer host: %s
|
||||||
|
|
||||||
|
You may need to consult with your system administrator to get this
|
||||||
|
problem fixed.
|
||||||
|
#
|
||||||
|
[srq rnr retry exceeded]
|
||||||
|
The OpenFabrics "receiver not ready" retry count on a shared receive
|
||||||
|
queue or XRC receive queue has been exceeded. This error can occur if
|
||||||
|
the mca_btl_openib_ib_rnr_retry is set to a value less than 7 (where 7
|
||||||
|
is the default value and effectively means "infinite retry"). If your
|
||||||
|
rnr_retry value is 7, there might be something awry within the network
|
||||||
|
fabric itself. In this case, you should note the hosts on which this
|
||||||
|
error has occurred; it has been observed that rebooting or removing a
|
||||||
|
particular host from the job can sometimes resolve this issue.
|
||||||
|
|
||||||
|
Below is some information about the host that raised the error and the
|
||||||
|
peer to which it was connected:
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Local device: %s
|
||||||
|
Peer host: %s
|
||||||
|
|
||||||
|
You may need to consult with your system administrator to get this
|
||||||
|
problem fixed.
|
||||||
|
#
|
||||||
|
[pp retry exceeded]
|
||||||
The InfiniBand retry count between two MPI processes has been
|
The InfiniBand retry count between two MPI processes has been
|
||||||
exceeded. "Retry count" is defined in the InfiniBand spec 1.2
|
exceeded. "Retry count" is defined in the InfiniBand spec 1.2
|
||||||
(section 12.7.38):
|
(section 12.7.38):
|
||||||
@ -158,13 +204,22 @@ respect to the retry count:
|
|||||||
|
|
||||||
* btl_openib_ib_retry_count - The number of times the sender will
|
* btl_openib_ib_retry_count - The number of times the sender will
|
||||||
attempt to retry (defaulted to 7, the maximum value).
|
attempt to retry (defaulted to 7, the maximum value).
|
||||||
|
|
||||||
* btl_openib_ib_timeout - The local ACK timeout parameter (defaulted
|
* btl_openib_ib_timeout - The local ACK timeout parameter (defaulted
|
||||||
to 10). The actual timeout value used is calculated as:
|
to 10). The actual timeout value used is calculated as:
|
||||||
|
|
||||||
4.096 microseconds * (2^btl_openib_ib_timeout)
|
4.096 microseconds * (2^btl_openib_ib_timeout)
|
||||||
|
|
||||||
See the InfiniBand spec 1.2 (section 12.7.34) for more details.
|
See the InfiniBand spec 1.2 (section 12.7.34) for more details.
|
||||||
|
|
||||||
|
Below is some information about the host that raised the error and the
|
||||||
|
peer to which it was connected:
|
||||||
|
|
||||||
|
Local host: %s
|
||||||
|
Local device: %s
|
||||||
|
Peer host: %s
|
||||||
|
|
||||||
|
You may need to consult with your system administrator to get this
|
||||||
|
problem fixed.
|
||||||
#
|
#
|
||||||
[no active ports found]
|
[no active ports found]
|
||||||
WARNING: There is at least one IB HCA found on host '%s', but there are
|
WARNING: There is at least one IB HCA found on host '%s', but there are
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user