1
1

Fixes trac:1215: adds specific show_help messages about PP vs. SRQ/XRC RNR retry exceeded errors.

This commit was SVN r18554.

The following Trac tickets were found above:
  Ticket 1215 --> https://svn.open-mpi.org/trac/ompi/ticket/1215
Этот коммит содержится в:
Jeff Squyres 2008-06-02 11:03:48 +00:00
родитель 8c267d50a3
Коммит 69d78c6739
3 изменённых файлов: 85 добавлений и 7 удалений

Просмотреть файл

@ -2446,8 +2446,29 @@ error:
wc->status, wc->wr_id, wc->opcode, qp)); wc->status, wc->wr_id, wc->opcode, qp));
} }
if(IBV_WC_RETRY_EXC_ERR == wc->status) if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
orte_show_help("help-mpi-btl-openib.txt", "btl_openib:retry-exceeded", true); IBV_WC_RETRY_EXC_ERR == wc->status) {
char *peer_hostname =
(NULL != endpoint->endpoint_proc->proc_ompi->proc_hostname) ?
endpoint->endpoint_proc->proc_ompi->proc_hostname :
"<unknown -- please run with mpi_keep_peer_hostnames=1>";
const char *device_name =
ibv_get_device_name(endpoint->qps[qp].qp->lcl_qp->context->device);
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status) {
orte_show_help("help-mpi-btl-openib.txt",
BTL_OPENIB_QP_TYPE_PP(qp) ?
"pp rnr retry exceeded" :
"srq rnr retry exceeded", true,
orte_process_info.nodename, device_name,
peer_hostname);
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
orte_show_help("help-mpi-btl-openib.txt",
"pp retry exceeded", true,
orte_process_info.nodename,
device_name, peer_hostname);
}
}
if(openib_btl) if(openib_btl)
openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL); openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_FATAL);

Просмотреть файл

@ -324,8 +324,10 @@ int btl_openib_register_mca_params(void)
mca_btl_openib_component.ib_retry_count = (uint32_t) ival; mca_btl_openib_component.ib_retry_count = (uint32_t) ival;
CHECK(reg_int("ib_rnr_retry", "InfiniBand \"receiver not ready\" " CHECK(reg_int("ib_rnr_retry", "InfiniBand \"receiver not ready\" "
"retry count " "retry count; applies *only* to SRQ/XRC queues. PP queues "
"(must be >= 0 and <= 7)", "use RNR retry values of 0 because Open MPI performs "
"software flow control to guarantee that RNRs never occur "
"(must be >= 0 and <= 7; 7 = \"infinite\")",
7, &ival, 0)); 7, &ival, 0));
if (ival > 7) { if (ival > 7) {
orte_show_help("help-mpi-btl-openib.txt", "invalid mca param value", orte_show_help("help-mpi-btl-openib.txt", "invalid mca param value",

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved. # University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California. # Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved. # All rights reserved.
# Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2007 Mellanox Technologies. All rights reserved. # Copyright (c) 2007 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$ # $COPYRIGHT$
# #
@ -139,7 +139,53 @@ support it?). The failure occurred here:
You may need to consult with your system administrator to get this You may need to consult with your system administrator to get this
problem fixed. problem fixed.
# #
[btl_openib:retry-exceeded] [pp rnr retry exceeded]
The OpenFabrics "receiver not ready" retry count on a per-peer
connection between two MPI processes has been exceeded. In general,
this should not happen because Open MPI uses flow control on per-peer
connections to ensure that receivers are always ready when data is
sent.
This error usually means one of two things:
1. There is something awry within the network fabric itself.
2. A bug in Open MPI has caused flow control to malfunction.
#1 is usually more likely. You should note the hosts on which this
error has occurred; it has been observed that rebooting or removing a
particular host from the job can sometimes resolve this issue.
Below is some information about the host that raised the error and the
peer to which it was connected:
Local host: %s
Local device: %s
Peer host: %s
You may need to consult with your system administrator to get this
problem fixed.
#
[srq rnr retry exceeded]
The OpenFabrics "receiver not ready" retry count on a shared receive
queue or XRC receive queue has been exceeded. This error can occur if
the mca_btl_openib_ib_rnr_retry is set to a value less than 7 (where 7
is the default value and effectively means "infinite retry"). If your
rnr_retry value is 7, there might be something awry within the network
fabric itself. In this case, you should note the hosts on which this
error has occurred; it has been observed that rebooting or removing a
particular host from the job can sometimes resolve this issue.
Below is some information about the host that raised the error and the
peer to which it was connected:
Local host: %s
Local device: %s
Peer host: %s
You may need to consult with your system administrator to get this
problem fixed.
#
[pp retry exceeded]
The InfiniBand retry count between two MPI processes has been The InfiniBand retry count between two MPI processes has been
exceeded. "Retry count" is defined in the InfiniBand spec 1.2 exceeded. "Retry count" is defined in the InfiniBand spec 1.2
(section 12.7.38): (section 12.7.38):
@ -158,13 +204,22 @@ respect to the retry count:
* btl_openib_ib_retry_count - The number of times the sender will * btl_openib_ib_retry_count - The number of times the sender will
attempt to retry (defaulted to 7, the maximum value). attempt to retry (defaulted to 7, the maximum value).
* btl_openib_ib_timeout - The local ACK timeout parameter (defaulted * btl_openib_ib_timeout - The local ACK timeout parameter (defaulted
to 10). The actual timeout value used is calculated as: to 10). The actual timeout value used is calculated as:
4.096 microseconds * (2^btl_openib_ib_timeout) 4.096 microseconds * (2^btl_openib_ib_timeout)
See the InfiniBand spec 1.2 (section 12.7.34) for more details. See the InfiniBand spec 1.2 (section 12.7.34) for more details.
Below is some information about the host that raised the error and the
peer to which it was connected:
Local host: %s
Local device: %s
Peer host: %s
You may need to consult with your system administrator to get this
problem fixed.
# #
[no active ports found] [no active ports found]
WARNING: There is at least one IB HCA found on host '%s', but there are WARNING: There is at least one IB HCA found on host '%s', but there are