1
1

add better error messages for vapi retry exceeded errors.

This commit was SVN r10219.
Этот коммит содержится в:
Galen Shipman 2006-06-06 02:04:56 +00:00
родитель d4b5c98247
Коммит cc54b07aa0
5 изменённых файлов: 77 добавлений и 7 удалений

Просмотреть файл

@ -23,6 +23,8 @@
CFLAGS = $(btl_mvapi_CFLAGS)
AM_CPPFLAGS = $(btl_mvapi_CPPFLAGS)
dist_pkgdata_DATA=help-mpi-btl-mvapi.txt
sources = \
btl_mvapi.c \
btl_mvapi.h \

Просмотреть файл

@ -33,7 +33,8 @@
#include "opal/util/output.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/proc/proc.h"
#include "opal/util/show_help.h"
#include "opal/mca/base/mca_base_param.h"
#include "orte/mca/errmgr/errmgr.h"
#include "ompi/mca/mpool/base/base.h"
@ -41,6 +42,7 @@
#include "btl_mvapi_frag.h"
#include "btl_mvapi_endpoint.h"
#include "btl_mvapi_eager_rdma.h"
#include "btl_mvapi_proc.h"
#include "ompi/mca/btl/base/base.h"
#include <vapi.h>
#include <vapi_common.h>
@ -774,9 +776,22 @@ int mca_btl_mvapi_component_progress( void )
ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_hp, &comp);
if(VAPI_OK == ret) {
if(comp.status != VAPI_SUCCESS) {
BTL_ERROR(("Got error : %s, Vendor code : %d Frag : %p",
VAPI_wc_status_sym(comp.status),
comp.vendor_err_syndrome, comp.id));
ompi_proc_t* remote_proc = NULL;
frag = (mca_btl_mvapi_frag_t*) (unsigned long) comp.id;
if(frag) {
endpoint = (mca_btl_mvapi_endpoint_t*) frag->endpoint;
if(endpoint &&
endpoint->endpoint_proc &&
endpoint->endpoint_proc->proc_ompi) {
remote_proc = endpoint->endpoint_proc->proc_ompi;
}
}
BTL_PEER_ERROR(remote_proc, ("error polling HP CQ with status %s status number %d for Frag : %p",
VAPI_wc_status_sym(comp.status),
comp.status, comp.id));
if(comp.status == VAPI_RETRY_EXC_ERR) {
opal_show_help("help-mpi-btl-mvapi.txt", "btl_mvapi:retry-exceeded", true);
}
return OMPI_ERROR;
}

41
ompi/mca/btl/mvapi/help-mpi-btl-mvapi.txt Обычный файл
Просмотреть файл

@ -0,0 +1,41 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2006 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open MPI.
#
[btl_mvapi:retry-exceeded]
The retry count is a down counter initialized on creation of the QP. Retry
count is defined in the InfiniBand Spec 1.2 (12.7.38):
The total number of times that the sender wishes the receiver to retry
timeout, packet sequence, etc. errors before posting a completion error.
Note that two mca parameters are involved here:
btl_mvapi_ib_retry_count - The number of times the sender will attempt to
retry (defaulted to 7, the maximum value).
btl_mvapi_ib_timeout - The local ack timeout parameter (defaulted to 10). The
actual timeout value used is calculated as:
(4.096 micro-seconds * 2^btl_mvapi_ib_timeout).
See InfiniBand Spec 1.2 (12.7.34) for more details.
What to do next:
One item to note is the hosts on which this error has occurred; it has been
observed that rebooting or removing a particular host from the job can resolve
this issue. Should you be able to identify a specific cause or additional
troubleshooting information, please report this to devel@open-mpi.org.

Просмотреть файл

@ -874,7 +874,7 @@ int mca_btl_openib_component_progress()
remote_proc = endpoint->endpoint_proc->proc_ompi;
}
}
BTL_PEER_ERROR(remote_proc, ("error polling HP CQ with status %s status number %d for wr_id %llu opcode %d\n",
BTL_PEER_ERROR(remote_proc, ("error polling HP CQ with status %s status number %d for wr_id %llu opcode %d",
mca_btl_openib_component_status_to_string(wc.status),
wc.status, wc.wr_id, wc.opcode));
if(wc.status == IBV_WC_RETRY_EXC_ERR) {

Просмотреть файл

@ -25,5 +25,17 @@ The total number of times that the sender wishes the receiver to retry tim-
eout, packet sequence, etc. errors before posting a completion error.
Note that two mca parameters are involved here:
btl_openib_ib_retry_count - The number of times the sender will attempt to retry (defaulted to 7, the maximum value).
btl_openib_ib_timeout - The local ack timeout parameter (defaulted to 10). The actual timeout value used is calculated as: (4.096 micro-seconds * 2^btl_openib_ib_timeout). See InfiniBand Spec 1.2 (12.7.34) for more details.
btl_openib_ib_retry_count - The number of times the sender will attempt to
retry (defaulted to 7, the maximum value).
btl_openib_ib_timeout - The local ack timeout parameter (defaulted to 10). The
actual timeout value used is calculated as:
(4.096 micro-seconds * 2^btl_openib_ib_timeout).
See InfiniBand Spec 1.2 (12.7.34) for more details.
What to do next:
One item to note is the hosts on which this error has occurred; it has been
observed that rebooting or removing a particular host from the job can resolve
this issue. Should you be able to identify a specific cause or additional
troubleshooting information, please report this to devel@open-mpi.org.