add better error messages for vapi retry exceeded errors.
This commit was SVN r10219.
Этот коммит содержится в:
родитель
d4b5c98247
Коммит
cc54b07aa0
@ -23,6 +23,8 @@
|
||||
CFLAGS = $(btl_mvapi_CFLAGS)
|
||||
AM_CPPFLAGS = $(btl_mvapi_CPPFLAGS)
|
||||
|
||||
dist_pkgdata_DATA=help-mpi-btl-mvapi.txt
|
||||
|
||||
sources = \
|
||||
btl_mvapi.c \
|
||||
btl_mvapi.h \
|
||||
|
@ -33,7 +33,8 @@
|
||||
#include "opal/util/output.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
@ -41,6 +42,7 @@
|
||||
#include "btl_mvapi_frag.h"
|
||||
#include "btl_mvapi_endpoint.h"
|
||||
#include "btl_mvapi_eager_rdma.h"
|
||||
#include "btl_mvapi_proc.h"
|
||||
#include "ompi/mca/btl/base/base.h"
|
||||
#include <vapi.h>
|
||||
#include <vapi_common.h>
|
||||
@ -774,9 +776,22 @@ int mca_btl_mvapi_component_progress( void )
|
||||
ret = VAPI_poll_cq(mvapi_btl->nic, mvapi_btl->cq_hndl_hp, &comp);
|
||||
if(VAPI_OK == ret) {
|
||||
if(comp.status != VAPI_SUCCESS) {
|
||||
BTL_ERROR(("Got error : %s, Vendor code : %d Frag : %p",
|
||||
VAPI_wc_status_sym(comp.status),
|
||||
comp.vendor_err_syndrome, comp.id));
|
||||
ompi_proc_t* remote_proc = NULL;
|
||||
frag = (mca_btl_mvapi_frag_t*) (unsigned long) comp.id;
|
||||
if(frag) {
|
||||
endpoint = (mca_btl_mvapi_endpoint_t*) frag->endpoint;
|
||||
if(endpoint &&
|
||||
endpoint->endpoint_proc &&
|
||||
endpoint->endpoint_proc->proc_ompi) {
|
||||
remote_proc = endpoint->endpoint_proc->proc_ompi;
|
||||
}
|
||||
}
|
||||
BTL_PEER_ERROR(remote_proc, ("error polling HP CQ with status %s status number %d for Frag : %p",
|
||||
VAPI_wc_status_sym(comp.status),
|
||||
comp.status, comp.id));
|
||||
if(comp.status == VAPI_RETRY_EXC_ERR) {
|
||||
opal_show_help("help-mpi-btl-mvapi.txt", "btl_mvapi:retry-exceeded", true);
|
||||
}
|
||||
return OMPI_ERROR;
|
||||
}
|
||||
|
||||
|
41
ompi/mca/btl/mvapi/help-mpi-btl-mvapi.txt
Обычный файл
41
ompi/mca/btl/mvapi/help-mpi-btl-mvapi.txt
Обычный файл
@ -0,0 +1,41 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2006 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for Open MPI.
|
||||
#
|
||||
[btl_mvapi:retry-exceeded]
|
||||
The retry count is a down counter initialized on creation of the QP. Retry
|
||||
count is defined in the InfiniBand Spec 1.2 (12.7.38):
|
||||
The total number of times that the sender wishes the receiver to retry
|
||||
timeout, packet sequence, etc. errors before posting a completion error.
|
||||
|
||||
Note that two mca parameters are involved here:
|
||||
btl_mvapi_ib_retry_count - The number of times the sender will attempt to
|
||||
retry (defaulted to 7, the maximum value).
|
||||
|
||||
btl_mvapi_ib_timeout - The local ack timeout parameter (defaulted to 10). The
|
||||
actual timeout value used is calculated as:
|
||||
(4.096 micro-seconds * 2^btl_mvapi_ib_timeout).
|
||||
See InfiniBand Spec 1.2 (12.7.34) for more details.
|
||||
|
||||
What to do next:
|
||||
One item to note is the hosts on which this error has occurred; it has been
|
||||
observed that rebooting or removing a particular host from the job can resolve
|
||||
this issue. Should you be able to identify a specific cause or additional
|
||||
troubleshooting information, please report this to devel@open-mpi.org.
|
||||
|
@ -874,7 +874,7 @@ int mca_btl_openib_component_progress()
|
||||
remote_proc = endpoint->endpoint_proc->proc_ompi;
|
||||
}
|
||||
}
|
||||
BTL_PEER_ERROR(remote_proc, ("error polling HP CQ with status %s status number %d for wr_id %llu opcode %d\n",
|
||||
BTL_PEER_ERROR(remote_proc, ("error polling HP CQ with status %s status number %d for wr_id %llu opcode %d",
|
||||
mca_btl_openib_component_status_to_string(wc.status),
|
||||
wc.status, wc.wr_id, wc.opcode));
|
||||
if(wc.status == IBV_WC_RETRY_EXC_ERR) {
|
||||
|
@ -25,5 +25,17 @@ The total number of times that the sender wishes the receiver to retry tim-
|
||||
eout, packet sequence, etc. errors before posting a completion error.
|
||||
|
||||
Note that two mca parameters are involved here:
|
||||
btl_openib_ib_retry_count - The number of times the sender will attempt to retry (defaulted to 7, the maximum value).
|
||||
btl_openib_ib_timeout - The local ack timeout parameter (defaulted to 10). The actual timeout value used is calculated as: (4.096 micro-seconds * 2^btl_openib_ib_timeout). See InfiniBand Spec 1.2 (12.7.34) for more details.
|
||||
btl_openib_ib_retry_count - The number of times the sender will attempt to
|
||||
retry (defaulted to 7, the maximum value).
|
||||
|
||||
btl_openib_ib_timeout - The local ack timeout parameter (defaulted to 10). The
|
||||
actual timeout value used is calculated as:
|
||||
(4.096 micro-seconds * 2^btl_openib_ib_timeout).
|
||||
See InfiniBand Spec 1.2 (12.7.34) for more details.
|
||||
|
||||
|
||||
What to do next:
|
||||
One item to note is the hosts on which this error has occurred; it has been
|
||||
observed that rebooting or removing a particular host from the job can resolve
|
||||
this issue. Should you be able to identify a specific cause or additional
|
||||
troubleshooting information, please report this to devel@open-mpi.org.
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user