Make a much, much better error message for a not-uncommon failure
scenario (user/sysadmin forgot to set the memlock limits high enough). This commit was SVN r13289.
Этот коммит содержится в:
родитель
b252cb82c8
Коммит
6b69ea664d
@ -9,6 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -21,6 +22,7 @@
|
|||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
#include "opal/util/if.h"
|
#include "opal/util/if.h"
|
||||||
|
#include "opal/util/show_help.h"
|
||||||
#include "ompi/mca/pml/pml.h"
|
#include "ompi/mca/pml/pml.h"
|
||||||
#include "ompi/mca/btl/btl.h"
|
#include "ompi/mca/btl/btl.h"
|
||||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||||
@ -33,6 +35,7 @@
|
|||||||
#include "ompi/mca/mpool/base/base.h"
|
#include "ompi/mca/mpool/base/base.h"
|
||||||
#include "ompi/mca/mpool/mpool.h"
|
#include "ompi/mca/mpool/mpool.h"
|
||||||
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
||||||
|
#include "orte/util/sys_info.h"
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
@ -71,6 +74,21 @@ mca_btl_openib_module_t mca_btl_openib_module = {
|
|||||||
int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size_t nprocs);
|
int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size_t nprocs);
|
||||||
|
|
||||||
|
|
||||||
|
static void show_init_error(const char *file, int line,
|
||||||
|
const char *func, const char *dev)
|
||||||
|
{
|
||||||
|
if (ENOMEM == errno) {
|
||||||
|
opal_show_help("help-mpi-btl-openib.txt", "init-fail-no-mem",
|
||||||
|
true, orte_system_info.nodename,
|
||||||
|
file, line, func, dev);
|
||||||
|
} else {
|
||||||
|
opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q",
|
||||||
|
true, orte_system_info.nodename,
|
||||||
|
file, line, func, strerror(errno), errno, dev);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* add a proc to this btl module
|
* add a proc to this btl module
|
||||||
* creates an endpoint that is setup on the
|
* creates an endpoint that is setup on the
|
||||||
@ -745,7 +763,7 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
|
|||||||
/* Allocate Protection Domain */
|
/* Allocate Protection Domain */
|
||||||
openib_btl->poll_cq = false;
|
openib_btl->poll_cq = false;
|
||||||
|
|
||||||
if(mca_btl_openib_component.use_srq) {
|
if (mca_btl_openib_component.use_srq) {
|
||||||
|
|
||||||
struct ibv_srq_init_attr attr;
|
struct ibv_srq_init_attr attr;
|
||||||
attr.attr.max_wr = mca_btl_openib_component.srq_rd_max;
|
attr.attr.max_wr = mca_btl_openib_component.srq_rd_max;
|
||||||
@ -756,15 +774,17 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
|
|||||||
|
|
||||||
openib_btl->srq[BTL_OPENIB_HP_QP] =
|
openib_btl->srq[BTL_OPENIB_HP_QP] =
|
||||||
ibv_create_srq(openib_btl->hca->ib_pd, &attr);
|
ibv_create_srq(openib_btl->hca->ib_pd, &attr);
|
||||||
if(NULL == openib_btl->srq[BTL_OPENIB_HP_QP]) {
|
if (NULL == openib_btl->srq[BTL_OPENIB_HP_QP]) {
|
||||||
BTL_ERROR(("error in ibv_create_srq\n"));
|
show_init_error(__FILE__, __LINE__, "ibv_create_srq",
|
||||||
|
ibv_get_device_name(openib_btl->hca->ib_dev));
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
openib_btl->srq[BTL_OPENIB_LP_QP] =
|
openib_btl->srq[BTL_OPENIB_LP_QP] =
|
||||||
ibv_create_srq(openib_btl->hca->ib_pd, &attr);
|
ibv_create_srq(openib_btl->hca->ib_pd, &attr);
|
||||||
if(NULL == openib_btl->srq[BTL_OPENIB_LP_QP]) {
|
if (NULL == openib_btl->srq[BTL_OPENIB_LP_QP]) {
|
||||||
BTL_ERROR(("error in ibv_create_srq\n"));
|
show_init_error(__FILE__, __LINE__, "ibv_create_srq",
|
||||||
|
ibv_get_device_name(openib_btl->hca->ib_dev));
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -797,19 +817,16 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
|
|||||||
#endif
|
#endif
|
||||||
#endif /* OMPI_ENABLE_PROGRESS_THREADS */
|
#endif /* OMPI_ENABLE_PROGRESS_THREADS */
|
||||||
|
|
||||||
if(NULL == openib_btl->ib_cq[BTL_OPENIB_LP_QP]) {
|
if (NULL == openib_btl->ib_cq[BTL_OPENIB_LP_QP]) {
|
||||||
BTL_ERROR(("error creating low priority cq for %s errno says %s\n",
|
show_init_error(__FILE__, __LINE__, "ibv_create_srq",
|
||||||
ibv_get_device_name(openib_btl->hca->ib_dev),
|
ibv_get_device_name(openib_btl->hca->ib_dev));
|
||||||
strerror(errno)));
|
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||||
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_LP_QP], 0)) {
|
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_LP_QP], 0)) {
|
||||||
BTL_ERROR(("error requesting low priority cq notification for %s"
|
show_init_error(__FILE__, __LINE__, "ibv_req_notify_cq",
|
||||||
" errno says %s\n",
|
ibv_get_device_name(openib_btl->hca->ib_dev));
|
||||||
ibv_get_device_name(openib_btl->hca->ib_dev),
|
|
||||||
strerror(errno)));
|
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -835,18 +852,15 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
|
|||||||
#endif /* OMPI_ENABLE_PROGRESS_THREADS */
|
#endif /* OMPI_ENABLE_PROGRESS_THREADS */
|
||||||
|
|
||||||
if(NULL == openib_btl->ib_cq[BTL_OPENIB_HP_QP]) {
|
if(NULL == openib_btl->ib_cq[BTL_OPENIB_HP_QP]) {
|
||||||
BTL_ERROR(("error creating high priority cq for %s errno says %s\n",
|
show_init_error(__FILE__, __LINE__, "ibv_create_cq",
|
||||||
ibv_get_device_name(openib_btl->hca->ib_dev),
|
ibv_get_device_name(openib_btl->hca->ib_dev));
|
||||||
strerror(errno)));
|
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
#if OMPI_ENABLE_PROGRESS_THREADS == 1
|
||||||
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_HP_QP], 0)) {
|
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_HP_QP], 0)) {
|
||||||
BTL_ERROR(("error requesting high priority cq notification for %s"
|
show_init_error(__FILE__, __LINE__, "ibv_req_notify_cq",
|
||||||
" errno says %s\n",
|
ibv_get_device_name(openib_btl->hca->ib_dev));
|
||||||
ibv_get_device_name(openib_btl->hca->ib_dev),
|
|
||||||
strerror(errno)));
|
|
||||||
return OMPI_ERROR;
|
return OMPI_ERROR;
|
||||||
}
|
}
|
||||||
OPAL_THREAD_LOCK(&openib_btl->hca->hca_lock);
|
OPAL_THREAD_LOCK(&openib_btl->hca->hca_lock);
|
||||||
|
@ -92,6 +92,35 @@ btl_openib_hca_param_files MCA parameter to set values for your HCA.
|
|||||||
|
|
||||||
NOTE: You can turn off this warning by setting the MCA parameter
|
NOTE: You can turn off this warning by setting the MCA parameter
|
||||||
btl_openib_warn_no_hca_params_found to 0.
|
btl_openib_warn_no_hca_params_found to 0.
|
||||||
|
[init-fail-no-mem]
|
||||||
|
The OpenIB BTL failed to initialize while trying to allocate some
|
||||||
|
locked memory. This typically can indicate that the memlock limits
|
||||||
|
are set too low. For most HPC installations, the memlock limits
|
||||||
|
should be set to "unlimited". The failure occured here:
|
||||||
|
|
||||||
|
Host: %s
|
||||||
|
OMPI source: %s:%d
|
||||||
|
Function: %s()
|
||||||
|
Device: %s
|
||||||
|
|
||||||
|
You may need to consult with your system administrator to get this
|
||||||
|
problem fixed. This FAQ entry on the Open MPI web site may also be
|
||||||
|
helpful:
|
||||||
|
|
||||||
|
http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
|
||||||
|
[init-fail-create-q]
|
||||||
|
The OpenIB BTL failed to initialize while trying to create an internal
|
||||||
|
queue. This typically indicates a failed OpenFabrics installation or
|
||||||
|
faulty hardware. The failure occured here:
|
||||||
|
|
||||||
|
Host: %s
|
||||||
|
OMPI source: %s:%d
|
||||||
|
Function: %s()
|
||||||
|
Error: %s (errno=%d)
|
||||||
|
Device: %s
|
||||||
|
|
||||||
|
You may need to consult with your system administrator to get this
|
||||||
|
problem fixed.
|
||||||
[btl_openib:retry-exceeded]
|
[btl_openib:retry-exceeded]
|
||||||
The InfiniBand retry count between two MPI processes has been
|
The InfiniBand retry count between two MPI processes has been
|
||||||
exceeded. "Retry count" is defined in the InfiniBand spec 1.2
|
exceeded. "Retry count" is defined in the InfiniBand spec 1.2
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user