1
1

Make a much, much better error message for a not-uncommon failure
scenario (user/sysadmin forgot to set the memlock limits high
enough).

This commit was SVN r13289.
Этот коммит содержится в:
Jeff Squyres 2007-01-24 22:25:40 +00:00
родитель b252cb82c8
Коммит 6b69ea664d
2 изменённых файлов: 63 добавлений и 20 удалений

Просмотреть файл

@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
* *
* Additional copyrights may follow * Additional copyrights may follow
@ -21,6 +22,7 @@
#include <inttypes.h> #include <inttypes.h>
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/if.h" #include "opal/util/if.h"
#include "opal/util/show_help.h"
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h" #include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/btl_base_error.h" #include "ompi/mca/btl/base/btl_base_error.h"
@ -33,6 +35,7 @@
#include "ompi/mca/mpool/base/base.h" #include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/mpool/mpool.h" #include "ompi/mca/mpool/mpool.h"
#include "ompi/mca/mpool/rdma/mpool_rdma.h" #include "ompi/mca/mpool/rdma/mpool_rdma.h"
#include "orte/util/sys_info.h"
#include <errno.h> #include <errno.h>
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
@ -71,6 +74,21 @@ mca_btl_openib_module_t mca_btl_openib_module = {
int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size_t nprocs); int mca_btl_openib_size_queues( struct mca_btl_openib_module_t* openib_btl, size_t nprocs);
/*
 * Print a user-facing help message for a failure during openib BTL
 * initialization, using topics from help-mpi-btl-openib.txt.
 *
 * file, line  source location of the failing call (callers in this file
 *             pass __FILE__ and __LINE__)
 * func        name of the function that failed, as text for the message
 * dev         name of the device the call was made on
 *
 * The topic is selected from errno: ENOMEM selects "init-fail-no-mem";
 * any other value selects "init-fail-create-q", which additionally
 * reports the strerror() text and the numeric errno.
 *
 * errno is read once, on entry.  Anything the caller evaluates while
 * building the arguments runs before that read, so the caller must not
 * let errno be overwritten between the failing call and this one.
 */
static void show_init_error(const char *file, int line,
                            const char *func, const char *dev)
{
    /* Snapshot errno so the topic choice and the reported error text
       and number are guaranteed to describe the same value, no matter
       what the calls below do to errno. */
    const int saved_errno = errno;

    if (ENOMEM == saved_errno) {
        opal_show_help("help-mpi-btl-openib.txt", "init-fail-no-mem",
                       true, orte_system_info.nodename,
                       file, line, func, dev);
        return;
    }

    opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q",
                   true, orte_system_info.nodename,
                   file, line, func, strerror(saved_errno), saved_errno,
                   dev);
}
/* /*
* add a proc to this btl module * add a proc to this btl module
* creates an endpoint that is setup on the * creates an endpoint that is setup on the
@ -745,7 +763,7 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
/* Allocate Protection Domain */ /* Allocate Protection Domain */
openib_btl->poll_cq = false; openib_btl->poll_cq = false;
if(mca_btl_openib_component.use_srq) { if (mca_btl_openib_component.use_srq) {
struct ibv_srq_init_attr attr; struct ibv_srq_init_attr attr;
attr.attr.max_wr = mca_btl_openib_component.srq_rd_max; attr.attr.max_wr = mca_btl_openib_component.srq_rd_max;
@ -756,15 +774,17 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
openib_btl->srq[BTL_OPENIB_HP_QP] = openib_btl->srq[BTL_OPENIB_HP_QP] =
ibv_create_srq(openib_btl->hca->ib_pd, &attr); ibv_create_srq(openib_btl->hca->ib_pd, &attr);
if(NULL == openib_btl->srq[BTL_OPENIB_HP_QP]) { if (NULL == openib_btl->srq[BTL_OPENIB_HP_QP]) {
BTL_ERROR(("error in ibv_create_srq\n")); show_init_error(__FILE__, __LINE__, "ibv_create_srq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR; return OMPI_ERROR;
} }
openib_btl->srq[BTL_OPENIB_LP_QP] = openib_btl->srq[BTL_OPENIB_LP_QP] =
ibv_create_srq(openib_btl->hca->ib_pd, &attr); ibv_create_srq(openib_btl->hca->ib_pd, &attr);
if(NULL == openib_btl->srq[BTL_OPENIB_LP_QP]) { if (NULL == openib_btl->srq[BTL_OPENIB_LP_QP]) {
BTL_ERROR(("error in ibv_create_srq\n")); show_init_error(__FILE__, __LINE__, "ibv_create_srq",
ibv_get_device_name(openib_btl->hca->ib_dev));
return OMPI_ERROR; return OMPI_ERROR;
} }
@ -797,19 +817,16 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
#endif #endif
#endif /* OMPI_ENABLE_PROGRESS_THREADS */ #endif /* OMPI_ENABLE_PROGRESS_THREADS */
if(NULL == openib_btl->ib_cq[BTL_OPENIB_LP_QP]) { if (NULL == openib_btl->ib_cq[BTL_OPENIB_LP_QP]) {
BTL_ERROR(("error creating low priority cq for %s errno says %s\n", show_init_error(__FILE__, __LINE__, "ibv_create_srq",
ibv_get_device_name(openib_btl->hca->ib_dev), ibv_get_device_name(openib_btl->hca->ib_dev));
strerror(errno)));
return OMPI_ERROR; return OMPI_ERROR;
} }
#if OMPI_ENABLE_PROGRESS_THREADS == 1 #if OMPI_ENABLE_PROGRESS_THREADS == 1
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_LP_QP], 0)) { if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_LP_QP], 0)) {
BTL_ERROR(("error requesting low priority cq notification for %s" show_init_error(__FILE__, __LINE__, "ibv_req_notify_cq",
" errno says %s\n", ibv_get_device_name(openib_btl->hca->ib_dev));
ibv_get_device_name(openib_btl->hca->ib_dev),
strerror(errno)));
return OMPI_ERROR; return OMPI_ERROR;
} }
@ -835,18 +852,15 @@ int mca_btl_openib_create_cq_srq(mca_btl_openib_module_t *openib_btl)
#endif /* OMPI_ENABLE_PROGRESS_THREADS */ #endif /* OMPI_ENABLE_PROGRESS_THREADS */
if(NULL == openib_btl->ib_cq[BTL_OPENIB_HP_QP]) { if(NULL == openib_btl->ib_cq[BTL_OPENIB_HP_QP]) {
BTL_ERROR(("error creating high priority cq for %s errno says %s\n", show_init_error(__FILE__, __LINE__, "ibv_create_cq",
ibv_get_device_name(openib_btl->hca->ib_dev), ibv_get_device_name(openib_btl->hca->ib_dev));
strerror(errno)));
return OMPI_ERROR; return OMPI_ERROR;
} }
#if OMPI_ENABLE_PROGRESS_THREADS == 1 #if OMPI_ENABLE_PROGRESS_THREADS == 1
if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_HP_QP], 0)) { if(ibv_req_notify_cq(openib_btl->ib_cq[BTL_OPENIB_HP_QP], 0)) {
BTL_ERROR(("error requesting high priority cq notification for %s" show_init_error(__FILE__, __LINE__, "ibv_req_notify_cq",
" errno says %s\n", ibv_get_device_name(openib_btl->hca->ib_dev));
ibv_get_device_name(openib_btl->hca->ib_dev),
strerror(errno)));
return OMPI_ERROR; return OMPI_ERROR;
} }
OPAL_THREAD_LOCK(&openib_btl->hca->hca_lock); OPAL_THREAD_LOCK(&openib_btl->hca->hca_lock);

Просмотреть файл

@ -92,6 +92,35 @@ btl_openib_hca_param_files MCA parameter to set values for your HCA.
NOTE: You can turn off this warning by setting the MCA parameter NOTE: You can turn off this warning by setting the MCA parameter
btl_openib_warn_no_hca_params_found to 0. btl_openib_warn_no_hca_params_found to 0.
[init-fail-no-mem]
The OpenIB BTL failed to initialize while trying to allocate some
locked memory. This typically indicates that the memlock limits
are set too low. For most HPC installations, the memlock limits
should be set to "unlimited". The failure occurred here:
Host: %s
OMPI source: %s:%d
Function: %s()
Device: %s
You may need to consult with your system administrator to get this
problem fixed. This FAQ entry on the Open MPI web site may also be
helpful:
http://www.open-mpi.org/faq/?category=openfabrics#ib-locked-pages
[init-fail-create-q]
The OpenIB BTL failed to initialize while trying to create an internal
queue. This typically indicates a failed OpenFabrics installation or
faulty hardware. The failure occurred here:
Host: %s
OMPI source: %s:%d
Function: %s()
Error: %s (errno=%d)
Device: %s
You may need to consult with your system administrator to get this
problem fixed.
[btl_openib:retry-exceeded] [btl_openib:retry-exceeded]
The InfiniBand retry count between two MPI processes has been The InfiniBand retry count between two MPI processes has been
exceeded. "Retry count" is defined in the InfiniBand spec 1.2 exceeded. "Retry count" is defined in the InfiniBand spec 1.2