1
1

* pretty-print an error message if a btl component loads but can't find
  any NICs to use
* Make mvapi, gm, and mx components all publish information, even if there
  are no NICs available so that modex_recv doesn't hang.  If there are no
  NICs available, don't set the reachable bit, but don't do anything
  to fail.  This unfortunately doesn't cover the hangs that will result if
  different procs load different sets of components, but it's a start

This commit was SVN r7550.
Этот коммит содержится в:
Brian Barrett 2005-09-30 04:39:44 +00:00
родитель e0c3775551
Коммит 7b20370306
13 изменённых файлов: 126 добавлений и 36 удалений

@ -21,6 +21,8 @@ noinst_LTLIBRARIES = libmca_btl_base.la
# Source code files
pkgdata_DATA = help-mpi-btl-base.txt
headers = \
base.h \
btl_base_error.h

@ -20,6 +20,8 @@
#include <stdarg.h>
#include "btl_base_error.h"
#include "opal/util/show_help.h"
#include "orte/util/sys_info.h"
int mca_btl_base_debug;
@ -47,3 +49,15 @@ int mca_btl_base_out(const char* fmt, ...)
}
/**
 * Show the "btl:no-nics" help message for a BTL component that loaded
 * but could not find any device to use.
 *
 * The message is taken from help-mpi-btl-base.txt and is filled in with
 * this process's name, the transport name, the local node name
 * (orte_system_info.nodename) and the device noun.
 *
 * @param transport  Transport name placed in the message
 *                   (callers pass e.g. "Myrinet/GM" or "MVAPI").
 * @param nic_name   Singular device noun placed in the message
 *                   (callers pass e.g. "NIC" or "HCA").
 */
void mca_btl_base_error_no_nics(const char* transport,
                                const char* nic_name)
{
    char *procid = NULL;

    /* When asprintf() fails the output pointer is left unspecified, so
     * reset it instead of handing garbage to opal_show_help()/free(). */
    if (asprintf(&procid, "[%lu,%lu,%lu]",
                 ORTE_NAME_ARGS(orte_process_info.my_name)) < 0) {
        procid = NULL;
    }

    /* Still emit the warning if the process name could not be built. */
    opal_show_help("help-mpi-btl-base.txt", "btl:no-nics",
                   true, (NULL != procid) ? procid : "[unknown]",
                   transport, orte_system_info.nodename,
                   nic_name);

    free(procid);
}

@ -73,3 +73,15 @@ do { \
#endif
#endif
/* Give the declaration below C linkage when included from C++. */
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/**
 * Show the "btl:no-nics" help message for a BTL component that loaded
 * but could not find any device to use.
 *
 * @param transport  Transport name placed in the message.
 * @param nic_name   Singular device noun placed in the message; the
 *                   help text appends an "s" to it.
 */
extern void mca_btl_base_error_no_nics(const char* transport,
const char* nic_name);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

22
ompi/mca/btl/base/help-mpi-btl-base.txt Обычный файл

@ -0,0 +1,22 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open MPI.
#
[btl:no-nics]
%s: %s on host %s was unable to find any %ss.
Another transport will be used instead, although this may result in
lower performance.

@ -81,7 +81,7 @@ int mca_btl_gm_add_procs(
continue;
if(NULL == (gm_proc = mca_btl_gm_proc_create(ompi_proc))) {
return OMPI_ERR_OUT_OF_RESOURCE;
continue;
}
/*

@ -32,6 +32,7 @@
#include "btl_gm_frag.h"
#include "btl_gm_endpoint.h"
#include "ompi/mca/btl/base/base.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/datatype/convertor.h"
#include "btl_gm_endpoint.h"
#include "orte/util/proc_info.h"
@ -405,20 +406,24 @@ mca_btl_gm_modex_send(void)
int rc;
size_t i;
size_t size;
mca_btl_gm_addr_t *addrs;
mca_btl_gm_addr_t *addrs = NULL;
size = mca_btl_gm_component.gm_num_btls * sizeof (mca_btl_gm_addr_t);
addrs = (mca_btl_gm_addr_t *)malloc (size);
if (NULL == addrs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
if (0 != size) {
addrs = (mca_btl_gm_addr_t *)malloc (size);
if (NULL == addrs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i = 0; i < mca_btl_gm_component.gm_num_btls; i++) {
mca_btl_gm_module_t *btl = mca_btl_gm_component.gm_btls[i];
addrs[i] = btl->gm_addr;
for (i = 0; i < mca_btl_gm_component.gm_num_btls; i++) {
mca_btl_gm_module_t *btl = mca_btl_gm_component.gm_btls[i];
addrs[i] = btl->gm_addr;
}
}
rc = mca_pml_base_modex_send (&mca_btl_gm_component.super.btl_version, addrs, size);
free (addrs);
if (NULL != addrs) {
free (addrs);
}
return rc;
}
@ -439,6 +444,8 @@ mca_btl_gm_component_init (int *num_btl_modules,
/* try to initialize GM */
if( GM_SUCCESS != gm_init() ) {
opal_output( 0, "[%s:%d] error in initializing the gm library\n", __FILE__, __LINE__ );
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
return NULL;
}
@ -451,9 +458,15 @@ mca_btl_gm_component_init (int *num_btl_modules,
/* initialize gm */
if (OMPI_SUCCESS != mca_btl_gm_discover()) {
mca_btl_base_error_no_nics("Myrinet/GM", "NIC");
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
return NULL;
}
if (mca_btl_gm_component.gm_num_btls == 0) {
mca_btl_base_error_no_nics("Myrinet/GM", "NIC");
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
return NULL;
}

@ -136,8 +136,12 @@ mca_btl_gm_proc_t* mca_btl_gm_proc_create(ompi_proc_t* ompi_proc)
}
gm_proc->proc_addr_count = size/sizeof(mca_btl_gm_addr_t);
gm_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(gm_proc->proc_addr_count * sizeof(mca_btl_base_endpoint_t*));
if (0 == gm_proc->proc_addr_count) {
gm_proc->proc_endpoints = NULL;
} else {
gm_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(gm_proc->proc_addr_count * sizeof(mca_btl_base_endpoint_t*));
}
if(NULL == gm_proc->proc_endpoints) {
OBJ_RELEASE(gm_proc);
return NULL;

@ -82,7 +82,7 @@ int mca_btl_mvapi_add_procs(
mca_btl_base_endpoint_t* ib_peer;
if(NULL == (ib_proc = mca_btl_mvapi_proc_create(ompi_proc))) {
return OMPI_ERR_OUT_OF_RESOURCE;
continue;
}
/*

@ -250,20 +250,24 @@ mca_btl_mvapi_modex_send(void)
int rc;
size_t i;
size_t size;
mca_btl_mvapi_port_info_t *ports;
mca_btl_mvapi_port_info_t *ports = NULL;
size = mca_btl_mvapi_component.ib_num_btls * sizeof (mca_btl_mvapi_port_info_t);
ports = (mca_btl_mvapi_port_info_t *)malloc (size);
if (NULL == ports) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
if (size != 0) {
ports = (mca_btl_mvapi_port_info_t *)malloc (size);
if (NULL == ports) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) {
mca_btl_mvapi_module_t *btl = &mca_btl_mvapi_component.mvapi_btls[i];
ports[i] = btl->port_info;
for (i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) {
mca_btl_mvapi_module_t *btl = &mca_btl_mvapi_component.mvapi_btls[i];
ports[i] = btl->port_info;
}
}
rc = mca_pml_base_modex_send (&mca_btl_mvapi_component.super.btl_version, ports, size);
free (ports);
if (NULL != ports) {
free (ports);
}
return rc;
}
@ -304,7 +308,9 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
/* Determine the number of hca's available on the host */
vapi_ret=EVAPI_list_hcas(0, &num_hcas, NULL);
if( VAPI_EAGAIN != vapi_ret || 0 == num_hcas ) {
BTL_ERROR(("No hca's found on this host!"));
mca_btl_base_error_no_nics("MVAPI", "HCA");
mca_btl_mvapi_component.ib_num_btls = 0;
mca_btl_mvapi_modex_send();
return NULL;
}
@ -375,9 +381,8 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
}
if(0 == mca_btl_mvapi_component.ib_num_btls){
char hostname[32];
gethostname(hostname, sizeof(hostname));
BTL_ERROR(("no mvapi btl's found on this host(%s)!", hostname));
mca_btl_base_error_no_nics("MVAPI", "HCA");
mca_btl_mvapi_modex_send();
return NULL;
}
/* Allocate space for btl modules */

@ -155,8 +155,12 @@ mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_create(ompi_proc_t* ompi_proc)
mvapi_proc->proc_port_count = size/sizeof(mca_btl_mvapi_port_info_t);
mvapi_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(mvapi_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
if (0 == mvapi_proc->proc_port_count) {
mvapi_proc->proc_endpoints = NULL;
} else {
mvapi_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(mvapi_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
}
if(NULL == mvapi_proc->proc_endpoints) {
OBJ_RELEASE(mvapi_proc);

@ -82,7 +82,7 @@ int mca_btl_mx_add_procs(
}
if(NULL == (mx_proc = mca_btl_mx_proc_create(ompi_proc))) {
return OMPI_ERR_OUT_OF_RESOURCE;
continue;
}
/*

@ -32,6 +32,7 @@
#include "btl_mx_frag.h"
#include "btl_mx_endpoint.h"
#include "mca/btl/base/base.h"
#include "mca/btl/base/btl_base_error.h"
mca_btl_mx_component_t mca_btl_mx_component = {
{
@ -307,6 +308,15 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
if( (status = mx_get_info( NULL, MX_NIC_COUNT, NULL, 0,
&mca_btl_mx_component.mx_num_btls, sizeof(uint32_t))) != MX_SUCCESS ) {
opal_output(0, "mca_btl_mx_component_init: mx_get_info(MX_NIC_COUNT) failed with status=%d\n", status);
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
NULL, 0);
return NULL;
}
if (0 == mca_btl_mx_component.mx_num_btls) {
mca_btl_base_error_no_nics("Myrinet/MX", "NIC");
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
NULL, 0);
return NULL;
}

@ -321,10 +321,12 @@ static void mca_pml_base_modex_registry_callback(
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
continue;
}
}
if (ORTE_SUCCESS != (rc = orte_dps.unpack(&buffer, bytes, &num_bytes, ORTE_BYTE))) {
ORTE_ERROR_LOG(rc);
continue;
if (ORTE_SUCCESS != (rc = orte_dps.unpack(&buffer, bytes, &num_bytes, ORTE_BYTE))) {
ORTE_ERROR_LOG(rc);
continue;
}
} else {
bytes = NULL;
}
/*
@ -514,7 +516,7 @@ int mca_pml_base_modex_send(
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return OMPI_ERR_OUT_OF_RESOURCE;
}
memcpy((value.byteobject.bytes, data, size);
memcpy((value.byteobject.bytes, data, size));
asprintf(&((value->keyvals[0])->key), "modex-%s-%s-%d-%d",
source_component->mca_type_name,
@ -540,8 +542,10 @@ int mca_pml_base_modex_send(
if (ORTE_SUCCESS != (rc = orte_dps.pack(&buffer, &size, 1, ORTE_SIZE))) {
goto cleanup;
}
if (ORTE_SUCCESS != (rc = orte_dps.pack(&buffer, (void*)data, size, ORTE_BYTE))) {
goto cleanup;
if (0 != size) {
if (ORTE_SUCCESS != (rc = orte_dps.pack(&buffer, (void*)data, size, ORTE_BYTE))) {
goto cleanup;
}
}
if (ORTE_SUCCESS != (rc = orte_dps.unload(&buffer, (void**)&value.byteobject.bytes,
(size_t*)&value.byteobject.size))) {