1
1

* pretty-print an error message if a btl component loads but can't find
  any NICs to use
* Make mvapi, gm, and mx components all publish information, even if there
  are no NICs available, so that modex_recv doesn't hang.  If there are no
  NICs available, don't set the reachable bit, but don't do anything
  to fail.  This unfortunately doesn't cover the hangs that will result if
  different procs load different sets of components, but it's a start.

This commit was SVN r7550.
Этот коммит содержится в:
Brian Barrett 2005-09-30 04:39:44 +00:00
родитель e0c3775551
Коммит 7b20370306
13 изменённых файлов: 126 добавлений и 36 удалений

Просмотреть файл

@ -21,6 +21,8 @@ noinst_LTLIBRARIES = libmca_btl_base.la
# Source code files # Source code files
pkgdata_DATA = help-mpi-btl-base.txt
headers = \ headers = \
base.h \ base.h \
btl_base_error.h btl_base_error.h

Просмотреть файл

@ -20,6 +20,8 @@
#include <stdarg.h> #include <stdarg.h>
#include "btl_base_error.h" #include "btl_base_error.h"
#include "opal/util/show_help.h"
#include "orte/util/sys_info.h"
int mca_btl_base_debug; int mca_btl_base_debug;
@ -47,3 +49,15 @@ int mca_btl_base_out(const char* fmt, ...)
} }
/*
 * Show the "btl:no-nics" help message from help-mpi-btl-base.txt.
 *
 * The message is filled in with, in order: this process's name formatted
 * as "[a,b,c]", the transport name, the local node name, and the NIC
 * noun (the help template appends an "s" to pluralize it).
 *
 * transport - human-readable transport name, e.g. "Myrinet/GM" or "MVAPI"
 * nic_name  - singular noun for the device, e.g. "NIC" or "HCA"
 */
void mca_btl_base_error_no_nics(const char* transport,
                                const char* nic_name)
{
    char *procid = NULL;

    /* On failure asprintf returns a negative value and leaves the target
     * pointer undefined, so reset it; otherwise we would hand garbage to
     * opal_show_help() and free(). */
    if (asprintf(&procid, "[%lu,%lu,%lu]",
                 ORTE_NAME_ARGS(orte_process_info.my_name)) < 0) {
        procid = NULL;
    }

    /* Still emit the warning even if the process name could not be
     * formatted; a placeholder is better than staying silent. */
    opal_show_help("help-mpi-btl-base.txt", "btl:no-nics",
                   true, (NULL != procid) ? procid : "[unknown]",
                   transport, orte_system_info.nodename,
                   nic_name);

    free(procid);   /* free(NULL) is a no-op */
}

Просмотреть файл

@ -73,3 +73,15 @@ do { \
#endif #endif
#endif #endif
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
/*
 * Print a user-facing warning that a BTL component loaded but found no
 * usable devices.
 *
 * transport - human-readable transport name (callers pass strings such
 *             as "Myrinet/GM", "Myrinet/MX" or "MVAPI")
 * nic_name  - singular noun for the device (callers pass "NIC" or "HCA")
 */
extern void mca_btl_base_error_no_nics(const char* transport,
const char* nic_name);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

22
ompi/mca/btl/base/help-mpi-btl-base.txt Обычный файл
Просмотреть файл

@ -0,0 +1,22 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University.
# All rights reserved.
# Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
# All rights reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for Open MPI.
#
[btl:no-nics]
%s: %s on host %s was unable to find any %ss.
Another transport will be used instead, although this may result in
lower performance.

Просмотреть файл

@ -81,7 +81,7 @@ int mca_btl_gm_add_procs(
continue; continue;
if(NULL == (gm_proc = mca_btl_gm_proc_create(ompi_proc))) { if(NULL == (gm_proc = mca_btl_gm_proc_create(ompi_proc))) {
return OMPI_ERR_OUT_OF_RESOURCE; continue;
} }
/* /*

Просмотреть файл

@ -32,6 +32,7 @@
#include "btl_gm_frag.h" #include "btl_gm_frag.h"
#include "btl_gm_endpoint.h" #include "btl_gm_endpoint.h"
#include "ompi/mca/btl/base/base.h" #include "ompi/mca/btl/base/base.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#include "ompi/datatype/convertor.h" #include "ompi/datatype/convertor.h"
#include "btl_gm_endpoint.h" #include "btl_gm_endpoint.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
@ -405,20 +406,24 @@ mca_btl_gm_modex_send(void)
int rc; int rc;
size_t i; size_t i;
size_t size; size_t size;
mca_btl_gm_addr_t *addrs; mca_btl_gm_addr_t *addrs = NULL;
size = mca_btl_gm_component.gm_num_btls * sizeof (mca_btl_gm_addr_t); size = mca_btl_gm_component.gm_num_btls * sizeof (mca_btl_gm_addr_t);
addrs = (mca_btl_gm_addr_t *)malloc (size); if (0 != size) {
if (NULL == addrs) { addrs = (mca_btl_gm_addr_t *)malloc (size);
return OMPI_ERR_OUT_OF_RESOURCE; if (NULL == addrs) {
} return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i = 0; i < mca_btl_gm_component.gm_num_btls; i++) { for (i = 0; i < mca_btl_gm_component.gm_num_btls; i++) {
mca_btl_gm_module_t *btl = mca_btl_gm_component.gm_btls[i]; mca_btl_gm_module_t *btl = mca_btl_gm_component.gm_btls[i];
addrs[i] = btl->gm_addr; addrs[i] = btl->gm_addr;
}
} }
rc = mca_pml_base_modex_send (&mca_btl_gm_component.super.btl_version, addrs, size); rc = mca_pml_base_modex_send (&mca_btl_gm_component.super.btl_version, addrs, size);
free (addrs); if (NULL != addrs) {
free (addrs);
}
return rc; return rc;
} }
@ -439,6 +444,8 @@ mca_btl_gm_component_init (int *num_btl_modules,
/* try to initialize GM */ /* try to initialize GM */
if( GM_SUCCESS != gm_init() ) { if( GM_SUCCESS != gm_init() ) {
opal_output( 0, "[%s:%d] error in initializing the gm library\n", __FILE__, __LINE__ ); opal_output( 0, "[%s:%d] error in initializing the gm library\n", __FILE__, __LINE__ );
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
return NULL; return NULL;
} }
@ -451,9 +458,15 @@ mca_btl_gm_component_init (int *num_btl_modules,
/* initialize gm */ /* initialize gm */
if (OMPI_SUCCESS != mca_btl_gm_discover()) { if (OMPI_SUCCESS != mca_btl_gm_discover()) {
mca_btl_base_error_no_nics("Myrinet/GM", "NIC");
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
return NULL; return NULL;
} }
if (mca_btl_gm_component.gm_num_btls == 0) { if (mca_btl_gm_component.gm_num_btls == 0) {
mca_btl_base_error_no_nics("Myrinet/GM", "NIC");
mca_btl_gm_component.gm_num_btls = 0;
mca_btl_gm_modex_send();
return NULL; return NULL;
} }

Просмотреть файл

@ -136,8 +136,12 @@ mca_btl_gm_proc_t* mca_btl_gm_proc_create(ompi_proc_t* ompi_proc)
} }
gm_proc->proc_addr_count = size/sizeof(mca_btl_gm_addr_t); gm_proc->proc_addr_count = size/sizeof(mca_btl_gm_addr_t);
gm_proc->proc_endpoints = (mca_btl_base_endpoint_t**) if (0 == gm_proc->proc_addr_count) {
malloc(gm_proc->proc_addr_count * sizeof(mca_btl_base_endpoint_t*)); gm_proc->proc_endpoints = NULL;
} else {
gm_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(gm_proc->proc_addr_count * sizeof(mca_btl_base_endpoint_t*));
}
if(NULL == gm_proc->proc_endpoints) { if(NULL == gm_proc->proc_endpoints) {
OBJ_RELEASE(gm_proc); OBJ_RELEASE(gm_proc);
return NULL; return NULL;

Просмотреть файл

@ -82,7 +82,7 @@ int mca_btl_mvapi_add_procs(
mca_btl_base_endpoint_t* ib_peer; mca_btl_base_endpoint_t* ib_peer;
if(NULL == (ib_proc = mca_btl_mvapi_proc_create(ompi_proc))) { if(NULL == (ib_proc = mca_btl_mvapi_proc_create(ompi_proc))) {
return OMPI_ERR_OUT_OF_RESOURCE; continue;
} }
/* /*

Просмотреть файл

@ -250,20 +250,24 @@ mca_btl_mvapi_modex_send(void)
int rc; int rc;
size_t i; size_t i;
size_t size; size_t size;
mca_btl_mvapi_port_info_t *ports; mca_btl_mvapi_port_info_t *ports = NULL;
size = mca_btl_mvapi_component.ib_num_btls * sizeof (mca_btl_mvapi_port_info_t); size = mca_btl_mvapi_component.ib_num_btls * sizeof (mca_btl_mvapi_port_info_t);
ports = (mca_btl_mvapi_port_info_t *)malloc (size); if (size != 0) {
if (NULL == ports) { ports = (mca_btl_mvapi_port_info_t *)malloc (size);
return OMPI_ERR_OUT_OF_RESOURCE; if (NULL == ports) {
} return OMPI_ERR_OUT_OF_RESOURCE;
}
for (i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) { for (i = 0; i < mca_btl_mvapi_component.ib_num_btls; i++) {
mca_btl_mvapi_module_t *btl = &mca_btl_mvapi_component.mvapi_btls[i]; mca_btl_mvapi_module_t *btl = &mca_btl_mvapi_component.mvapi_btls[i];
ports[i] = btl->port_info; ports[i] = btl->port_info;
}
} }
rc = mca_pml_base_modex_send (&mca_btl_mvapi_component.super.btl_version, ports, size); rc = mca_pml_base_modex_send (&mca_btl_mvapi_component.super.btl_version, ports, size);
free (ports); if (NULL != ports) {
free (ports);
}
return rc; return rc;
} }
@ -304,7 +308,9 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
/* Determine the number of hca's available on the host */ /* Determine the number of hca's available on the host */
vapi_ret=EVAPI_list_hcas(0, &num_hcas, NULL); vapi_ret=EVAPI_list_hcas(0, &num_hcas, NULL);
if( VAPI_EAGAIN != vapi_ret || 0 == num_hcas ) { if( VAPI_EAGAIN != vapi_ret || 0 == num_hcas ) {
BTL_ERROR(("No hca's found on this host!")); mca_btl_base_error_no_nics("MVAPI", "HCA");
mca_btl_mvapi_component.ib_num_btls = 0;
mca_btl_mvapi_modex_send();
return NULL; return NULL;
} }
@ -375,9 +381,8 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
} }
if(0 == mca_btl_mvapi_component.ib_num_btls){ if(0 == mca_btl_mvapi_component.ib_num_btls){
char hostname[32]; mca_btl_base_error_no_nics("MVAPI", "HCA");
gethostname(hostname, sizeof(hostname)); mca_btl_mvapi_modex_send();
BTL_ERROR(("no mvapi btl's found on this host(%s)!", hostname));
return NULL; return NULL;
} }
/* Allocate space for btl modules */ /* Allocate space for btl modules */

Просмотреть файл

@ -155,8 +155,12 @@ mca_btl_mvapi_proc_t* mca_btl_mvapi_proc_create(ompi_proc_t* ompi_proc)
mvapi_proc->proc_port_count = size/sizeof(mca_btl_mvapi_port_info_t); mvapi_proc->proc_port_count = size/sizeof(mca_btl_mvapi_port_info_t);
mvapi_proc->proc_endpoints = (mca_btl_base_endpoint_t**) if (0 == mvapi_proc->proc_port_count) {
malloc(mvapi_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*)); mvapi_proc->proc_endpoints = NULL;
} else {
mvapi_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
malloc(mvapi_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*));
}
if(NULL == mvapi_proc->proc_endpoints) { if(NULL == mvapi_proc->proc_endpoints) {
OBJ_RELEASE(mvapi_proc); OBJ_RELEASE(mvapi_proc);

Просмотреть файл

@ -82,7 +82,7 @@ int mca_btl_mx_add_procs(
} }
if(NULL == (mx_proc = mca_btl_mx_proc_create(ompi_proc))) { if(NULL == (mx_proc = mca_btl_mx_proc_create(ompi_proc))) {
return OMPI_ERR_OUT_OF_RESOURCE; continue;
} }
/* /*

Просмотреть файл

@ -32,6 +32,7 @@
#include "btl_mx_frag.h" #include "btl_mx_frag.h"
#include "btl_mx_endpoint.h" #include "btl_mx_endpoint.h"
#include "mca/btl/base/base.h" #include "mca/btl/base/base.h"
#include "mca/btl/base/btl_base_error.h"
mca_btl_mx_component_t mca_btl_mx_component = { mca_btl_mx_component_t mca_btl_mx_component = {
{ {
@ -307,6 +308,15 @@ mca_btl_base_module_t** mca_btl_mx_component_init(int *num_btl_modules,
if( (status = mx_get_info( NULL, MX_NIC_COUNT, NULL, 0, if( (status = mx_get_info( NULL, MX_NIC_COUNT, NULL, 0,
&mca_btl_mx_component.mx_num_btls, sizeof(uint32_t))) != MX_SUCCESS ) { &mca_btl_mx_component.mx_num_btls, sizeof(uint32_t))) != MX_SUCCESS ) {
opal_output(0, "mca_btl_mx_component_init: mx_get_info(MX_NIC_COUNT) failed with status=%d\n", status); opal_output(0, "mca_btl_mx_component_init: mx_get_info(MX_NIC_COUNT) failed with status=%d\n", status);
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
NULL, 0);
return NULL;
}
if (0 == mca_btl_mx_component.mx_num_btls) {
mca_btl_base_error_no_nics("Myrinet/MX", "NIC");
mca_pml_base_modex_send(&mca_btl_mx_component.super.btl_version,
NULL, 0);
return NULL; return NULL;
} }

Просмотреть файл

@ -321,10 +321,12 @@ static void mca_pml_base_modex_registry_callback(
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
continue; continue;
} }
} if (ORTE_SUCCESS != (rc = orte_dps.unpack(&buffer, bytes, &num_bytes, ORTE_BYTE))) {
if (ORTE_SUCCESS != (rc = orte_dps.unpack(&buffer, bytes, &num_bytes, ORTE_BYTE))) { ORTE_ERROR_LOG(rc);
ORTE_ERROR_LOG(rc); continue;
continue; }
} else {
bytes = NULL;
} }
/* /*
@ -514,7 +516,7 @@ int mca_pml_base_modex_send(
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return OMPI_ERR_OUT_OF_RESOURCE; return OMPI_ERR_OUT_OF_RESOURCE;
} }
memcpy((value.byteobject.bytes, data, size); memcpy((value.byteobject.bytes, data, size));
asprintf(&((value->keyvals[0])->key), "modex-%s-%s-%d-%d", asprintf(&((value->keyvals[0])->key), "modex-%s-%s-%d-%d",
source_component->mca_type_name, source_component->mca_type_name,
@ -540,8 +542,10 @@ int mca_pml_base_modex_send(
if (ORTE_SUCCESS != (rc = orte_dps.pack(&buffer, &size, 1, ORTE_SIZE))) { if (ORTE_SUCCESS != (rc = orte_dps.pack(&buffer, &size, 1, ORTE_SIZE))) {
goto cleanup; goto cleanup;
} }
if (ORTE_SUCCESS != (rc = orte_dps.pack(&buffer, (void*)data, size, ORTE_BYTE))) { if (0 != size) {
goto cleanup; if (ORTE_SUCCESS != (rc = orte_dps.pack(&buffer, (void*)data, size, ORTE_BYTE))) {
goto cleanup;
}
} }
if (ORTE_SUCCESS != (rc = orte_dps.unload(&buffer, (void**)&value.byteobject.bytes, if (ORTE_SUCCESS != (rc = orte_dps.unload(&buffer, (void**)&value.byteobject.bytes,
(size_t*)&value.byteobject.size))) { (size_t*)&value.byteobject.size))) {