1
1

Print a [much] better error message when MPI processes are unable to

reach each other (this problem just bit me; I had forgotten how horrid
our previous error message was).

This commit was SVN r19548.
Этот коммит содержится в:
Jeff Squyres 2008-09-11 20:52:58 +00:00
родитель eeabae49b9
Коммит f794580bbe
2 изменённых файлов: 51 добавлений и 10 удалений

Просмотреть файл

@ -24,6 +24,7 @@
#include <stdlib.h>
#include <string.h>
#include "opal/util/argv.h"
#include "orte/util/show_help.h"
#include "ompi/class/ompi_bitmap.h"
#include "ompi/mca/bml/bml.h"
@ -59,6 +60,9 @@ mca_bml_r2_module_t mca_bml_r2 = {
};
/* Names of all the BTL components that this BML is aware of */
static char *btl_names = NULL;
static inline unsigned int bml_base_log2(unsigned long val) {
unsigned int count = 0;
@ -101,9 +105,11 @@ int mca_bml_r2_progress( void )
static int mca_bml_r2_add_btls( void )
{
int i;
opal_list_t *btls = NULL;
mca_btl_base_selected_module_t* selected_btl;
size_t num_btls = 0;
char **btl_names_argv = NULL;
if(true == mca_bml_r2.btls_added) {
return OMPI_SUCCESS;
@ -129,6 +135,23 @@ static int mca_bml_r2_add_btls( void )
selected_btl = (mca_btl_base_selected_module_t*)opal_list_get_next(selected_btl)) {
mca_btl_base_module_t *btl = selected_btl->btl_module;
mca_bml_r2.btl_modules[mca_bml_r2.num_btl_modules++] = btl;
for (i = 0; NULL != btl_names_argv && NULL != btl_names_argv[i]; ++i) {
if (0 ==
strcmp(btl_names_argv[i],
btl->btl_component->btl_version.mca_component_name)) {
break;
}
}
if (NULL == btl_names_argv || NULL == btl_names_argv[i]) {
opal_argv_append_nosize(&btl_names_argv,
btl->btl_component->btl_version.mca_component_name);
}
}
if (NULL != btl_names_argv) {
btl_names = opal_argv_join(btl_names_argv, ' ');
opal_argv_free(btl_names_argv);
} else {
btl_names = strdup("no devices available");
}
/* sort r2 list by exclusivity */
@ -206,11 +229,11 @@ int mca_bml_r2_add_procs( size_t nprocs,
if (NULL == btl_endpoints) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
int btl_inuse = 0;
/* if the r2 can reach the destination proc it sets the
* corresponding bit (proc index) in the reachable bitmap
* and can return addressing information for each proc
@ -444,12 +467,16 @@ int mca_bml_r2_add_procs( size_t nprocs,
if (mca_bml_r2.show_unreach_errors &&
OMPI_ERR_UNREACH == ret) {
orte_show_help("help-mca-bml-r2",
"unreachable proc",
true, ORTE_NAME_PRINT(&(ompi_proc_local_proc->proc_name)),
ORTE_NAME_PRINT(&(unreach_proc->proc_name)), NULL);
true,
(ompi_proc_local_proc->proc_hostname ?
ompi_proc_local_proc->proc_hostname :
ORTE_NAME_PRINT(&(ompi_proc_local_proc->proc_name))),
(unreach_proc->proc_hostname ?
unreach_proc->proc_hostname :
ORTE_NAME_PRINT(&(unreach_proc->proc_name))),
btl_names);
}
free(new_procs);
@ -538,6 +565,11 @@ int mca_bml_r2_finalize( void ) {
size_t p, num_procs;
opal_list_item_t* w_item;
if (NULL != btl_names) {
free(btl_names);
btl_names = NULL;
}
/* Similar to mca_bml_r2_del_btl ... */
procs = ompi_proc_all(&num_procs);
if(NULL == procs)

Просмотреть файл

@ -10,6 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -19,7 +20,15 @@
# This is the US/English general help file for Open MPI.
#
[unreachable proc]
Process %s is unable to reach %s for MPI communication.
If you specified the use of a BTL component, you may have
forgotten a component (such as "self") in the list of
usable components.
At least one pair of MPI processes are unable to reach each other for
MPI communications. This means that no Open MPI device has inidicated
that it can be used to communicate between these processes. This is
an error; Open MPI requires that all MPI processes be able to reach
each other. This error can sometimes be the result of forgetting to
specify the "self" BTL.
Process 1 is on host: %s
Process 2 is on host: %s
BTLs attempted: %s
Your MPI job is now going to abort; sorry.