diff --git a/ompi/runtime/help-mpi-runtime.txt b/ompi/runtime/help-mpi-runtime.txt index 67dd78ebf0..8df6955dd8 100644 --- a/ompi/runtime/help-mpi-runtime.txt +++ b/ompi/runtime/help-mpi-runtime.txt @@ -10,7 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -29,6 +29,23 @@ developer): %s --> Returned "%s" (%d) instead of "Success" (0) +# +[mpi_init:startup:pml-add-procs-fail] + +MPI_INIT has failed because at least one MPI process is unreachable +from another. This *usually* means that an underlying communication +plugin -- such as a BTL or an MTL -- has either not loaded or not +allowed itself to be used. Your MPI job will now abort. + +You may wish to try to narrow down the problem: + + * Check the output of ompi_info to see which BTL/MTL plugins are + available. + * Run your application with MPI_THREAD_SINGLE. + * Set the MCA parameter btl_base_verbose to 100 (or mtl_base_verbose, + if using MTL-based communications) to see exactly which + communication plugins were considered and/or discarded. +# [mpi-param-check-enabled-but-compiled-out] WARNING: The MCA parameter mpi_param_check has been set to true, but parameter checking has been compiled out of Open MPI. The diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 86cf293cc6..123e97d193 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2006-2009 University of Houston. All rights reserved. @@ -723,7 +723,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) } ret = MCA_PML_CALL(add_procs(procs, nprocs)); free(procs); - if( OMPI_SUCCESS != ret ) { + /* If we got "unreachable", then print a specific error message. + Otherwise, if we got some other failure, fall through to print + a generic message. */ + if (OMPI_ERR_UNREACH == ret) { + orte_show_help("help-mpi-runtime", + "mpi_init:startup:pml-add-procs-fail", true); + error = NULL; + goto error; + } else if (OMPI_SUCCESS != ret) { error = "PML add procs failed"; goto error; } @@ -892,16 +900,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) error: if (ret != OMPI_SUCCESS) { - const char *err_msg = opal_strerror(ret); - /* If ORTE was not setup yet, don't use orte_show_help */ - if (orte_setup) { - orte_show_help("help-mpi-runtime", - "mpi_init:startup:internal-failure", true, - "MPI_INIT", "MPI_INIT", error, err_msg, ret); - } else { - opal_show_help("help-mpi-runtime", - "mpi_init:startup:internal-failure", true, - "MPI_INIT", "MPI_INIT", error, err_msg, ret); + /* Only print a message if one was not already printed */ + if (NULL != error) { + const char *err_msg = opal_strerror(ret); + /* If ORTE was not setup yet, don't use orte_show_help */ + if (orte_setup) { + orte_show_help("help-mpi-runtime", + "mpi_init:startup:internal-failure", true, + "MPI_INIT", "MPI_INIT", error, err_msg, ret); + } else { + opal_show_help("help-mpi-runtime", + "mpi_init:startup:internal-failure", true, + "MPI_INIT", "MPI_INIT", error, err_msg, ret); + } } return ret; }