1
1

When we abort during MPI_Init, we currently emit a totally incorrect error message stating that we were unable to aggregate error messages and cannot guarantee that all other processes were killed. This simply isn't true if the RTE has already been initialized.

So track whether the RTE has reached that point, and only emit that message when it is accurate.

Note that we still generate a TON of output for a minor error:

Ralphs-iMac:examples rhc$ mpirun -n 3 -mca btl sm ./hello_c
--------------------------------------------------------------------------
At least one pair of MPI processes are unable to reach each other for
MPI communications.  This means that no Open MPI device has indicated
that it can be used to communicate between these processes.  This is
an error; Open MPI requires that all MPI processes be able to reach
each other.  This error can sometimes be the result of forgetting to
specify the "self" BTL.

  Process 1 ([[50239,1],2]) is on host: Ralphs-iMac
  Process 2 ([[50239,1],2]) is on host: Ralphs-iMac
  BTLs attempted: sm

Your MPI job is now going to abort; sorry.
--------------------------------------------------------------------------
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
***    and potentially your MPI job)
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
***    and potentially your MPI job)
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
***    and potentially your MPI job)
--------------------------------------------------------------------------
MPI_INIT has failed because at least one MPI process is unreachable
from another.  This *usually* means that an underlying communication
plugin -- such as a BTL or an MTL -- has either not loaded or not
allowed itself to be used.  Your MPI job will now abort.

You may wish to try to narrow down the problem;

 * Check the output of ompi_info to see which BTL/MTL plugins are
   available.
 * Run your application with MPI_THREAD_SINGLE.
 * Set the MCA parameter btl_base_verbose to 100 (or mtl_base_verbose,
   if using MTL-based communications) to see exactly which
   communication plugins were considered and/or discarded.
--------------------------------------------------------------------------
-------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code.. Per user-direction, the job has been aborted.
-------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[50239,1],2]
  Exit code:    1
--------------------------------------------------------------------------
[Ralphs-iMac.local:23227] 2 more processes have sent help message help-mca-bml-r2.txt / unreachable proc
[Ralphs-iMac.local:23227] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
[Ralphs-iMac.local:23227] 2 more processes have sent help message help-mpi-runtime / mpi_init:startup:pml-add-procs-fail
Ralphs-iMac:examples rhc$ 

Hopefully, we can agree on a way to reduce this verbiage!

This commit was SVN r31686.

The following SVN revision numbers were found above:
  r2 --> open-mpi/ompi@58fdc18855
Этот коммит содержится в:
Ralph Castain 2014-05-08 15:48:16 +00:00
родитель aaae4841e9
Коммит ab4f8585b0
4 изменённых файлов: 10 добавлений и 13 удалений

Просмотреть файл

@ -14,6 +14,7 @@
* reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2009 University of Houston. All rights reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -52,6 +53,8 @@ OMPI_DECLSPEC extern bool ompi_mpi_init_started;
OMPI_DECLSPEC extern bool ompi_mpi_initialized;
/** Has mpi been finalized? */
OMPI_DECLSPEC extern bool ompi_mpi_finalized;
/** Has the RTE been initialized? */
OMPI_DECLSPEC extern bool ompi_rte_initialized;
/** Do we have multiple threads? */
OMPI_DECLSPEC extern bool ompi_mpi_thread_multiple;

Просмотреть файл

@ -129,7 +129,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
initialized period of time, so no need to check that here.
Sorry, Charlie... */
if (!ompi_mpi_initialized || ompi_mpi_finalized) {
if ((!ompi_mpi_initialized || ompi_mpi_finalized) && !ompi_rte_initialized) {
fprintf(stderr, "[%s:%d] Local abort %s completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
host, (int) pid, ompi_mpi_finalized ?
"after MPI_FINALIZE" : "before MPI_INIT");

Просмотреть файл

@ -422,6 +422,7 @@ int ompi_mpi_finalize(void)
if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) {
return ret;
}
ompi_rte_initialized = false;
/* now close the rte framework */
if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rte_base_framework) ) ) {

Просмотреть файл

@ -123,6 +123,7 @@ const char ompi_version_string[] = OMPI_IDENT_STRING;
bool ompi_mpi_init_started = false;
bool ompi_mpi_initialized = false;
bool ompi_mpi_finalized = false;
bool ompi_rte_initialized = false;
bool ompi_mpi_thread_multiple = false;
int ompi_mpi_thread_requested = MPI_THREAD_SINGLE;
@ -380,7 +381,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
size_t nprocs;
char *error = NULL;
struct timeval ompistart, ompistop;
bool rte_setup = false;
ompi_rte_collective_t *coll;
char *cmd=NULL, *av=NULL;
@ -470,7 +470,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
error = "ompi_mpi_init: ompi_rte_init failed";
goto error;
}
rte_setup = true;
ompi_rte_initialized = true;
/* check for timing request - get stop time and report elapsed time if so */
if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) {
@ -955,16 +955,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
/* Only print a message if one was not already printed */
if (NULL != error) {
const char *err_msg = opal_strerror(ret);
/* If RTE was not setup yet, don't use opal_show_help */
if (rte_setup) {
opal_show_help("help-mpi-runtime",
"mpi_init:startup:internal-failure", true,
"MPI_INIT", "MPI_INIT", error, err_msg, ret);
} else {
opal_show_help("help-mpi-runtime",
"mpi_init:startup:internal-failure", true,
"MPI_INIT", "MPI_INIT", error, err_msg, ret);
}
opal_show_help("help-mpi-runtime",
"mpi_init:startup:internal-failure", true,
"MPI_INIT", "MPI_INIT", error, err_msg, ret);
}
return ret;
}