1
1

Fix a few issues with error messages:

* If something goes wrong during ompi_mpi_init, don't erroneously
   report that it is illegal to invoke MPI_INIT* before MPI_INIT
 * Aggregate help messages when possible when something goes wrong
   during ompi_mpi_init

This commit was SVN r24492.
Этот коммит содержится в:
Jeff Squyres 2011-03-07 16:45:45 +00:00
родитель 63f38e38bb
Коммит 79cf382ff3
8 изменённых файлов: 80 добавлений и 25 удалений

Просмотреть файл

@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 University of Houston. All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
@ -230,7 +230,7 @@ static void backend_fatal_no_aggregate(char *type,
/* Per #2152, print out in plain english if something was invoked
before MPI_INIT* or after MPI_FINALIZE */
if (!ompi_mpi_initialized) {
if (!ompi_mpi_init_started && !ompi_mpi_initialized) {
if (NULL != arg) {
out("*** The %s() function was called before MPI_INIT was invoked.\n"
"*** This is disallowed by the MPI standard.\n", arg);
@ -302,7 +302,7 @@ static void backend_fatal_no_aggregate(char *type,
out("*** Error code: %d (no associated error message)\n", intbuf);
}
}
out("*** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)\n", NULL);
out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL);
}
va_end(arglist);
}
@ -318,8 +318,7 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
meaning that there is a better chance that the error message
will actually get printed). Note that we can only do
aggregation after MPI_INIT and before MPI_FINALIZE. */
if (orte_help_want_aggregate && ompi_mpi_initialized &&
!ompi_mpi_finalized) {
if (orte_help_want_aggregate && orte_show_help_is_available()) {
backend_fatal_aggregate(type, comm, name, error_code, arglist);
} else {
backend_fatal_no_aggregate(type, comm, name, error_code, arglist);

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -25,11 +25,11 @@
%s *** An error occurred %s %s
%s *** on %s %s
%s *** %s
%s *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)
%s *** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
#
[mpi_errors_are_fatal unknown handle]
%s *** An error occurred %s %s
%s *** on a NULL %s
%s *** %s
%s *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)
%s *** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
#

Просмотреть файл

@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -31,7 +31,6 @@ developer):
--> Returned "%s" (%d) instead of "Success" (0)
#
[mpi_init:startup:pml-add-procs-fail]
MPI_INIT has failed because at least one MPI process is unreachable
from another. This *usually* means that an underlying communication
plugin -- such as a BTL or an MTL -- has either not loaded or not
@ -64,12 +63,14 @@ Typical causes for this problem include:
which case Open MPI will not bind any processes on that node
- A startup mechanism was used which did not tell Open MPI which
processors to bind processes to
#
[mpi_finalize:invoked_multiple_times]
The function MPI_FINALIZE was invoked multiple times in a single
process on host %s, PID %d.
This indicates an erroneous MPI program; MPI_FINALIZE is only allowed
to be invoked exactly once in a process.
#
[proc:heterogeneous-support-unavailable]
The build of Open MPI running on host %s was not
compiled with heterogeneous support. A process running on host
@ -109,3 +110,12 @@ The process that invoked fork was:
If you are *absolutely sure* that your application will successfully
and correctly survive a call to fork(), you may disable this warning
by setting the mpi_warn_on_fork MCA parameter to 0.
#
[ompi mpi abort:cannot guarantee all killed]
An MPI process is aborting at a time when it cannot guarantee that all
of its peer processes in the job will be killed properly. You should
double check that everything has shut down cleanly.
Reason: %s
Local host: %s
PID: %d

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
@ -44,6 +44,8 @@ struct opal_thread_t;
/* Global variables and symbols for the MPI layer */
/** Did mpi start to initialize? */
OMPI_DECLSPEC extern bool ompi_mpi_init_started;
/** Is mpi initialized? */
OMPI_DECLSPEC extern bool ompi_mpi_initialized;
/** Has mpi been finalized? */

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -37,6 +37,7 @@
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "ompi/communicator/communicator.h"
#include "ompi/runtime/mpiruntime.h"
@ -74,8 +75,8 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
}
pid = getpid();
/* Should we print a stack trace? */
/* Should we print a stack trace? Not aggregated because they
might be different on all processes. */
if (ompi_mpi_abort_print_stack) {
char **messages;
int len, i;
@ -134,9 +135,21 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
communicators are not setup yet). Sorry, Charlie... */
if (!orte_initialized || !ompi_mpi_initialized || ompi_mpi_finalized) {
fprintf(stderr, "[%s:%d] Abort %s completed successfully; not able to guarantee that all other processes were killed!\n",
host, (int) pid, ompi_mpi_finalized ?
"after MPI_FINALIZE" : "before MPI_INIT");
if (orte_show_help_is_available()) {
orte_show_help("help-mpi-runtime.txt",
"ompi mpi abort:cannot guarantee all killed",
true,
(ompi_mpi_finalized ?
"After MPI_FINALIZE was invoked" :
(ompi_mpi_init_started ?
"Before MPI_INIT completed" :
"Before MPI_INIT was invoked")),
host, (int) pid);
} else {
fprintf(stderr, "[%s:%d] Local abort %s completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
host, (int) pid, ompi_mpi_finalized ?
"after MPI_FINALIZE" : "before MPI_INIT");
}
exit(errcode);
}

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2009 University of Houston. All rights reserved.
@ -114,6 +114,7 @@ const char ompi_version_string[] = OMPI_IDENT_STRING;
* Global variables and symbols for the MPI layer
*/
bool ompi_mpi_init_started = false;
bool ompi_mpi_initialized = false;
bool ompi_mpi_finalized = false;
@ -297,6 +298,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
* for the modex in order to work in heterogeneous environments. */
uint8_t threadlevel_bf;
/* Indicate that we have *started* MPI_INIT*. MPI_FINALIZE has
something sorta similar in a static local variable in
ompi_mpi_finalize(). */
ompi_mpi_init_started = true;
/* Setup enough to check get/set MCA params */
if (ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
@ -764,7 +770,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));
/*
* Dump all MCA parameters if requested
*/
@ -927,8 +932,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
error = "ompi_mpiext_init";
goto error;
}
/* Fall through */
error:
if (ret != OMPI_SUCCESS) {
/* Only print a message if one was not already printed */

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -40,7 +40,24 @@
#include "orte/util/show_help.h"
bool orte_help_want_aggregate;
bool orte_help_want_aggregate = false;
/*
* Local variable to know whether aggregated show_help is available or
* not
*/
static bool ready = false;
/*
* Same for systems with or without full ORTE support
*/
bool orte_show_help_is_available(void)
{
    /* Kept behind a function instead of exposing the flag directly, so
       that a more elaborate availability check can be dropped in later
       without touching any caller. */
    const bool available = ready;

    return available;
}
/************************************************************************/
@ -50,11 +67,13 @@ bool orte_help_want_aggregate;
int orte_show_help_init(void)
{
    /* This variant performs no other setup: it only records that the
       show_help system may be used from now on, which is what
       orte_show_help_is_available() reports. */
    ready = true;

    return ORTE_SUCCESS;
}
void orte_show_help_finalize(void)
{
    /* Mark the show_help system as no longer usable; there is nothing
       else to tear down in this variant. */
    ready = false;
}
@ -128,7 +147,6 @@ static struct timeval show_help_interval = { 5, 0 };
static time_t show_help_time_last_displayed = 0;
static bool show_help_timer_set = false;
static opal_event_t show_help_timer_event;
static bool ready;
static opal_show_help_fn_t save_help = NULL;
@ -582,12 +600,12 @@ int orte_show_help_init(void)
if (ready) {
return ORTE_SUCCESS;
}
ready = true;
OBJ_CONSTRUCT(&abd_tuples, opal_list_t);
save_help = opal_show_help;
opal_show_help = orte_show_help;
ready = true;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,6 +53,14 @@ BEGIN_C_DECLS
*/
ORTE_DECLSPEC int orte_show_help_init(void);
/**
* Allow other parts of the code base to know if the ORTE show_help
* system is available or not (does not necessarily indicate that
* aggregating is available; on no-ORTE systems, ORTE show_help is
* available, but aggregating is not).
*/
ORTE_DECLSPEC bool orte_show_help_is_available(void);
/**
* Shut down the output stream system.
*