Fix a few issues with error messages:
* If something goes wrong during ompi_mpi_init, don't erroneously report that it is illegal to invoke MPI_INIT* before MPI_INIT
* Aggregate help messages when possible when something goes wrong during ompi_mpi_init

This commit was SVN r24492.
Этот коммит содержится в:
родитель
63f38e38bb
Коммит
79cf382ff3
@ -10,7 +10,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006 University of Houston. All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -230,7 +230,7 @@ static void backend_fatal_no_aggregate(char *type,
|
||||
|
||||
/* Per #2152, print out in plain english if something was invoked
|
||||
before MPI_INIT* or after MPI_FINALIZE */
|
||||
if (!ompi_mpi_initialized) {
|
||||
if (!ompi_mpi_init_started && !ompi_mpi_initialized) {
|
||||
if (NULL != arg) {
|
||||
out("*** The %s() function was called before MPI_INIT was invoked.\n"
|
||||
"*** This is disallowed by the MPI standard.\n", arg);
|
||||
@ -302,7 +302,7 @@ static void backend_fatal_no_aggregate(char *type,
|
||||
out("*** Error code: %d (no associated error message)\n", intbuf);
|
||||
}
|
||||
}
|
||||
out("*** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)\n", NULL);
|
||||
out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL);
|
||||
}
|
||||
va_end(arglist);
|
||||
}
|
||||
@ -318,8 +318,7 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
|
||||
meaning that there is a better chance that the error message
|
||||
will actually get printed). Note that we can only do
|
||||
aggregation after MPI_INIT and before MPI_FINALIZE. */
|
||||
if (orte_help_want_aggregate && ompi_mpi_initialized &&
|
||||
!ompi_mpi_finalized) {
|
||||
if (orte_help_want_aggregate && orte_show_help_is_available()) {
|
||||
backend_fatal_aggregate(type, comm, name, error_code, arglist);
|
||||
} else {
|
||||
backend_fatal_no_aggregate(type, comm, name, error_code, arglist);
|
||||
|
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -25,11 +25,11 @@
|
||||
%s *** An error occurred %s %s
|
||||
%s *** on %s %s
|
||||
%s *** %s
|
||||
%s *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)
|
||||
%s *** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
|
||||
#
|
||||
[mpi_errors_are_fatal unknown handle]
|
||||
%s *** An error occurred %s %s
|
||||
%s *** on a NULL %s
|
||||
%s *** %s
|
||||
%s *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)
|
||||
%s *** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
|
||||
#
|
||||
|
@ -10,7 +10,7 @@
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2007-2011 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -31,7 +31,6 @@ developer):
|
||||
--> Returned "%s" (%d) instead of "Success" (0)
|
||||
#
|
||||
[mpi_init:startup:pml-add-procs-fail]
|
||||
|
||||
MPI_INIT has failed because at least one MPI process is unreachable
|
||||
from another. This *usually* means that an underlying communication
|
||||
plugin -- such as a BTL or an MTL -- has either not loaded or not
|
||||
@ -64,12 +63,14 @@ Typical causes for this problem include:
|
||||
which case Open MPI will not bind any processes on that node
|
||||
- A startup mechanism was used which did not tell Open MPI which
|
||||
processors to bind processes to
|
||||
#
|
||||
[mpi_finalize:invoked_multiple_times]
|
||||
The function MPI_FINALIZE was invoked multiple times in a single
|
||||
process on host %s, PID %d.
|
||||
|
||||
This indicates an erroneous MPI program; MPI_FINALIZE is only allowed
|
||||
to be invoked exactly once in a process.
|
||||
#
|
||||
[proc:heterogeneous-support-unavailable]
|
||||
The build of Open MPI running on host %s was not
|
||||
compiled with heterogeneous support. A process running on host
|
||||
@ -109,3 +110,12 @@ The process that invoked fork was:
|
||||
If you are *absolutely sure* that your application will successfully
|
||||
and correctly survive a call to fork(), you may disable this warning
|
||||
by setting the mpi_warn_on_fork MCA parameter to 0.
|
||||
#
|
||||
[ompi mpi abort:cannot guarantee all killed]
|
||||
An MPI process is aborting at a time when it cannot guarantee that all
|
||||
of its peer processes in the job will be killed properly. You should
|
||||
double check that everything has shut down cleanly.
|
||||
|
||||
Reason: %s
|
||||
Local host: %s
|
||||
PID: %d
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
@ -44,6 +44,8 @@ struct opal_thread_t;
|
||||
|
||||
/* Global variables and symbols for the MPI layer */
|
||||
|
||||
/** Did mpi start to initialize? */
|
||||
OMPI_DECLSPEC extern bool ompi_mpi_init_started;
|
||||
/** Is mpi initialized? */
|
||||
OMPI_DECLSPEC extern bool ompi_mpi_initialized;
|
||||
/** Has mpi been finalized? */
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -37,6 +37,7 @@
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "ompi/communicator/communicator.h"
|
||||
#include "ompi/runtime/mpiruntime.h"
|
||||
@ -74,8 +75,8 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
}
|
||||
pid = getpid();
|
||||
|
||||
/* Should we print a stack trace? */
|
||||
|
||||
/* Should we print a stack trace? Not aggregated because they
|
||||
might be different on all processes. */
|
||||
if (ompi_mpi_abort_print_stack) {
|
||||
char **messages;
|
||||
int len, i;
|
||||
@ -134,9 +135,21 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
|
||||
communicators are not setup yet). Sorry, Charlie... */
|
||||
|
||||
if (!orte_initialized || !ompi_mpi_initialized || ompi_mpi_finalized) {
|
||||
fprintf(stderr, "[%s:%d] Abort %s completed successfully; not able to guarantee that all other processes were killed!\n",
|
||||
host, (int) pid, ompi_mpi_finalized ?
|
||||
"after MPI_FINALIZE" : "before MPI_INIT");
|
||||
if (orte_show_help_is_available()) {
|
||||
orte_show_help("help-mpi-runtime.txt",
|
||||
"ompi mpi abort:cannot guarantee all killed",
|
||||
true,
|
||||
(ompi_mpi_finalized ?
|
||||
"After MPI_FINALIZE was invoked" :
|
||||
(ompi_mpi_init_started ?
|
||||
"Before MPI_INIT completed" :
|
||||
"Before MPI_INIT was invoked")),
|
||||
host, (int) pid);
|
||||
} else {
|
||||
fprintf(stderr, "[%s:%d] Local abort %s completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
|
||||
host, (int) pid, ompi_mpi_finalized ?
|
||||
"after MPI_FINALIZE" : "before MPI_INIT");
|
||||
}
|
||||
exit(errcode);
|
||||
}
|
||||
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2009 University of Houston. All rights reserved.
|
||||
@ -114,6 +114,7 @@ const char ompi_version_string[] = OMPI_IDENT_STRING;
|
||||
* Global variables and symbols for the MPI layer
|
||||
*/
|
||||
|
||||
bool ompi_mpi_init_started = false;
|
||||
bool ompi_mpi_initialized = false;
|
||||
bool ompi_mpi_finalized = false;
|
||||
|
||||
@ -297,6 +298,11 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
* for the modex in order to work in heterogeneous environments. */
|
||||
uint8_t threadlevel_bf;
|
||||
|
||||
/* Indicate that we have *started* MPI_INIT*. MPI_FINALIZE has
|
||||
something sorta similar in a static local variable in
|
||||
ompi_mpi_finalize(). */
|
||||
ompi_mpi_init_started = true;
|
||||
|
||||
/* Setup enough to check get/set MCA params */
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
|
||||
@ -764,7 +770,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
|
||||
MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));
|
||||
|
||||
|
||||
/*
|
||||
* Dump all MCA parameters if requested
|
||||
*/
|
||||
@ -927,8 +932,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
error = "ompi_mpiext_init";
|
||||
goto error;
|
||||
}
|
||||
|
||||
|
||||
/* Fall through */
|
||||
error:
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
/* Only print a message if one was not already printed */
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -40,7 +40,24 @@
|
||||
|
||||
#include "orte/util/show_help.h"
|
||||
|
||||
bool orte_help_want_aggregate;
|
||||
bool orte_help_want_aggregate = false;
|
||||
|
||||
/*
|
||||
* Local variable to know whether aggregated show_help is available or
|
||||
* not
|
||||
*/
|
||||
static bool ready = false;
|
||||
|
||||
/*
|
||||
* Same for systems with or without full ORTE support
|
||||
*/
|
||||
bool orte_show_help_is_available(void)
|
||||
{
|
||||
/* This is a function only to give us forward flexibility in case
|
||||
we need a more complicated check someday. */
|
||||
|
||||
return ready;
|
||||
}
|
||||
|
||||
/************************************************************************/
|
||||
|
||||
@ -50,11 +67,13 @@ bool orte_help_want_aggregate;
|
||||
|
||||
int orte_show_help_init(void)
|
||||
{
|
||||
ready = true;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
void orte_show_help_finalize(void)
|
||||
{
|
||||
ready = false;
|
||||
return;
|
||||
}
|
||||
|
||||
@ -128,7 +147,6 @@ static struct timeval show_help_interval = { 5, 0 };
|
||||
static time_t show_help_time_last_displayed = 0;
|
||||
static bool show_help_timer_set = false;
|
||||
static opal_event_t show_help_timer_event;
|
||||
static bool ready;
|
||||
|
||||
static opal_show_help_fn_t save_help = NULL;
|
||||
|
||||
@ -582,12 +600,12 @@ int orte_show_help_init(void)
|
||||
if (ready) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
ready = true;
|
||||
|
||||
OBJ_CONSTRUCT(&abd_tuples, opal_list_t);
|
||||
|
||||
save_help = opal_show_help;
|
||||
opal_show_help = orte_show_help;
|
||||
ready = true;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -53,6 +53,14 @@ BEGIN_C_DECLS
|
||||
*/
|
||||
ORTE_DECLSPEC int orte_show_help_init(void);
|
||||
|
||||
/**
|
||||
* Allow other parts of the code base to know if the ORTE show_help
|
||||
* system is available or not (does not necessarily indicate that
|
||||
* aggregating is available; on no-ORTE systems, ORTE show_help is
|
||||
* available, but aggregating is not).
|
||||
*/
|
||||
ORTE_DECLSPEC bool orte_show_help_is_available(void);
|
||||
|
||||
/**
|
||||
* Shut down the output stream system.
|
||||
*
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user