diff --git a/ompi/errhandler/Makefile.am b/ompi/errhandler/Makefile.am index db155d3ece..11de1c9cc3 100644 --- a/ompi/errhandler/Makefile.am +++ b/ompi/errhandler/Makefile.am @@ -10,6 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. +# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -19,6 +20,8 @@ # This makefile.am does not stand on its own - it is included from ompi/Makefile.am +dist_pkgdata_DATA += errhandler/help-mpi-errors.txt + headers += \ errhandler/errcode.h \ errhandler/errcode-internal.h \ diff --git a/ompi/errhandler/errhandler_predefined.c b/ompi/errhandler/errhandler_predefined.c index 1a7a8d6de0..f57084876c 100644 --- a/ompi/errhandler/errhandler_predefined.c +++ b/ompi/errhandler/errhandler_predefined.c @@ -21,8 +21,15 @@ #include "ompi_config.h" #include #include +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif #include "orte/util/show_help.h" +#include "orte/runtime/orte_globals.h" #include "ompi/errhandler/errhandler.h" #include "ompi/errhandler/errhandler_predefined.h" #include "ompi/errhandler/errcode.h" @@ -153,9 +160,66 @@ static void out(char *str, char *arg) } } -static void backend_fatal(char *type, struct ompi_communicator_t *comm, - char *name, int *error_code, - va_list arglist) +/* + * Use orte_show_help() to aggregate the error messages (i.e., show it + * once rather than N times). 
+ *
+ * A "[hostname:pid]" prefix, the name of the failing MPI function
+ * (first variadic argument, may be NULL), the handle type and name,
+ * and the error string are substituted into the
+ * "mpi_errors_are_fatal" or "mpi_errors_are_fatal unknown handle"
+ * topic of help-mpi-errors.txt.  Nothing is printed when name is
+ * non-NULL but ompi_mpi_initialized is false or ompi_mpi_finalized is
+ * true.  The comm argument is not used here.
+ */
+static void backend_fatal_aggregate(char *type,
+                                    struct ompi_communicator_t *comm,
+                                    char *name, int *error_code,
+                                    va_list arglist)
+{
+    char *arg, *prefix = NULL, *err_msg = "Unknown error";
+    bool prefix_need_free = false;
+    bool err_msg_need_free = false;
+    char hostname[MAXHOSTNAMELEN + 1];
+    pid_t pid;
+
+    arg = va_arg(arglist, char*);
+    va_end(arglist);
+
+    /* If gethostname() fails the buffer contents cannot be relied
+       on, so fall back to an empty host name */
+    if (0 != gethostname(hostname, sizeof(hostname) - 1)) {
+        hostname[0] = '\0';
+    }
+    /* A truncated host name is not guaranteed to be NUL-terminated */
+    hostname[MAXHOSTNAMELEN] = '\0';
+    pid = getpid();
+
+    /* asprintf() leaves its output pointer undefined on failure; use
+       an empty prefix in that case instead of reading the pointer */
+    if (asprintf(&prefix, "[%s:%d]", hostname, (int) pid) < 0) {
+        prefix = "";
+    } else {
+        prefix_need_free = true;
+    }
+
+    if (NULL != error_code) {
+        err_msg = ompi_mpi_errnum_get_string(*error_code);
+        if (NULL == err_msg) {
+            /* Only mark the string for free() if it was really
+               allocated */
+            if (asprintf(&err_msg,
+                         "Error code: %d (no associated error message)",
+                         *error_code) < 0) {
+                err_msg = "Unknown error";
+            } else {
+                err_msg_need_free = true;
+            }
+        }
+    }
+
+    /* The argument order below must match the %s order of the
+       corresponding topic in help-mpi-errors.txt */
+    if (NULL != name && ompi_mpi_initialized && !ompi_mpi_finalized) {
+        orte_show_help("help-mpi-errors.txt",
+                       "mpi_errors_are_fatal", false,
+                       prefix, (NULL == arg) ? "" : "in",
+                       (NULL == arg) ? "" : arg,
+                       prefix, type, name, prefix, err_msg, prefix);
+    } else if (NULL == name) {
+        orte_show_help("help-mpi-errors.txt",
+                       "mpi_errors_are_fatal unknown handle", false,
+                       prefix, (NULL == arg) ? "" : "in",
+                       (NULL == arg) ? "" : arg,
+                       prefix, type, prefix, err_msg, prefix);
+    }
+
+    if (err_msg_need_free) {
+        free(err_msg);
+    }
+    /* The prefix was heap-allocated by asprintf(); release it */
+    if (prefix_need_free) {
+        free(prefix);
+    }
+}
+
+/*
+ * THESE MESSAGES ARE COORDINATED WITH FIXED STRINGS IN
+ * help-mpi-errors.txt! Do not change these messages without also
+ * changing help-mpi-errors.txt!
+ */ +static void backend_fatal_no_aggregate(char *type, + struct ompi_communicator_t *comm, + char *name, int *error_code, + va_list arglist) { int len; char *arg; @@ -163,12 +227,14 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm, fflush(stdout); fflush(stderr); + arg = va_arg(arglist, char*); if (NULL != arg) { out("*** An error occurred in %s\n", arg); } else { out("*** An error occurred\n", NULL); } + va_end(arglist); if (NULL != name && ompi_mpi_initialized && !ompi_mpi_finalized) { /* Don't use asprintf() here because there may be stack / heap @@ -206,10 +272,29 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm, out("*** Error code: %d (no associated error message)\n", intbuf); } } - out("*** MPI_ERRORS_ARE_FATAL (goodbye)\n", NULL); - va_end(arglist); + out("*** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)\n", NULL); +} - /* Should we do something more intelligent here? */ +static void backend_fatal(char *type, struct ompi_communicator_t *comm, + char *name, int *error_code, + va_list arglist) +{ + /* Do we want help message aggregation? Usually yes, but it uses + malloc(), which may cause further errors if we're exiting due + to a memory problem. So we also have the option to *not* + aggregate (which doesn't use malloc during its call stack, + meaning that there is a better chance that the error message + will actually get printed). Note that we can only do + aggregation after MPI_INIT and before MPI_FINALIZE. */ + if (orte_help_want_aggregate && ompi_mpi_initialized && + !ompi_mpi_finalized) { + backend_fatal_aggregate(type, comm, name, error_code, arglist); + } else { + backend_fatal_no_aggregate(type, comm, name, error_code, arglist); + } + + /* Should we do something more intelligent than just using + COMM_SELF? 
*/ if (comm == NULL) { comm = &ompi_mpi_comm_self; } diff --git a/ompi/errhandler/help-mpi-errors.txt b/ompi/errhandler/help-mpi-errors.txt new file mode 100644 index 0000000000..34de97c82e --- /dev/null +++ b/ompi/errhandler/help-mpi-errors.txt @@ -0,0 +1,35 @@ +# -*- text -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# *** THESE MESSAGES ARE COORDINATED WITH FIXED STRINGS IN +# *** errhandler_predefined.c! Do not change these messages without also +# *** changing errhandler_predefined.c! +# +[mpi_errors_are_fatal] +%s *** An error occurred %s %s +%s *** on %s %s +%s *** %s +%s *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort) +# +[mpi_errors_are_fatal unknown handle] +%s *** An error occurred %s %s +%s *** on a NULL %s +%s *** %s +%s *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort) +#