2004-03-19 09:12:43 +03:00
|
|
|
/*
|
2005-11-05 22:57:48 +03:00
|
|
|
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
|
|
|
* University Research and Technology
|
|
|
|
* Corporation. All rights reserved.
|
2014-07-03 04:34:44 +04:00
|
|
|
* Copyright (c) 2004-2014 The University of Tennessee and The University
|
2005-11-05 22:57:48 +03:00
|
|
|
* of Tennessee Research Foundation. All rights
|
|
|
|
* reserved.
|
2004-11-28 23:09:25 +03:00
|
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
|
|
* University of Stuttgart. All rights reserved.
|
2005-03-24 15:43:37 +03:00
|
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
|
|
* All rights reserved.
|
2006-12-05 22:07:02 +03:00
|
|
|
* Copyright (c) 2006 University of Houston. All rights reserved.
|
2013-01-28 03:25:10 +04:00
|
|
|
* Copyright (c) 2008-2013 Cisco Systems, Inc. All rights reserved.
|
2009-02-24 20:17:33 +03:00
|
|
|
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
|
2012-06-27 05:28:28 +04:00
|
|
|
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
2013-01-28 03:25:10 +04:00
|
|
|
* Copyright (c) 2012 Los Alamos National Security, LLC.
|
|
|
|
* All rights reserved.
|
2004-11-22 04:38:40 +03:00
|
|
|
* $COPYRIGHT$
|
|
|
|
*
|
|
|
|
* Additional copyrights may follow
|
|
|
|
*
|
2004-03-19 09:12:43 +03:00
|
|
|
* $HEADER$
|
|
|
|
*/
|
|
|
|
|
2004-10-20 05:03:09 +04:00
|
|
|
#include "ompi_config.h"
|
2004-03-19 09:12:43 +03:00
|
|
|
#include <stdlib.h>
|
|
|
|
#include <stdarg.h>
|
2008-09-23 21:19:24 +04:00
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_SYS_PARAM_H
|
|
|
|
#include <sys/param.h>
|
|
|
|
#endif
|
2004-03-19 09:12:43 +03:00
|
|
|
|
2013-02-13 01:10:11 +04:00
|
|
|
#include "opal/util/show_help.h"
|
2013-01-28 03:25:10 +04:00
|
|
|
#include "ompi/mca/rte/rte.h"
|
2006-02-12 04:33:29 +03:00
|
|
|
#include "ompi/errhandler/errhandler_predefined.h"
|
|
|
|
#include "ompi/errhandler/errcode.h"
|
|
|
|
#include "ompi/communicator/communicator.h"
|
|
|
|
#include "ompi/file/file.h"
|
|
|
|
#include "ompi/win/win.h"
|
2005-07-04 06:16:57 +04:00
|
|
|
#include "opal/util/printf.h"
|
2009-02-14 05:26:12 +03:00
|
|
|
#include "opal/util/output.h"
|
2004-03-19 09:12:43 +03:00
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
/*
|
|
|
|
* Local functions
|
|
|
|
*/
|
2004-11-01 19:05:31 +03:00
|
|
|
/* Shared back end used by every errors-are-fatal handler in this file. */
static void backend_fatal(char *type,
                          struct ompi_communicator_t *comm,
                          char *name,
                          int *error_code,
                          va_list arglist);

/* Prints one message; "arg" is NULL when "str" needs no substitution. */
static void out(char *str, char *arg);
|
|
|
|
|
|
|
|
|
2004-09-06 16:06:27 +04:00
|
|
|
void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
|
|
|
|
int *error_code, ...)
|
2004-03-19 09:12:43 +03:00
|
|
|
{
|
2004-09-21 03:01:40 +04:00
|
|
|
char *name;
|
2004-11-01 19:05:31 +03:00
|
|
|
struct ompi_communicator_t *abort_comm;
|
2004-03-19 09:12:43 +03:00
|
|
|
va_list arglist;
|
2004-06-23 00:21:35 +04:00
|
|
|
|
2004-03-19 09:12:43 +03:00
|
|
|
va_start(arglist, error_code);
|
2004-06-18 19:47:17 +04:00
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
if (NULL != comm) {
|
|
|
|
name = (*comm)->c_name;
|
2004-11-01 19:05:31 +03:00
|
|
|
abort_comm = *comm;
|
2004-06-23 00:21:35 +04:00
|
|
|
} else {
|
2004-09-21 03:01:40 +04:00
|
|
|
name = NULL;
|
2004-11-01 19:05:31 +03:00
|
|
|
abort_comm = NULL;
|
2004-06-23 00:21:35 +04:00
|
|
|
}
|
2004-11-01 19:05:31 +03:00
|
|
|
backend_fatal("communicator", abort_comm, name, error_code, arglist);
|
2007-07-30 20:08:18 +04:00
|
|
|
va_end(arglist);
|
2004-03-19 09:12:43 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-09-06 16:06:27 +04:00
|
|
|
void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
|
|
|
|
int *error_code, ...)
|
|
|
|
{
|
2004-09-21 03:01:40 +04:00
|
|
|
char *name;
|
2004-11-01 19:05:31 +03:00
|
|
|
struct ompi_communicator_t *abort_comm;
|
2004-09-06 16:06:27 +04:00
|
|
|
va_list arglist;
|
|
|
|
|
|
|
|
va_start(arglist, error_code);
|
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
if (NULL != file) {
|
|
|
|
name = (*file)->f_filename;
|
2004-11-01 19:05:31 +03:00
|
|
|
abort_comm = (*file)->f_comm;
|
2004-09-06 16:06:27 +04:00
|
|
|
} else {
|
2004-09-21 03:01:40 +04:00
|
|
|
name = NULL;
|
2004-11-01 19:05:31 +03:00
|
|
|
abort_comm = NULL;
|
2004-09-06 16:06:27 +04:00
|
|
|
}
|
2004-11-01 19:05:31 +03:00
|
|
|
backend_fatal("file", abort_comm, name, error_code, arglist);
|
2007-07-30 20:08:18 +04:00
|
|
|
va_end(arglist);
|
2004-09-06 16:06:27 +04:00
|
|
|
}
|
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
|
2004-09-06 16:06:27 +04:00
|
|
|
void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
|
|
|
|
int *error_code, ...)
|
|
|
|
{
|
2004-09-21 03:01:40 +04:00
|
|
|
char *name;
|
2004-11-01 19:05:31 +03:00
|
|
|
struct ompi_communicator_t *abort_comm = NULL;
|
2004-09-06 16:06:27 +04:00
|
|
|
va_list arglist;
|
|
|
|
|
|
|
|
va_start(arglist, error_code);
|
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
if (NULL != win) {
|
|
|
|
name = (*win)->w_name;
|
2004-09-06 16:06:27 +04:00
|
|
|
} else {
|
2004-09-21 03:01:40 +04:00
|
|
|
name = NULL;
|
2004-09-06 16:06:27 +04:00
|
|
|
}
|
2004-11-01 19:05:31 +03:00
|
|
|
backend_fatal("win", abort_comm, name, error_code, arglist);
|
2007-07-30 20:08:18 +04:00
|
|
|
va_end(arglist);
|
2004-09-06 16:06:27 +04:00
|
|
|
}
|
2004-09-21 03:01:40 +04:00
|
|
|
|
2004-09-06 16:06:27 +04:00
|
|
|
/*
 * Communicator handler that takes no action: the function only has to
 * exist.  Neither "comm" nor "error_code" is examined.
 */
void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
                                         int *error_code, ...)
{
    /* The start/end pair below exists purely to keep some compilers
       from warning; the variadic arguments are never read. */
    va_list unused_args;

    va_start(unused_args, error_code);
    va_end(unused_args);
}
|
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
|
2004-09-06 16:06:27 +04:00
|
|
|
/*
 * File handler that takes no action: the function only has to exist.
 * Neither "file" nor "error_code" is examined.
 */
void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
                                         int *error_code, ...)
{
    /* The start/end pair below exists purely to keep some compilers
       from warning; the variadic arguments are never read. */
    va_list unused_args;

    va_start(unused_args, error_code);
    va_end(unused_args);
}
|
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
|
2004-09-06 16:06:27 +04:00
|
|
|
/*
 * Window handler that takes no action: the function only has to exist.
 * Neither "win" nor "error_code" is examined.
 */
void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
                                        int *error_code, ...)
{
    /* The start/end pair below exists purely to keep some compilers
       from warning; the variadic arguments are never read. */
    va_list unused_args;

    va_start(unused_args, error_code);
    va_end(unused_args);
}
|
2004-09-21 03:01:40 +04:00
|
|
|
|
|
|
|
|
|
|
|
static void out(char *str, char *arg)
|
|
|
|
{
|
|
|
|
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
|
|
|
|
if (NULL != arg) {
|
2008-06-09 18:53:58 +04:00
|
|
|
opal_output(0, str, arg);
|
2004-09-21 03:01:40 +04:00
|
|
|
} else {
|
2009-10-23 06:43:13 +04:00
|
|
|
opal_output(0, "%s", str);
|
2004-09-21 03:01:40 +04:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (NULL != arg) {
|
|
|
|
fprintf(stderr, str, arg);
|
|
|
|
} else {
|
2009-10-23 06:43:13 +04:00
|
|
|
fprintf(stderr, "%s", str);
|
2004-09-21 03:01:40 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-09-23 21:19:24 +04:00
|
|
|
/*
|
2013-02-13 01:10:11 +04:00
|
|
|
* Use opal_show_help() to aggregate the error messages (i.e., show it
|
2010-01-07 21:16:39 +03:00
|
|
|
* once rather than N times).
|
|
|
|
*
|
|
|
|
* Note that this function will only be invoked for errors during the
|
|
|
|
* MPI application (i.e., after MPI_INIT and before MPI_FINALIZE). So
|
|
|
|
* there's no need to handle the pre-MPI_INIT and post-MPI_FINALIZE
|
|
|
|
* errors here.
|
2008-09-23 21:19:24 +04:00
|
|
|
*/
|
|
|
|
static void backend_fatal_aggregate(char *type,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
char *name, int *error_code,
|
|
|
|
va_list arglist)
|
|
|
|
{
|
|
|
|
char *arg, *prefix, *err_msg = "Unknown error";
|
|
|
|
bool err_msg_need_free = false;
|
|
|
|
|
2013-01-28 03:25:10 +04:00
|
|
|
assert(ompi_mpi_initialized && !ompi_mpi_finalized);
|
|
|
|
|
2008-09-23 21:19:24 +04:00
|
|
|
arg = va_arg(arglist, char*);
|
|
|
|
va_end(arglist);
|
|
|
|
|
2013-01-28 03:25:10 +04:00
|
|
|
asprintf(&prefix, "[%s:%d]", ompi_process_info.nodename,
|
|
|
|
(int) ompi_process_info.pid);
|
2008-09-23 21:19:24 +04:00
|
|
|
|
|
|
|
if (NULL != error_code) {
|
|
|
|
err_msg = ompi_mpi_errnum_get_string(*error_code);
|
|
|
|
if (NULL == err_msg) {
|
|
|
|
err_msg_need_free = true;
|
|
|
|
asprintf(&err_msg, "Error code: %d (no associated error message)",
|
|
|
|
*error_code);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-01-28 03:25:10 +04:00
|
|
|
if (NULL != name) {
|
2013-02-13 01:10:11 +04:00
|
|
|
opal_show_help("help-mpi-errors.txt",
|
2008-09-23 21:19:24 +04:00
|
|
|
"mpi_errors_are_fatal", false,
|
|
|
|
prefix, (NULL == arg) ? "" : "in",
|
|
|
|
(NULL == arg) ? "" : arg,
|
2013-01-28 03:25:10 +04:00
|
|
|
prefix, OMPI_PROC_MY_NAME->jobid, OMPI_PROC_MY_NAME->vpid,
|
2011-06-15 17:10:13 +04:00
|
|
|
prefix, type, name, prefix, err_msg, prefix, type, prefix);
|
2013-01-28 03:25:10 +04:00
|
|
|
} else {
|
2013-02-13 01:10:11 +04:00
|
|
|
opal_show_help("help-mpi-errors.txt",
|
2008-09-23 21:19:24 +04:00
|
|
|
"mpi_errors_are_fatal unknown handle", false,
|
|
|
|
prefix, (NULL == arg) ? "" : "in",
|
|
|
|
(NULL == arg) ? "" : arg,
|
2013-01-28 03:25:10 +04:00
|
|
|
prefix, OMPI_PROC_MY_NAME->jobid, OMPI_PROC_MY_NAME->vpid,
|
2011-06-15 17:10:13 +04:00
|
|
|
prefix, type, prefix, err_msg, prefix, type, prefix);
|
2008-09-23 21:19:24 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (err_msg_need_free) {
|
|
|
|
free(err_msg);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-01-07 21:16:39 +03:00
|
|
|
* Note that this function has to handle pre-MPI_INIT and
|
|
|
|
* post-MPI_FINALIZE errors, which backend_fatal_aggregate() does not
|
|
|
|
* have to handle.
|
2013-01-28 03:25:10 +04:00
|
|
|
*
|
|
|
|
* This function also intentionally does not call malloc(), just in
|
|
|
|
* case we're being called due to some kind of stack/memory error --
|
|
|
|
* we *might* be able to get a message out if we're not further
|
|
|
|
* corrupting the stack by calling malloc()...
|
2008-09-23 21:19:24 +04:00
|
|
|
*/
|
|
|
|
static void backend_fatal_no_aggregate(char *type,
|
|
|
|
struct ompi_communicator_t *comm,
|
|
|
|
char *name, int *error_code,
|
|
|
|
va_list arglist)
|
2004-09-21 03:01:40 +04:00
|
|
|
{
|
|
|
|
char *arg;
|
|
|
|
|
2013-01-28 03:25:10 +04:00
|
|
|
assert(!ompi_mpi_initialized || ompi_mpi_finalized);
|
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
fflush(stdout);
|
|
|
|
fflush(stderr);
|
2008-09-23 21:19:24 +04:00
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
arg = va_arg(arglist, char*);
|
|
|
|
|
2010-01-07 21:16:39 +03:00
|
|
|
/* Per #2152, print out in plain english if something was invoked
|
|
|
|
before MPI_INIT* or after MPI_FINALIZE */
|
2011-03-07 19:45:45 +03:00
|
|
|
if (!ompi_mpi_init_started && !ompi_mpi_initialized) {
|
2010-01-07 21:16:39 +03:00
|
|
|
if (NULL != arg) {
|
|
|
|
out("*** The %s() function was called before MPI_INIT was invoked.\n"
|
|
|
|
"*** This is disallowed by the MPI standard.\n", arg);
|
|
|
|
} else {
|
|
|
|
out("*** An MPI function was called before MPI_INIT was invoked.\n"
|
|
|
|
"*** This is disallowed by the MPI standard.\n"
|
|
|
|
"*** Unfortunately, no further information is available on *which* MPI\n"
|
|
|
|
"*** function was invoked, sorry. :-(\n", NULL);
|
2008-08-06 16:15:49 +04:00
|
|
|
}
|
2010-01-07 21:16:39 +03:00
|
|
|
out("*** Your MPI job will now abort.\n", NULL);
|
2004-09-21 03:01:40 +04:00
|
|
|
} else if (ompi_mpi_finalized) {
|
2010-01-07 21:16:39 +03:00
|
|
|
if (NULL != arg) {
|
|
|
|
out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
|
|
|
|
"*** This is disallowed by the MPI standard.\n", arg);
|
|
|
|
} else {
|
|
|
|
out("*** An MPI function was called after MPI_FINALIZE was invoked.\n"
|
|
|
|
"*** This is disallowed by the MPI standard.\n"
|
|
|
|
"*** Unfortunately, no further information is available on *which* MPI\n"
|
|
|
|
"*** function was invoked, sorry. :-(\n", NULL);
|
|
|
|
}
|
|
|
|
out("*** Your MPI job will now abort.\n", NULL);
|
2004-09-21 03:01:40 +04:00
|
|
|
}
|
|
|
|
|
2010-01-07 21:16:39 +03:00
|
|
|
else {
|
|
|
|
int len;
|
|
|
|
char str[MPI_MAX_PROCESSOR_NAME * 2];
|
|
|
|
|
|
|
|
/* THESE MESSAGES ARE COORDINATED WITH FIXED STRINGS IN
|
|
|
|
help-mpi-errors.txt! Do not change these messages without
|
|
|
|
also changing help-mpi-errors.txt! */
|
|
|
|
|
|
|
|
/* This is after MPI_INIT* and before MPI_FINALIZE, so print
|
|
|
|
the error message normally */
|
|
|
|
if (NULL != arg) {
|
|
|
|
out("*** An error occurred in %s\n", arg);
|
2004-09-21 03:01:40 +04:00
|
|
|
} else {
|
2010-01-07 21:16:39 +03:00
|
|
|
out("*** An error occurred\n", NULL);
|
2004-09-21 03:01:40 +04:00
|
|
|
}
|
2010-01-07 21:16:39 +03:00
|
|
|
|
|
|
|
if (NULL != name) {
|
|
|
|
/* Don't use asprintf() here because there may be stack /
|
|
|
|
heap corruption by the time we're invoked, so just do
|
|
|
|
it on the stack */
|
|
|
|
str[0] = '\0';
|
|
|
|
len = sizeof(str) - 1;
|
|
|
|
strncat(str, type, len);
|
|
|
|
|
|
|
|
len -= strlen(type);
|
|
|
|
if (len > 0) {
|
|
|
|
strncat(str, " ", len);
|
|
|
|
|
|
|
|
--len;
|
|
|
|
if (len > 0) {
|
|
|
|
strncat(str, name, len);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out("*** on %s", str);
|
|
|
|
} else if (NULL == name) {
|
|
|
|
out("*** on a NULL %s\n", type);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (NULL != error_code) {
|
|
|
|
char *tmp = ompi_mpi_errnum_get_string(*error_code);
|
|
|
|
if (NULL != tmp) {
|
|
|
|
out("*** %s\n", tmp);
|
|
|
|
} else {
|
|
|
|
char intbuf[32];
|
|
|
|
snprintf(intbuf, 32, "%d", *error_code);
|
|
|
|
out("*** Error code: %d (no associated error message)\n", intbuf);
|
|
|
|
}
|
|
|
|
}
|
2011-06-15 17:10:13 +04:00
|
|
|
/* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */
|
|
|
|
out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type);
|
|
|
|
out("*** and potentially your MPI job)\n", NULL);
|
|
|
|
|
2004-09-21 03:01:40 +04:00
|
|
|
}
|
2010-01-07 21:16:39 +03:00
|
|
|
va_end(arglist);
|
2008-09-23 21:19:24 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void backend_fatal(char *type, struct ompi_communicator_t *comm,
|
|
|
|
char *name, int *error_code,
|
|
|
|
va_list arglist)
|
|
|
|
{
|
2013-01-28 03:25:10 +04:00
|
|
|
/* We only want aggregation after MPI_INIT and before
|
|
|
|
MPI_FINALIZE. */
|
|
|
|
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
|
2008-09-23 21:19:24 +04:00
|
|
|
backend_fatal_aggregate(type, comm, name, error_code, arglist);
|
|
|
|
} else {
|
|
|
|
backend_fatal_no_aggregate(type, comm, name, error_code, arglist);
|
|
|
|
}
|
2004-09-21 03:01:40 +04:00
|
|
|
|
2014-07-03 04:34:44 +04:00
|
|
|
/* In most instances the communicator will be valid. If not, we are either early in
|
|
|
|
* the initialization or we are dealing with a window. Thus, it is good enough to abort
|
|
|
|
* on MPI_COMM_SELF, the error will propagate.
|
|
|
|
*/
|
2004-11-01 19:05:31 +03:00
|
|
|
if (comm == NULL) {
|
2009-02-24 20:17:33 +03:00
|
|
|
comm = &ompi_mpi_comm_self.comm;
|
2004-11-01 19:05:31 +03:00
|
|
|
}
|
|
|
|
|
2008-09-22 21:41:39 +04:00
|
|
|
if (NULL != error_code) {
|
2014-07-03 04:34:44 +04:00
|
|
|
ompi_mpi_abort(comm, *error_code);
|
2008-09-22 21:41:39 +04:00
|
|
|
} else {
|
2014-07-03 04:34:44 +04:00
|
|
|
ompi_mpi_abort(comm, 1);
|
2008-09-22 21:41:39 +04:00
|
|
|
}
|
2004-09-21 03:01:40 +04:00
|
|
|
}
|