1
1
openmpi/oshmem/runtime/oshmem_shmem_abort.c
igor.ivanov@itseez.com c15bf147bf opal: Add opal_abort_print_stack mca variable with aliases for ompi/oshmem
This commit makes it possible to control the output produced during abnormal oshmem/ompi application
termination.
Fixed an issue in backtrace output: HAVE_BACKTRACE was never set, so the user had limited
control over this variable.
The two related MCA variables are moved to the opal layer. Corresponding aliases are
added for ompi and oshmem.
2015-11-25 18:18:33 +02:00

146 строки
4.1 KiB
C

/*
* Copyright (c) 2013 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "oshmem_config.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#include "opal/mca/backtrace/backtrace.h"
#include "opal/runtime/opal_params.h"
#include "orte/util/proc_info.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "oshmem/runtime/params.h"
#include "oshmem/runtime/runtime.h"
#include "oshmem/constants.h"
#include "oshmem/proc/proc.h"
/* Recursion guard for oshmem_shmem_abort(): set to true on the first entry
   into that function; any later call sees it set and returns immediately. */
static bool have_been_invoked = false;
/**
 * Abort the calling OpenSHMEM process.
 *
 * The function performs the following steps, in order:
 *   1. Returns immediately if it has already been entered once
 *      (guard against recursive invocation).
 *   2. Reports the abort through orte_show_help(), including the process
 *      vpid, pid, host name and @p errcode.
 *   3. If opal_abort_print_stack is set, prints a stack trace to stderr.
 *   4. If opal_abort_delay is non-zero, either loops forever (negative
 *      value) or sleeps for that many seconds (positive value).
 *   5. If ORTE or OSHMEM is not initialized, prints a warning and calls
 *      exit(errcode); otherwise calls orte_errmgr.abort(errcode, NULL).
 *
 * oshmem_shmem_aborted is set to true before the process is terminated
 * on either path.
 *
 * @param errcode  Error code handed to exit() or orte_errmgr.abort().
 *
 * @return OSHMEM_SUCCESS. This is returned right away on a recursive
 *         invocation; on the normal path it is only reached if
 *         orte_errmgr.abort() returns.
 */
int oshmem_shmem_abort(int errcode)
{
    const char *host;
    char hostname[MAXHOSTNAMELEN];
    pid_t pid = 0;

    /* Protection for recursive invocation */
    if (have_been_invoked) {
        return OSHMEM_SUCCESS;
    }
    have_been_invoked = true;

    /* If ORTE is initialized, use its nodename.  Otherwise, call
       gethostname. */
    if (orte_initialized) {
        host = orte_process_info.nodename;
    } else if (0 == gethostname(hostname, sizeof(hostname))) {
        /* POSIX leaves it unspecified whether a truncated name is
           NUL-terminated, so terminate the buffer explicitly before it
           is used with "%s" below. */
        hostname[sizeof(hostname) - 1] = '\0';
        host = hostname;
    } else {
        /* gethostname() failed: the buffer contents are indeterminate,
           so do not print them. */
        host = "unknown";
    }
    pid = getpid();

    orte_show_help("help-shmem-api.txt",
                   "shmem-abort",
                   true,
                   ORTE_PROC_MY_NAME->vpid,
                   pid,
                   host,
                   errcode);

    /* Should we print a stack trace?  Not aggregated because they
       might be different on all processes. */
    if (opal_abort_print_stack) {
        char **messages;
        int len, i;

        if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
            for (i = 0; i < len; ++i) {
                fprintf(stderr,
                        "[%s:%d] [%d] func:%s\n",
                        host,
                        (int) pid,
                        i,
                        messages[i]);
                fflush(stderr);
            }
            /* Release the array handed back by opal_backtrace_buffer(). */
            free(messages);
        } else {
            /* This will print a message if it's unable to print the
               backtrace, so we don't need an additional "else" clause
               if opal_backtrace_print() is not supported. */
            opal_backtrace_print(stderr, NULL, 1);
        }
    }

    /* Should we wait for a while before aborting? */
    if (0 != opal_abort_delay) {
        if (opal_abort_delay < 0) {
            fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
                    host, (int) pid);
            fflush(stderr);
            while (1) {
                sleep(5);
            }
        } else {
            fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
                    host, (int) pid, opal_abort_delay);
            /* Flush so the notice is visible before sleeping, matching
               the "Looping forever" branch above. */
            fflush(stderr);
            do {
                sleep(1);
            } while (--opal_abort_delay > 0);
        }
    }

    /* Without both ORTE and OSHMEM initialized, only a local exit is
       performed; warn that other processes may be left running. */
    if (!orte_initialized || !oshmem_shmem_initialized) {
        if (orte_show_help_is_available()) {
            /* TODO help message from SHMEM not from MPI is needed*/
            orte_show_help("help-shmem-runtime.txt",
                           "oshmem shmem abort:cannot guarantee all killed",
                           true,
                           host,
                           (int) pid);
        } else {
            fprintf(stderr,
                    "[%s:%d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
                    host,
                    (int) pid);
        }
        oshmem_shmem_aborted = true;
        exit(errcode);
    }

    /* Record that an abort is in progress, then hand termination over to
       the ORTE error manager. */
    oshmem_shmem_aborted = true;
    orte_errmgr.abort(errcode, NULL );

    /* Presumably not reached, assuming orte_errmgr.abort() does not
       return -- TODO confirm. */
    return OSHMEM_SUCCESS;
}